Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Release-7.3] Do not select a machine to build team if each SS of the machine has too many ServerTeams #11666

Draft
wants to merge 5 commits into
base: release-7.3
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fdbclient/ServerKnobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( REBALANCE_STORAGE_QUEUE_SHARD_PER_KSEC_MIN, SHARD_MIN_BYTES_PER_KSEC);
init( DD_ENABLE_REBALANCE_STORAGE_QUEUE_WITH_LIGHT_WRITE_SHARD, true ); if ( isSimulated ) DD_ENABLE_REBALANCE_STORAGE_QUEUE_WITH_LIGHT_WRITE_SHARD = deterministicRandom()->coinflip();
init( DD_WAIT_TSS_DATA_MOVE_DELAY, 15.0 ); if (isSimulated) DD_WAIT_TSS_DATA_MOVE_DELAY = deterministicRandom()->randomInt(5, 30);
init( CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM, false ); if (isSimulated) CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM = deterministicRandom()->coinflip();

// Large teams are disabled when SHARD_ENCODE_LOCATION_METADATA is enabled
init( DD_MAX_SHARDS_ON_LARGE_TEAMS, 100 ); if( randomize && BUGGIFY ) DD_MAX_SHARDS_ON_LARGE_TEAMS = deterministicRandom()->randomInt(0, 3);
Expand Down
2 changes: 2 additions & 0 deletions fdbclient/include/fdbclient/ServerKnobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,8 @@ class ServerKnobs : public KnobsImpl<ServerKnobs> {
bool DD_ENABLE_REBALANCE_STORAGE_QUEUE_WITH_LIGHT_WRITE_SHARD; // Enable to allow storage queue rebalancer to move
// light-traffic shards out of the overloading server
double DD_WAIT_TSS_DATA_MOVE_DELAY;
bool CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM; // Enable to build team on servers which do not
// have teams exceeding the target count

// TeamRemover to remove redundant teams
bool TR_FLAG_DISABLE_MACHINE_TEAM_REMOVER; // disable the machineTeamRemover actor
Expand Down
61 changes: 55 additions & 6 deletions fdbserver/DDTeamCollection.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "fdbserver/DDTeamCollection.h"
#include "fdbserver/ExclusionTracker.actor.h"
#include "fdbserver/DataDistributionTeam.h"
#include "fdbserver/Knobs.h"
#include "flow/Error.h"
#include "flow/IRandom.h"
#include "flow/Trace.h"
#include "flow/network.h"
Expand Down Expand Up @@ -5080,13 +5082,48 @@ Reference<TCServerInfo> DDTeamCollection::findOneLeastUsedServer() const {
}
}

bool DDTeamCollection::isAvailableToBuildMoreServerTeam(const TCMachineTeamInfo& machineTeam) const {
// This checking takes effects only if CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM is set
ASSERT(SERVER_KNOBS->CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM);
const int targetTeamNumPerServer =
kakaiu marked this conversation as resolved.
Show resolved Hide resolved
(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2;
for (const auto& machine : machineTeam.getMachines()) {
bool existSSBelowTarget = false;
for (const auto& server : machine->serversOnMachine) {
if (server->getTeams().size() < targetTeamNumPerServer) {
existSSBelowTarget = true;
break;
}
}
if (!existSSBelowTarget) {
// In case where each of the servers of the machine has serverTeams is no less than targetTeamNumPerServer
// We do not want to add more serverTeams to this machine
// For targetTeamNumPerServer, see the comment of notEnoughTeamsForAServer()
TraceEvent e(SevWarnAlways, "MachineTeamIsNotAvailableToAddMoreServerTeam");
for (int i = 0; i < machine->serversOnMachine.size(); i++) {
e.detail("Server" + std::to_string(i), machine->serversOnMachine[i]->getId());
e.detail("TeamCountOnServer" + std::to_string(i), machine->serversOnMachine[i]->getTeams().size());
e.detail("HealthyServer" + std::to_string(i),
!(server_status.get(machine->serversOnMachine[i]->getId()).isUnhealthy()));
}
return false;
}
}
return true;
}

Reference<TCMachineTeamInfo> DDTeamCollection::findOneRandomMachineTeam(TCServerInfo const& chosenServer) const {
if (!chosenServer.machine->machineTeams.empty()) {
std::vector<Reference<TCMachineTeamInfo>> healthyMachineTeamsForChosenServer;
for (auto& mt : chosenServer.machine->machineTeams) {
if (isMachineTeamHealthy(*mt)) {
healthyMachineTeamsForChosenServer.push_back(mt);
if (!isMachineTeamHealthy(*mt)) {
continue;
}
if (SERVER_KNOBS->CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM &&
!isAvailableToBuildMoreServerTeam(*mt)) {
continue;
}
healthyMachineTeamsForChosenServer.push_back(mt);
}
if (!healthyMachineTeamsForChosenServer.empty()) {
return deterministicRandom()->randomChoice(healthyMachineTeamsForChosenServer);
Expand Down Expand Up @@ -5396,6 +5433,9 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max
// Step 2: Randomly pick 1 server from each machine in the chosen machine team to form a server team
std::vector<UID> serverTeam;
int chosenServerCount = 0;
const int targetTeamNumPerServer =
(SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * (configuration.storageTeamSize + 1)) / 2;
Optional<Reference<TCMachineInfo>> unavailableMachine;
for (auto& machine : chosenMachineTeam->getMachines()) {
UID serverID;
if (machine == chosenServer->machine) {
Expand All @@ -5404,13 +5444,22 @@ int DDTeamCollection::addTeamsBestOf(int teamsToBuild, int desiredTeams, int max
serverID = chosenServer->getId();
++chosenServerCount;
} else {
std::vector<Reference<TCServerInfo>> healthyProcesses;
std::vector<Reference<TCServerInfo>> candidateProcesses;
for (auto it : machine->serversOnMachine) {
if (!server_status.get(it->getId()).isUnhealthy()) {
healthyProcesses.push_back(it);
if (server_status.get(it->getId()).isUnhealthy()) {
continue;
}
// When CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM is set,
// we build team on a server which does not have teams more than the target count.
// For targetTeamNumPerServer, see the comment of notEnoughTeamsForAServer()
if (SERVER_KNOBS->CHECK_SERVER_TEAM_WHEN_SELECT_MACHINE_TO_BUILD_SERVER_TEAM &&
it->getTeams().size() >= targetTeamNumPerServer) {
continue;
}
candidateProcesses.push_back(it);
}
serverID = deterministicRandom()->randomChoice(healthyProcesses)->getId();
ASSERT_WE_THINK(candidateProcesses.size() > 0);
serverID = deterministicRandom()->randomChoice(candidateProcesses)->getId();
}
serverTeam.push_back(serverID);
}
Expand Down
4 changes: 4 additions & 0 deletions fdbserver/include/fdbserver/DDTeamCollection.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ class DDTeamCollection : public ReferenceCounted<DDTeamCollection> {
// Return the healthy server with the least number of correct-size server teams
Reference<TCServerInfo> findOneLeastUsedServer() const;

// Return true if each machine of the input machineTeam has at least one server whose server team count is smaller
// than the target server team count, i.e., the machine still has room to add more server teams
bool isAvailableToBuildMoreServerTeam(const TCMachineTeamInfo& machineTeam) const;

// A server team should always come from servers on a machine team
// Check if it is true
bool isOnSameMachineTeam(TCTeamInfo const& team) const;
Expand Down