From fdd86f6fac3b12b089d7b31646b0afe7cec9d01f Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Tue, 24 Aug 2021 14:34:55 +0530 Subject: [PATCH 1/8] Initial changes to handle skewness Signed-off-by: Bukhtawar Khan --- .../decider/AwarenessAllocationDecider.java | 126 ++++++++++++++++-- .../common/settings/ClusterSettings.java | 3 + .../allocation/AwarenessAllocationTests.java | 100 ++++++++++++++ 3 files changed, 221 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java index a5c90bdfccd95..bc3509b36da57 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java @@ -32,12 +32,17 @@ package org.opensearch.cluster.routing.allocation.decider; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.function.Function; import com.carrotsearch.hppc.ObjectIntHashMap; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.routing.RoutingNode; import org.opensearch.cluster.routing.ShardRouting; @@ -47,6 +52,7 @@ import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.set.Sets; import static java.util.Collections.emptyList; @@ -99,17 +105,41 @@ public class AwarenessAllocationDecider extends AllocationDecider { Property.NodeScope); public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING = 
Setting.groupSetting("cluster.routing.allocation.awareness.force.", Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING = + Setting.groupSetting("cluster.routing.allocation.awareness.attribute.", Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING = + Setting.boolSetting("cluster.routing.allocation.awareness.forced_allocation.disable", false, + Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT = + Setting.intSetting("cluster.routing.allocation.awareness.skewness.limit", 10, Property.Dynamic, Property.NodeScope); private volatile List awarenessAttributes; private volatile Map> forcedAwarenessAttributes; + private volatile Map awarenessAttributeCapacities; + + private volatile boolean disableForcedAllocation; + + private volatile int skewnessLimit; + + private static final Logger logger = LogManager.getLogger(AwarenessAllocationDecider.class); + public AwarenessAllocationDecider(Settings settings, ClusterSettings clusterSettings) { this.awarenessAttributes = CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings); + this.disableForcedAllocation = CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.get(settings); + this.skewnessLimit = CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT, + this::setSkewnessLimit); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING, this::setAwarenessAttributes); setForcedAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.get(settings)); + setAwarenessAttributeCapacities(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING.get(settings)); 
clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING, - this::setForcedAwarenessAttributes); + this::setForcedAwarenessAttributes); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING, + this::setAwarenessAttributeCapacities); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING, + this::setDisableForcedAllocation); } private void setForcedAwarenessAttributes(Settings forceSettings) { @@ -124,10 +154,28 @@ private void setForcedAwarenessAttributes(Settings forceSettings) { this.forcedAwarenessAttributes = forcedAwarenessAttributes; } + private void setSkewnessLimit(int skewnessLimit) { + this.skewnessLimit = skewnessLimit; + } + + private void setAwarenessAttributeCapacities(Settings awarenessCapacitySettings) { + Map groupCapacity = new HashMap<>(); + Map forceGroups = awarenessCapacitySettings.getAsGroups(); + for (Map.Entry entry : forceGroups.entrySet()) { + Integer capacity = entry.getValue().getAsInt("capacity", -1); + groupCapacity.put(entry.getKey(), capacity); + } + this.awarenessAttributeCapacities = groupCapacity; + } + private void setAwarenessAttributes(List awarenessAttributes) { this.awarenessAttributes = awarenessAttributes; } + private void setDisableForcedAllocation(boolean disableForcedAllocation) { + this.disableForcedAllocation = disableForcedAllocation; + } + @Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return underCapacity(shardRouting, node, allocation, true); @@ -159,6 +207,29 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout // build attr_value -> nodes map ObjectIntHashMap nodesPerAttribute = allocation.routingNodes().nodesPerAttributesCounts(awarenessAttribute); + if (disableForcedAllocation) { + //the current node attribute value under consideration + String 
nodeAttributeValue = node.node().getAttributes().get(awarenessAttribute); + Set skewedAttributeValues = null; + try { + skewedAttributeValues = skewedNodesPerAttributeValue(nodesPerAttribute, awarenessAttribute); + } catch (IllegalStateException e) { + logger.warn(() -> new ParameterizedMessage("Inconsistent configuration to decide on skewness for attribute " + + "[{}] due to ", awarenessAttribute) , e); + } + if (skewedAttributeValues != null && skewedAttributeValues.contains(nodeAttributeValue)) { + //the current attribute value has nodes that are skewed + return allocation.decision(Decision.NO, NAME, + "there are too many copies of the shard allocated to nodes with attribute [%s], due to skewed distribution of " + + "nodes for attribute value [%s] expected the nodes for this attribute to be [%d] but found nodes per " + + "attribute to be [%d]", + awarenessAttribute, + nodeAttributeValue, + awarenessAttributeCapacities.get(awarenessAttribute), + nodesPerAttribute.get(nodeAttributeValue)); + } + } + // build the count of shards per attribute value ObjectIntHashMap shardPerAttribute = new ObjectIntHashMap<>(); for (ShardRouting assignedShard : allocation.routingNodes().assignedShards(shardRouting.shardId())) { @@ -176,7 +247,7 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout if (node.nodeId().equals(nodeId) == false) { // we work on different nodes, move counts around shardPerAttribute.putOrAdd(allocation.routingNodes().node(nodeId).node().getAttributes().get(awarenessAttribute), - 0, -1); + 0, -1); shardPerAttribute.addTo(node.node().getAttributes().get(awarenessAttribute), 1); } } else { @@ -199,17 +270,56 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout final int maximumNodeCount = (shardCount + numberOfAttributes - 1) / numberOfAttributes; // ceil(shardCount/numberOfAttributes) if (currentNodeCount > maximumNodeCount) { return allocation.decision(Decision.NO, NAME, - "there are too many 
copies of the shard allocated to nodes with attribute [%s], there are [%d] total configured " + + "there are too many copies of the shard allocated to nodes with attribute [%s], there are [%d] total configured " + "shard copies for this shard id and [%d] total attribute values, expected the allocated shard count per " + "attribute [%d] to be less than or equal to the upper bound of the required number of shards per attribute [%d]", - awarenessAttribute, - shardCount, - numberOfAttributes, - currentNodeCount, - maximumNodeCount); + awarenessAttribute, + shardCount, + numberOfAttributes, + currentNodeCount, + maximumNodeCount); } } return allocation.decision(Decision.YES, NAME, "node meets all awareness attribute requirements"); } + + private Set skewedNodesPerAttributeValue(ObjectIntHashMap nodesPerAttribute, String awarenessAttribute) { + Set underCapacityAttributeValues = null; + int capacity = awarenessAttributeCapacities.getOrDefault(awarenessAttribute, -1); + if (forcedAwarenessAttributes.containsKey(awarenessAttribute) == false || capacity <= 0) { + // forced awareness is not enabled for this attribute + return Collections.emptySet(); + } + List forcedAwarenessAttribute = forcedAwarenessAttributes.get(awarenessAttribute); + if (forcedAwarenessAttribute.size() > nodesPerAttribute.size()) { + //we have a complete attribute failures + return Collections.emptySet(); + } else if (forcedAwarenessAttribute.size() == nodesPerAttribute.size()) { + int minimumNodesBeforeSkewness = (int) Math.ceil((1 - skewnessLimit / 100.0) * capacity); + for (String attributeValue : forcedAwarenessAttribute) { + if (nodesPerAttribute.containsKey(attributeValue) == false) { + //forced attribute values and discovery nodes have a mismatch + throw new IllegalStateException("Missing attribute value in discovered nodes:" + attributeValue); + } else if (nodesPerAttribute.get(attributeValue) < minimumNodesBeforeSkewness) { + if (underCapacityAttributeValues == null) { + 
underCapacityAttributeValues = Sets.newHashSet(attributeValue); + } else { + underCapacityAttributeValues.add(attributeValue); + } + } else if (nodesPerAttribute.get(attributeValue) > capacity) { + throw new IllegalStateException("Unexpected capacity for attribute value: " + attributeValue + ", expected: " + capacity + + ", found: " + nodesPerAttribute.get(attributeValue)); + } + } + if (underCapacityAttributeValues != null && underCapacityAttributeValues.size() == forcedAwarenessAttribute.size() + && forcedAwarenessAttribute.size() != 1) { + throw new IllegalStateException("Unexpected capacity for attribute: " + awarenessAttribute + ", capacity: " + capacity); + } + } else { + throw new IllegalStateException("Mismatch between forced awareness attribute: " + forcedAwarenessAttributes + + " and discovered nodes " + nodesPerAttribute); + } + return underCapacityAttributeValues == null ? Collections.emptySet() : underCapacityAttributeValues; + } } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index fdd48fe0ee2af..b9f8f09f42c1d 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -220,6 +220,9 @@ public void apply(Settings value, Settings current, Settings previous) { TransportClient.CLIENT_TRANSPORT_IGNORE_CLUSTER_NAME, TransportClient.CLIENT_TRANSPORT_SNIFF, AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING, + AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING, + AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT, + AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING, BalancedShardsAllocator.INDEX_BALANCE_FACTOR_SETTING, BalancedShardsAllocator.SHARD_BALANCE_FACTOR_SETTING, BalancedShardsAllocator.THRESHOLD_SETTING, diff --git 
a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java index fb2800090770a..d469c37b9b1bc 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java @@ -35,14 +35,17 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.Version; +import org.opensearch.cluster.ClusterName; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.OpenSearchAllocationTestCase; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.routing.RoutingTable; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.ShardRoutingState; +import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.command.AllocationCommands; import org.opensearch.cluster.routing.allocation.command.CancelAllocationCommand; import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand; @@ -901,4 +904,101 @@ public void testMultipleAwarenessAttributes() { assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2)); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); } + + public void testDisabledForcedAllocationPreventsOverload() { + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 21) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) + 
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) + .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) + .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + .put("cluster.routing.allocation.awareness.attributes", "zone") + .build()); + + logger.info("Building initial routing table for 'fullAwareness1'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(21).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(21)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one rack value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), 
equalTo(0)); + + logger.info("--> add three new node with a new rack and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node4", singletonMap("zone", "zone_2"))) + .add(newNode("node5", singletonMap("zone", "zone_2"))) + .add(newNode("node6", singletonMap("zone", "zone_2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), + equalTo("node4")); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(42)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another node with a new rack, make sure nothing moves"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node7", singletonMap("zone", "zone_3"))) + .add(newNode("node8", singletonMap("zone", "zone_3"))) + .add(newNode("node9", singletonMap("zone", "zone_3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(63)); + + logger.info("--> Remove random node from zones holding all primary and all replicas"); + //remove two 
nodes in one zone to cause distribution zone1->3 , zone2->3, zone3->1 + newState = removeNode(newState, randomFrom("node1", "node7" ), strategy); + logger.info("--> Remove another random node from zones holding all primary and all replicas"); + newState = removeNode(newState, randomFrom("node2", "node8" ), strategy); + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + //ensure minority zone doesn't get overloaded + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(49)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(14)); + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + } + } + + private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { + return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) + .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); + } } From c8a2066f609907acb153a2ccfe8242d05c5cd7dc Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Tue, 24 Aug 2021 14:42:26 +0530 Subject: [PATCH 2/8] Checkstyle failures Signed-off-by: Bukhtawar Khan --- .../allocation/AwarenessAllocationTests.java | 366 +++++++++++++++++- 1 file changed, 365 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java index d469c37b9b1bc..33d3ad7f5aca7 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java +++ 
b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java @@ -40,7 +40,6 @@ import org.opensearch.cluster.OpenSearchAllocationTestCase; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.metadata.Metadata; -import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.routing.RoutingTable; import org.opensearch.cluster.routing.ShardRouting; @@ -997,6 +996,371 @@ public void testDisabledForcedAllocationPreventsOverload() { } } + public void testMoveShardDuringPartialFailureSkewnessLimitNotBreached(){ + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) + .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 5) + .put("cluster.routing.allocation.awareness.force.zone.values", "zone1,zone2,zone3") + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.getKey(), 20) + .build()); + + logger.info("Building initial routing table for 'testMoveShardDuringPartialFailureSkewnessLimitNotBreached'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + 
.addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + .add(newNode("node5", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one zone value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + //replicas are unassigned + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); + + logger.info("--> add five new node in new zone and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node6", singletonMap("zone", "zone2"))) + .add(newNode("node7", singletonMap("zone", "zone2"))) + .add(newNode("node8", singletonMap("zone", "zone2"))) + .add(newNode("node9", singletonMap("zone", "zone2"))) + .add(newNode("node10", singletonMap("zone", "zone2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); + 
assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(40)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another five node in new zone and reroute"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3"))) + .add(newNode("node12", singletonMap("zone", "zone3"))) + .add(newNode("node13", singletonMap("zone", "zone3"))) + .add(newNode("node14", singletonMap("zone", "zone3"))) + .add(newNode("node15", singletonMap("zone", "zone3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + + logger.info("--> Remove one node from zone3 holding all primary and all replicas"); + + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); + + // remove one nodes in one zone to cause distribution zone1->5 , zone2->5, zone3->4 + newState = removeNode(newState, "node11", strategy); + newState = strategy.reroute(newState, "reroute"); + + while 
(newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); + + // //ensure all shards are assigned + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + } + + public void testShardUnassignedDuringPartialFailureSkewnessLimitBreached(){ + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) + .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 5) + .put("cluster.routing.allocation.awareness.force.zone.values", "zone1,zone2,zone3") + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.getKey(), 20) + .build()); + + logger.info("Building initial routing table for 'testShardUnassignedDuringPartialFailureSkewnessLimitBreached'"); + + Metadata metadata = Metadata.builder() + 
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + .add(newNode("node5", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one zone value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + //replicas are unassigned + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); + + logger.info("--> add five new node in new zone and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node6", singletonMap("zone", "zone2"))) + .add(newNode("node7", singletonMap("zone", "zone2"))) + .add(newNode("node8", singletonMap("zone", "zone2"))) + .add(newNode("node9", singletonMap("zone", "zone2"))) + .add(newNode("node10", singletonMap("zone", "zone2"))) + ).build(); 
+ clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(40)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another five node in new zone and reroute"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3"))) + .add(newNode("node12", singletonMap("zone", "zone3"))) + .add(newNode("node13", singletonMap("zone", "zone3"))) + .add(newNode("node14", singletonMap("zone", "zone3"))) + .add(newNode("node15", singletonMap("zone", "zone3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + + logger.info("--> Remove one node from zone3 holding all primary and all replicas"); + + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); + + // remove one nodes in one zone to 
cause distribution zone1->5 , zone2->5, zone3->4 + newState = removeNode(newState, "node11", strategy); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); + + // ensure all shards are assigned + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + + // remove one more node subsequently in one zone to cause distribution zone1->5 , zone2->5, zone3->3 + newState = removeNode(newState, "node12", strategy); + newState = strategy.reroute(newState, "reroute"); + + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(55)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(5)); + + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + } + } + + public void testSingleZoneReplicaUnassignedOnSkewnessWithThreeShardCopies() { + AllocationService strategy = createAllocationService(Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) + 
.put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) + .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) + .put("cluster.routing.allocation.awareness.force.zone.values", "zone1") + .put("cluster.routing.allocation.awareness.attributes", "zone") + .build()); + + logger.info("Building initial routing table for 'testSingleZoneReplicaUnassignedOnSkewnessWithThreeShardCopies'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(3).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding two nodes on same rack and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replicas are initializing"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(6)); + + logger.info("--> start the shards (replicas)"); + clusterState = 
startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> all shards are started"); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(9)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + //remove one node to make zone1 skewed + clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); + clusterState = strategy.reroute(clusterState, "reroute"); + + while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + } + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(6)); + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(3)); + + for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + assertFalse(shard.primary()); + } + } + + public void testSingleZoneReplicaUnassignedOnSkewnessWithTwoShardCopies() { + AllocationService strategy = createAllocationService(Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) + .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) + .put("cluster.routing.allocation.awareness.force.zone.values", "zone1") + 
.put("cluster.routing.allocation.awareness.attributes", "zone") + .build()); + + logger.info("Building initial routing table for 'testSingleZoneReplicaUnassignedOnSkewnessWithTwoShardCopies'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(3).numberOfReplicas(1)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding two nodes on same rack and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replicas are initializing"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); + + logger.info("--> start the shards (replicas)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> all shards are started"); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(6)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + 
//remove one node to make zone1 skewed + clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); + clusterState = strategy.reroute(clusterState, "reroute"); + + while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + } + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(4)); + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(2)); + + for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + assertFalse(shard.primary()); + } + } + private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); From efa06c20357e37358905c183d14c4286a8342aa9 Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Tue, 24 Aug 2021 15:01:31 +0530 Subject: [PATCH 3/8] Checkstyle changes Signed-off-by: Bukhtawar Khan Single attribute value skewed fails to assign replicas Signed-off-by: Bukhtawar Khan More tests Signed-off-by: Bukhtawar Khan Refactor zone aware changes Signed-off-by: Bukhtawar Khan Check style Signed-off-by: Bukhtawar Khan Support node join shard distribution Signed-off-by: Bukhtawar Khan Test fix ups Signed-off-by: Bukhtawar Khan Java doc Signed-off-by: Bukhtawar Khan Minor fixup Signed-off-by: Bukhtawar Khan Simplification Signed-off-by: Bukhtawar Khan Test fix up Signed-off-by: Bukhtawar Khan Minor changes --- .../org/opensearch/cluster/ClusterModule.java | 2 + .../decider/AwarenessAllocationDecider.java | 126 +-- 
.../NodeOverloadAwareAllocationDecider.java | 117 +++ .../common/settings/ClusterSettings.java | 9 +- .../cluster/ClusterModuleTests.java | 4 +- .../allocation/AwarenessAllocationTests.java | 464 ---------- .../NodeOverloadAwareAllocationTests.java | 821 ++++++++++++++++++ 7 files changed, 956 insertions(+), 587 deletions(-) create mode 100644 server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java create mode 100644 server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java diff --git a/server/src/main/java/org/opensearch/cluster/ClusterModule.java b/server/src/main/java/org/opensearch/cluster/ClusterModule.java index 5bd0a661c381d..58ffe2c4d6ef5 100644 --- a/server/src/main/java/org/opensearch/cluster/ClusterModule.java +++ b/server/src/main/java/org/opensearch/cluster/ClusterModule.java @@ -63,6 +63,7 @@ import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider; @@ -256,6 +257,7 @@ public static Collection createAllocationDeciders(Settings se addAllocationDecider(deciders, new ThrottlingAllocationDecider(settings, clusterSettings)); addAllocationDecider(deciders, new ShardsLimitAllocationDecider(settings, clusterSettings)); addAllocationDecider(deciders, new AwarenessAllocationDecider(settings, clusterSettings)); + addAllocationDecider(deciders, new NodeOverloadAwareAllocationDecider(settings, 
clusterSettings)); clusterPlugins.stream() .flatMap(p -> p.createAllocationDeciders(settings, clusterSettings).stream()) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java index bc3509b36da57..a5c90bdfccd95 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/AwarenessAllocationDecider.java @@ -32,17 +32,12 @@ package org.opensearch.cluster.routing.allocation.decider; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.function.Function; import com.carrotsearch.hppc.ObjectIntHashMap; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.routing.RoutingNode; import org.opensearch.cluster.routing.ShardRouting; @@ -52,7 +47,6 @@ import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Setting.Property; import org.opensearch.common.settings.Settings; -import org.opensearch.common.util.set.Sets; import static java.util.Collections.emptyList; @@ -105,41 +99,17 @@ public class AwarenessAllocationDecider extends AllocationDecider { Property.NodeScope); public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING = Setting.groupSetting("cluster.routing.allocation.awareness.force.", Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING = - Setting.groupSetting("cluster.routing.allocation.awareness.attribute.", Property.Dynamic, Property.NodeScope); - public static final 
Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING = - Setting.boolSetting("cluster.routing.allocation.awareness.forced_allocation.disable", false, - Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT = - Setting.intSetting("cluster.routing.allocation.awareness.skewness.limit", 10, Property.Dynamic, Property.NodeScope); private volatile List awarenessAttributes; private volatile Map> forcedAwarenessAttributes; - private volatile Map awarenessAttributeCapacities; - - private volatile boolean disableForcedAllocation; - - private volatile int skewnessLimit; - - private static final Logger logger = LogManager.getLogger(AwarenessAllocationDecider.class); - public AwarenessAllocationDecider(Settings settings, ClusterSettings clusterSettings) { this.awarenessAttributes = CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings); - this.disableForcedAllocation = CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.get(settings); - this.skewnessLimit = CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.get(settings); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT, - this::setSkewnessLimit); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING, this::setAwarenessAttributes); setForcedAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.get(settings)); - setAwarenessAttributeCapacities(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING.get(settings)); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING, - this::setForcedAwarenessAttributes); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING, - this::setAwarenessAttributeCapacities); - 
clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING, - this::setDisableForcedAllocation); + this::setForcedAwarenessAttributes); } private void setForcedAwarenessAttributes(Settings forceSettings) { @@ -154,28 +124,10 @@ private void setForcedAwarenessAttributes(Settings forceSettings) { this.forcedAwarenessAttributes = forcedAwarenessAttributes; } - private void setSkewnessLimit(int skewnessLimit) { - this.skewnessLimit = skewnessLimit; - } - - private void setAwarenessAttributeCapacities(Settings awarenessCapacitySettings) { - Map groupCapacity = new HashMap<>(); - Map forceGroups = awarenessCapacitySettings.getAsGroups(); - for (Map.Entry entry : forceGroups.entrySet()) { - Integer capacity = entry.getValue().getAsInt("capacity", -1); - groupCapacity.put(entry.getKey(), capacity); - } - this.awarenessAttributeCapacities = groupCapacity; - } - private void setAwarenessAttributes(List awarenessAttributes) { this.awarenessAttributes = awarenessAttributes; } - private void setDisableForcedAllocation(boolean disableForcedAllocation) { - this.disableForcedAllocation = disableForcedAllocation; - } - @Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return underCapacity(shardRouting, node, allocation, true); @@ -207,29 +159,6 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout // build attr_value -> nodes map ObjectIntHashMap nodesPerAttribute = allocation.routingNodes().nodesPerAttributesCounts(awarenessAttribute); - if (disableForcedAllocation) { - //the current node attribute value under consideration - String nodeAttributeValue = node.node().getAttributes().get(awarenessAttribute); - Set skewedAttributeValues = null; - try { - skewedAttributeValues = skewedNodesPerAttributeValue(nodesPerAttribute, awarenessAttribute); - } catch (IllegalStateException e) { - logger.warn(() -> new 
ParameterizedMessage("Inconsistent configuration to decide on skewness for attribute " + - "[{}] due to ", awarenessAttribute) , e); - } - if (skewedAttributeValues != null && skewedAttributeValues.contains(nodeAttributeValue)) { - //the current attribute value has nodes that are skewed - return allocation.decision(Decision.NO, NAME, - "there are too many copies of the shard allocated to nodes with attribute [%s], due to skewed distribution of " + - "nodes for attribute value [%s] expected the nodes for this attribute to be [%d] but found nodes per " + - "attribute to be [%d]", - awarenessAttribute, - nodeAttributeValue, - awarenessAttributeCapacities.get(awarenessAttribute), - nodesPerAttribute.get(awarenessAttribute)); - } - } - // build the count of shards per attribute value ObjectIntHashMap shardPerAttribute = new ObjectIntHashMap<>(); for (ShardRouting assignedShard : allocation.routingNodes().assignedShards(shardRouting.shardId())) { @@ -247,7 +176,7 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout if (node.nodeId().equals(nodeId) == false) { // we work on different nodes, move counts around shardPerAttribute.putOrAdd(allocation.routingNodes().node(nodeId).node().getAttributes().get(awarenessAttribute), - 0, -1); + 0, -1); shardPerAttribute.addTo(node.node().getAttributes().get(awarenessAttribute), 1); } } else { @@ -270,56 +199,17 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout final int maximumNodeCount = (shardCount + numberOfAttributes - 1) / numberOfAttributes; // ceil(shardCount/numberOfAttributes) if (currentNodeCount > maximumNodeCount) { return allocation.decision(Decision.NO, NAME, - "there are too many copies of the shard allocated to nodes with attribute [%s], there are [%d] total configured " + + "there are too many copies of the shard allocated to nodes with attribute [%s], there are [%d] total configured " + "shard copies for this shard id and [%d] total attribute values, 
expected the allocated shard count per " + "attribute [%d] to be less than or equal to the upper bound of the required number of shards per attribute [%d]", - awarenessAttribute, - shardCount, - numberOfAttributes, - currentNodeCount, - maximumNodeCount); + awarenessAttribute, + shardCount, + numberOfAttributes, + currentNodeCount, + maximumNodeCount); } } return allocation.decision(Decision.YES, NAME, "node meets all awareness attribute requirements"); } - - private Set skewedNodesPerAttributeValue(ObjectIntHashMap nodesPerAttribute, String awarenessAttribute) { - Set underCapacityAttributeValues = null; - int capacity = awarenessAttributeCapacities.getOrDefault(awarenessAttribute, -1); - if (forcedAwarenessAttributes.containsKey(awarenessAttribute) == false || capacity <= 0) { - // forced awareness is not enabled for this attribute - return Collections.emptySet(); - } - List forcedAwarenessAttribute = forcedAwarenessAttributes.get(awarenessAttribute); - if (forcedAwarenessAttribute.size() > nodesPerAttribute.size()) { - //we have a complete attribute failures - return Collections.emptySet(); - } else if (forcedAwarenessAttribute.size() == nodesPerAttribute.size()) { - int minimumNodesBeforeSkewness = (int) Math.ceil((1 - skewnessLimit / 100.0) * capacity); - for (String attributeValue : forcedAwarenessAttribute) { - if (nodesPerAttribute.containsKey(attributeValue) == false) { - //forced attribute values and discovery nodes have a mismatch - throw new IllegalStateException("Missing attribute value in discovered nodes:" + attributeValue); - } else if (nodesPerAttribute.get(attributeValue) < minimumNodesBeforeSkewness) { - if (underCapacityAttributeValues == null) { - underCapacityAttributeValues = Sets.newHashSet(attributeValue); - } else { - underCapacityAttributeValues.add(attributeValue); - } - } else if (nodesPerAttribute.get(attributeValue) > capacity) { - throw new IllegalStateException("Unexpected capacity for attribute value :" + attributeValue + "expected 
: " + capacity - + "found :" + nodesPerAttribute.get(attributeValue)); - } - } - if (underCapacityAttributeValues != null && underCapacityAttributeValues.size() == forcedAwarenessAttribute.size() - && forcedAwarenessAttribute.size() != 1) { - throw new IllegalStateException("Unexpected capacity for attribute :" + awarenessAttribute + "capacity" + capacity); - } - } else { - throw new IllegalStateException("Mismatch between forced awareness attribute :" + forcedAwarenessAttributes - + "and discovered nodes " + nodesPerAttribute); - } - return underCapacityAttributeValues; - } } diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java new file mode 100644 index 0000000000000..6c3f3b49dcc13 --- /dev/null +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java @@ -0,0 +1,117 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.cluster.routing.allocation.decider; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.routing.RoutingNode; +import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.allocation.RoutingAllocation; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Setting; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.settings.Setting.Property; + +import java.util.function.BiPredicate; + +/** + * This {@link NodeOverloadAwareAllocationDecider} controls shard over-allocation + * due to node failures or otherwise on the surviving nodes + *
+ * <pre>
+ * cluster.routing.allocation.overload_aware.capacity: N
+ * </pre>
+ * <p>
+ * and prevents allocation on the surviving nodes of the under-capacity cluster
+ * based on a skewness limit defined as a percentage by
+ * <pre>
+ * cluster.routing.allocation.overload_aware.limit: X
+ * </pre>
+ */ +public class NodeOverloadAwareAllocationDecider extends AllocationDecider { + + public static final String NAME = "overload_aware"; + + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING = + Setting.intSetting("cluster.routing.allocation.overload_aware.capacity", -1, -1, Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING = + Setting.intSetting("cluster.routing.allocation.overload_aware.limit", 50, -1, Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING = + Setting.boolSetting("cluster.routing.allocation.overload_aware.allow_unassigned_primaries", + true, Setting.Property.Dynamic, Property.NodeScope); + + private volatile int totalCapacity; + + private volatile int skewnessLimit; + + private volatile boolean allowUnassignedPrimaries; + + private static final Logger logger = LogManager.getLogger(NodeOverloadAwareAllocationDecider.class); + + public NodeOverloadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { + this.skewnessLimit = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.get(settings); + this.totalCapacity = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.get(settings); + this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING, + this::setSkewnessLimit); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING, + this::setTotalCapacity); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING, + this::setAllowUnassignedPrimaries); + } + + private void setAllowUnassignedPrimaries(boolean allowUnassignedPrimaries) { + 
this.allowUnassignedPrimaries = allowUnassignedPrimaries; + } + + private void setSkewnessLimit(int skewnessLimit) { + this.skewnessLimit = skewnessLimit; + } + + private void setTotalCapacity(int totalCapacity) { + this.totalCapacity = totalCapacity; + } + + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return underCapacity(shardRouting, node, allocation, (count, limit) -> count >= limit); + } + + @Override + public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return underCapacity(shardRouting, node, allocation, (count, limit) -> count > limit); + } + + private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, + BiPredicate decider) { + if (totalCapacity <= 0 || skewnessLimit < 0 ) { + return allocation.decision(Decision.YES, NAME, + "overload awareness allocation is not enabled, set cluster setting [%s] and cluster se=tting [%s] to enable it", + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey()); + } + if (shardRouting.unassigned() && shardRouting.primary() && allowUnassignedPrimaries) { + return allocation.decision(Decision.YES, NAME, + "overload allocation awareness is allowed for unassigned primaries, set cluster setting [%s] to disable it", + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey()); + } + Metadata metadata = allocation.metadata(); + float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / totalCapacity; + int nodeShardCount = node.numberOfOwningShards(); + logger.debug(() -> new ParameterizedMessage("Expected shards per node {}, current node shard count {}", + expectedAvgShardsPerNode, nodeShardCount)); + if (decider.test(nodeShardCount, (int) Math.ceil(expectedAvgShardsPerNode * (1 + skewnessLimit / 100.0)))) { + return 
allocation.decision(Decision.NO, NAME, + "too many shards [%d] allocated to this node, cluster setting [%s=%d] based on capacity [%s]", + nodeShardCount, CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), skewnessLimit, totalCapacity); + } + return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); + } +} diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index b9f8f09f42c1d..dd92311651d70 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -33,6 +33,7 @@ import org.apache.logging.log4j.LogManager; import org.opensearch.action.main.TransportMainAction; +import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction; import org.opensearch.action.admin.indices.close.TransportCloseIndexAction; @@ -220,9 +221,6 @@ public void apply(Settings value, Settings current, Settings previous) { TransportClient.CLIENT_TRANSPORT_IGNORE_CLUSTER_NAME, TransportClient.CLIENT_TRANSPORT_SNIFF, AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING, - AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_CAPACITY_GROUP_SETTING, - AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT, - AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING, BalancedShardsAllocator.INDEX_BALANCE_FACTOR_SETTING, BalancedShardsAllocator.SHARD_BALANCE_FACTOR_SETTING, BalancedShardsAllocator.THRESHOLD_SETTING, @@ -583,7 +581,10 @@ public void apply(Settings value, Settings current, Settings previous) { 
FsHealthService.REFRESH_INTERVAL_SETTING, FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING, TransportMainAction.OVERRIDE_MAIN_RESPONSE_VERSION, - IndexingPressure.MAX_INDEXING_BYTES))); + IndexingPressure.MAX_INDEXING_BYTES, + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING, + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING, + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( SniffConnectionStrategy.SEARCH_REMOTE_CLUSTER_SEEDS_UPGRADER, diff --git a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java index b9636472ffeb1..1314aa21abf6a 100644 --- a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java +++ b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java @@ -48,6 +48,7 @@ import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider; @@ -230,7 +231,8 @@ public void testAllocationDeciderOrder() { DiskThresholdDecider.class, ThrottlingAllocationDecider.class, ShardsLimitAllocationDecider.class, - AwarenessAllocationDecider.class); + AwarenessAllocationDecider.class, + NodeOverloadAwareAllocationDecider.class); Collection deciders = 
ClusterModule.createAllocationDeciders(Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), Collections.emptyList()); Iterator iter = deciders.iterator(); diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java index 33d3ad7f5aca7..fb2800090770a 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/AwarenessAllocationTests.java @@ -35,7 +35,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.Version; -import org.opensearch.cluster.ClusterName; import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.OpenSearchAllocationTestCase; import org.opensearch.cluster.metadata.IndexMetadata; @@ -44,7 +43,6 @@ import org.opensearch.cluster.routing.RoutingTable; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.ShardRoutingState; -import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.command.AllocationCommands; import org.opensearch.cluster.routing.allocation.command.CancelAllocationCommand; import org.opensearch.cluster.routing.allocation.command.MoveAllocationCommand; @@ -903,466 +901,4 @@ public void testMultipleAwarenessAttributes() { assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(2)); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); } - - public void testDisabledForcedAllocationPreventsOverload() { - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 21) - 
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) - .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) - .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put("cluster.routing.allocation.awareness.attributes", "zone") - .build()); - - logger.info("Building initial routing table for 'fullAwareness1'"); - - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(21).numberOfReplicas(2)) - .build(); - - RoutingTable initialRoutingTable = RoutingTable.builder() - .addAsNew(metadata.index("test")) - .build(); - - ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING - .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - - logger.info("--> adding three nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - .add(newNode("node3", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(21)); - - logger.info("--> start the shards (primaries)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> replica will not start because we have only one rack value"); - 
assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(21)); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); - - logger.info("--> add three new node with a new rack and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node4", singletonMap("zone", "zone_2"))) - .add(newNode("node5", singletonMap("zone", "zone_2"))) - .add(newNode("node6", singletonMap("zone", "zone_2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(21)); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(21)); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), - equalTo("node4")); - - logger.info("--> complete relocation"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(42)); - - logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); - - logger.info("--> add another node with a new rack, make sure nothing moves"); - - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node7", singletonMap("zone", "zone_3"))) - .add(newNode("node8", singletonMap("zone", "zone_3"))) - .add(newNode("node9", singletonMap("zone", "zone_3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - 
assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(63)); - - logger.info("--> Remove random node from zones holding all primary and all replicas"); - //remove two nodes in one zone to cause distribution zone1->3 , zone2->3, zone3->1 - newState = removeNode(newState, randomFrom("node1", "node7" ), strategy); - logger.info("--> Remove another random node from zones holding all primary and all replicas"); - newState = removeNode(newState, randomFrom("node2", "node8" ), strategy); - newState = strategy.reroute(newState, "reroute"); - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - //ensure minority zone doesn't get overloaded - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(49)); - assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(14)); - for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); - } - } - - public void testMoveShardDuringPartialFailureSkewnessLimitNotBreached(){ - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) - .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 5) - .put("cluster.routing.allocation.awareness.force.zone.values", 
"zone1,zone2,zone3") - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.getKey(), 20) - .build()); - - logger.info("Building initial routing table for 'testMoveShardDuringPartialFailureSkewnessLimitNotBreached'"); - - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) - .build(); - - RoutingTable initialRoutingTable = RoutingTable.builder() - .addAsNew(metadata.index("test")) - .build(); - - ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING - .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - - logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); - - logger.info("--> start the shards (primaries)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> replica will not start because we have only one zone value"); - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); - //replicas are unassigned - assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); - - logger.info("--> add five new node in new zone and reroute"); - clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node6", singletonMap("zone", "zone2"))) - .add(newNode("node7", singletonMap("zone", "zone2"))) - .add(newNode("node8", singletonMap("zone", "zone2"))) - .add(newNode("node9", singletonMap("zone", "zone2"))) - .add(newNode("node10", singletonMap("zone", "zone2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); - - logger.info("--> complete relocation"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(40)); - - logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); - - logger.info("--> add another five node in new zone and reroute"); - - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3"))) - .add(newNode("node12", singletonMap("zone", "zone3"))) - .add(newNode("node13", singletonMap("zone", "zone3"))) - .add(newNode("node14", singletonMap("zone", "zone3"))) - .add(newNode("node15", singletonMap("zone", "zone3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); - - logger.info("--> Remove one node from zone3 holding all primary and all replicas"); - - 
assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); - - // remove one nodes in one zone to cause distribution zone1->5 , zone2->5, zone3->4 - newState = removeNode(newState, "node11", strategy); - newState = strategy.reroute(newState, "reroute"); - - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - - assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); - - // //ensure all shards are assigned - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); - assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); - } - - public void testShardUnassignedDuringPartialFailureSkewnessLimitBreached(){ - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) - 
.put("cluster.routing.allocation.awareness.attribute.zone.capacity", 5) - .put("cluster.routing.allocation.awareness.force.zone.values", "zone1,zone2,zone3") - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_SKEWNESS_LIMIT.getKey(), 20) - .build()); - - logger.info("Building initial routing table for 'testShardUnassignedDuringPartialFailureSkewnessLimitBreached'"); - - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) - .build(); - - RoutingTable initialRoutingTable = RoutingTable.builder() - .addAsNew(metadata.index("test")) - .build(); - - ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING - .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - - logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); - - logger.info("--> start the shards (primaries)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> replica will not start because we have only one zone value"); - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); - //replicas are unassigned - 
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); - - logger.info("--> add five new node in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node6", singletonMap("zone", "zone2"))) - .add(newNode("node7", singletonMap("zone", "zone2"))) - .add(newNode("node8", singletonMap("zone", "zone2"))) - .add(newNode("node9", singletonMap("zone", "zone2"))) - .add(newNode("node10", singletonMap("zone", "zone2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); - - logger.info("--> complete relocation"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(40)); - - logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); - - logger.info("--> add another five node in new zone and reroute"); - - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3"))) - .add(newNode("node12", singletonMap("zone", "zone3"))) - .add(newNode("node13", singletonMap("zone", "zone3"))) - .add(newNode("node14", singletonMap("zone", "zone3"))) - .add(newNode("node15", singletonMap("zone", "zone3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - 
assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); - - logger.info("--> Remove one node from zone3 holding all primary and all replicas"); - - assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); - assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); - - // remove one nodes in one zone to cause distribution zone1->5 , zone2->5, zone3->4 - newState = removeNode(newState, "node11", strategy); - newState = strategy.reroute(newState, "reroute"); - - while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - newState = startInitializingShardsAndReroute(strategy, newState); - } - - assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); - - // ensure all shards are assigned - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); - assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); - - // remove one more node subsequently in one zone to cause distribution zone1->5 , zone2->5, zone3->3 - newState = removeNode(newState, "node12", strategy); - newState = strategy.reroute(newState, "reroute"); - - assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); - assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); - - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(55)); - 
assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(5)); - - for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); - } - } - - public void testSingleZoneReplicaUnassignedOnSkewnessWithThreeShardCopies() { - AllocationService strategy = createAllocationService(Settings.builder() - .put("cluster.routing.allocation.node_concurrent_recoveries", 10) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) - .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) - .put("cluster.routing.allocation.awareness.force.zone.values", "zone1") - .put("cluster.routing.allocation.awareness.attributes", "zone") - .build()); - - logger.info("Building initial routing table for 'testSingleZoneReplicaUnassignedOnSkewnessWithThreeShardCopies'"); - - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(3).numberOfReplicas(2)) - .build(); - - RoutingTable initialRoutingTable = RoutingTable.builder() - .addAsNew(metadata.index("test")) - .build(); - - ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING - .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - - logger.info("--> adding two nodes on same rack and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - 
).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); - - logger.info("--> start the shards (primaries)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> replicas are initializing"); - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3)); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(6)); - - logger.info("--> start the shards (replicas)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> all shards are started"); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(9)); - - logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); - - //remove one node to make zone1 skewed - clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); - clusterState = strategy.reroute(clusterState, "reroute"); - - while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - } - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(6)); - assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(3)); - - for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); - assertFalse(shard.primary()); - } - } - - public void testSingleZoneReplicaUnassignedOnSkewnessWithTwoShardCopies() { - AllocationService strategy = createAllocationService(Settings.builder() - 
.put("cluster.routing.allocation.node_concurrent_recoveries", 10) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCED_ALLOCATION_DISABLE_SETTING.getKey(), true) - .put("cluster.routing.allocation.awareness.attribute.zone.capacity", 3) - .put("cluster.routing.allocation.awareness.force.zone.values", "zone1") - .put("cluster.routing.allocation.awareness.attributes", "zone") - .build()); - - logger.info("Building initial routing table for 'testSingleZoneReplicaUnassignedOnSkewnessWithTwoShardCopies'"); - - Metadata metadata = Metadata.builder() - .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(3).numberOfReplicas(1)) - .build(); - - RoutingTable initialRoutingTable = RoutingTable.builder() - .addAsNew(metadata.index("test")) - .build(); - - ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING - .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - - logger.info("--> adding two nodes on same rack and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); - assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); - - logger.info("--> start the shards (primaries)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> replicas are initializing"); - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3)); - 
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); - - logger.info("--> start the shards (replicas)"); - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - logger.info("--> all shards are started"); - assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(6)); - - logger.info("--> do another reroute, make sure nothing moves"); - assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); - - //remove one node to make zone1 skewed - clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); - clusterState = strategy.reroute(clusterState, "reroute"); - - while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - } - clusterState = startInitializingShardsAndReroute(strategy, clusterState); - - assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(4)); - assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(2)); - - for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); - assertFalse(shard.primary()); - } - } - - private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { - return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) - .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); - } } diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java new file mode 100644 index 0000000000000..5bd21f6ca5540 --- 
/dev/null +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java @@ -0,0 +1,821 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.cluster.routing.allocation; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.Version; +import org.opensearch.cluster.ClusterName; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.OpenSearchAllocationTestCase; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.metadata.Metadata; +import org.opensearch.cluster.node.DiscoveryNodes; +import org.opensearch.cluster.routing.RoutingNode; +import org.opensearch.cluster.routing.RoutingTable; +import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.ShardRoutingState; +import org.opensearch.cluster.routing.UnassignedInfo; +import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider; +import org.opensearch.common.settings.Settings; + +import static java.util.Collections.singletonMap; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.sameInstance; +import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING; +import static org.opensearch.cluster.routing.ShardRoutingState.STARTED; +import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED; + +public class NodeOverloadAwareAllocationTests extends OpenSearchAllocationTestCase { + + private final Logger logger = LogManager.getLogger(NodeOverloadAwareAllocationTests.class); + + public void 
testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + true) + .build()); + + logger.info("Building initial routing table for 'testSingleZoneZeroReplicaUnassignedPrimaryAllocation'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + .add(newNode("node4", singletonMap("zone", "zone_1"))) + 
.add(newNode("node5", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + + logger.info("--> Remove node from zone holding primaries"); + ClusterState newState = removeNode(clusterState, "node1", strategy); + logger.info("--> Remove node from zone holding primaries"); + newState = removeNode(newState, "node2", strategy); + logger.info("--> Remove node from zone holding primaries"); + newState = removeNode(newState, "node3", strategy); + + logger.info("add another index with 20 shards"); + metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 20) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + )) + .build(); + RoutingTable updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test1")) + .build(); + + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); + + newState = startInitializingShardsAndReroute(strategy, newState); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(28)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(12)); + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + } + + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node1", singletonMap("zone", "zone_1")))) + .build(); + + newState 
= strategy.reroute(newState, "reroute"); + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); + + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(32)); + + //add back node2 when skewness is still breached + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node2", singletonMap("zone", "zone_1")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); + + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(36)); + + //add back node3 + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node3", singletonMap("zone", "zone_1")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); + + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + } + + public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + true) + .build()); + + logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(1)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + .add(newNode("node4", singletonMap("zone", "zone_1"))) + .add(newNode("node5", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + clusterState = 
startInitializingShardsAndReroute(strategy, clusterState); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + + logger.info("--> Remove node from zone"); + ClusterState newState = removeNode(clusterState, "node1", strategy); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + + for (RoutingNode node : newState.getRoutingNodes()) { + assertThat(node.size(), equalTo(10)); + } + + logger.info("--> Remove node from zone when the limit of overload is reached"); + newState = removeNode(newState, "node2", strategy); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(30)); + + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + assertFalse(shard.primary()); + } + + logger.info("add another index with 20 shards"); + metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 20) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1) + )) + .build(); + RoutingTable updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test1")) + .build(); + //increases avg shard per node to 80/5 = 16, overload factor 1.2, total allowed 20 + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); + + newState = startInitializingShardsAndReroute(strategy, newState); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); + + logger.info("add another index with 60 shards"); 
+ metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test2").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 60) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + )) + .build(); + updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test2")) + .build(); + //increases avg shard per node to 140/5 = 28, overload factor 1.2, total allowed 34 per node but still ALL primaries get assigned + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); + + newState = startInitializingShardsAndReroute(strategy, newState); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(120)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); + + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertFalse(shard.primary()); + } + + strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + false) + .build()); + + for (RoutingNode node : newState.getRoutingNodes()) { + 
assertThat(node.size(), equalTo(40)); + } + + logger.info("add another index with 5 shards"); + metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test3").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 5) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + )) + .build(); + updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test3")) + .build(); + //increases avg shard per node to 145/5 = 29, overload factor 1.2, total allowed 35 per node and NO primaries get assigned + //since total owning shards are 40 per node already + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); + + newState = startInitializingShardsAndReroute(strategy, newState); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(120)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(25)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream().filter(x -> x.primary()).count(), equalTo(5L)); + } + + public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + .put("cluster.routing.allocation.awareness.attributes", "zone") + 
.put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .build()); + + logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + .add(newNode("node5", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one zone value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + //replicas are unassigned + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); + + logger.info("--> add five new node in new zone and reroute"); + clusterState = 
ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node6", singletonMap("zone", "zone2"))) + .add(newNode("node7", singletonMap("zone", "zone2"))) + .add(newNode("node8", singletonMap("zone", "zone2"))) + .add(newNode("node9", singletonMap("zone", "zone2"))) + .add(newNode("node10", singletonMap("zone", "zone2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(40)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another five node in new zone and reroute"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3"))) + .add(newNode("node12", singletonMap("zone", "zone3"))) + .add(newNode("node13", singletonMap("zone", "zone3"))) + .add(newNode("node14", singletonMap("zone", "zone3"))) + .add(newNode("node15", singletonMap("zone", "zone3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + 
assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); + + logger.info("--> Remove three node from zone3 holding primary and replicas"); + //remove three nodes from zone3 to cause distribution zone1->5, zone2->5, zone3->2 + newState = removeNode(newState, "node11", strategy); + newState = removeNode(newState, "node12", strategy); + newState = removeNode(newState, "node13", strategy); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(5)); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node12", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); + + //add the removed node + newState = 
ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node13", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + //ensure all shards are assigned + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + } + + public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + true) + .build()); + + logger.info("Building initial routing table 
for 'testThreeZoneOneReplicaLimitsShardAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(30).numberOfReplicas(1)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + .add(newNode("node5", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(30)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one zone value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(30)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + //replicas are unassigned + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(30)); + + logger.info("--> add five new node in new zone and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node6", singletonMap("zone", "zone2"))) + .add(newNode("node7", singletonMap("zone", "zone2"))) + .add(newNode("node8", singletonMap("zone", "zone2"))) + 
.add(newNode("node9", singletonMap("zone", "zone2"))) + .add(newNode("node10", singletonMap("zone", "zone2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(30)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(25)); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(55)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another five node in new zone and reroute"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3"))) + .add(newNode("node12", singletonMap("zone", "zone3"))) + .add(newNode("node13", singletonMap("zone", "zone3"))) + .add(newNode("node14", singletonMap("zone", "zone3"))) + .add(newNode("node15", singletonMap("zone", "zone3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); + + 
logger.info("--> Remove three node from zone3"); + //remove three nodes from zone3 to cause distribution zone1->5, zone2->5, zone3->2 + newState = removeNode(newState, "node11", strategy); + newState = removeNode(newState, "node12", strategy); + newState = removeNode(newState, "node13", strategy); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node12", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node13", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + //ensure all shards are assigned + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + } + + 
public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() { + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 21) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 9) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + .build()); + + logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(21).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + 
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(21)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replica will not start because we have only one rack value"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + + logger.info("--> add three new node with a new rack and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node4", singletonMap("zone", "zone_2"))) + .add(newNode("node5", singletonMap("zone", "zone_2"))) + .add(newNode("node6", singletonMap("zone", "zone_2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(21)); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), + equalTo("node4")); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(42)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + logger.info("--> add another node with a new rack, make sure nothing moves"); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node7", singletonMap("zone", "zone_3"))) + .add(newNode("node8", singletonMap("zone", "zone_3"))) + 
.add(newNode("node9", singletonMap("zone", "zone_3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(63)); + + logger.info("--> Remove one node from zone1"); + //remove one node each from zone_3 (node7) and zone_1 (node2) to cause distribution zone1->2, zone2->3, zone3->2 + newState = removeNode(newState, "node7", strategy); + logger.info("--> Remove another node from zones2"); + newState = removeNode(newState, "node2", strategy); + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + //ensure minority zone doesn't get overloaded + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(53)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(10)); + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + } + + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node7", singletonMap("zone", "zone_3"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + ).build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + for (RoutingNode node : newState.getRoutingNodes()) { + assertThat(node.size(), equalTo(7)); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(63)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), 
equalTo(0)); + } + + public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { + AllocationService strategy = createAllocationService(Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 3) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .build()); + + logger.info("Building initial routing table for 'testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(3).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding two nodes on same rack and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + 
logger.info("--> replicas are initializing"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(3)); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(6)); + + logger.info("--> start the shards (replicas)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> all shards are started"); + assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(9)); + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + //remove one node to make zone1 skewed + clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); + clusterState = strategy.reroute(clusterState, "reroute"); + + while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + } + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(6)); + assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(3)); + + for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); + assertFalse(shard.primary()); + } + } + + public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { + AllocationService strategy = createAllocationService(Settings.builder() + .put("cluster.routing.allocation.node_concurrent_recoveries", 10) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) 
+ .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + true) + .build()); + + logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(1)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding two nodes on same rack and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + //skewness limit doesn't apply to primary + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + //assert replicas are not assigned but primaries are + logger.info("--> replicas are not initializing"); + 
assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); + + for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.INDEX_CREATED); + assertFalse(shard.primary()); + } + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); + + //add the third and fourth node + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(18)); + + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> replicas are started"); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(38)); + + for (ShardRouting shard : clusterState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.INDEX_CREATED); + assertFalse(shard.primary()); + } + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node5", singletonMap("zone", "zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + } + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + 
assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + + for (RoutingNode node : clusterState.getRoutingNodes()) { + assertThat(node.size(), equalTo(8)); + } + } + + private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { + return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) + .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); + } +} From e84197972a3c97f834f76ef4112e80abdbbd5141 Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Wed, 8 Sep 2021 13:13:57 +0530 Subject: [PATCH 4/8] Minor changes Signed-off-by: Bukhtawar Khan --- .../NodeOverloadAwareAllocationDecider.java | 64 ++++++++++--------- .../common/settings/ClusterSettings.java | 6 +- .../NodeOverloadAwareAllocationTests.java | 42 ++++++------ 3 files changed, 57 insertions(+), 55 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java index 6c3f3b49dcc13..6fcbdca150186 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java @@ -26,44 +26,46 @@ * This {@link NodeOverloadAwareAllocationDecider} controls shard over-allocation * due to node failures or otherwise on the surviving nodes *
- * cluster.routing.allocation.overload_aware.capacity: N
+ * cluster.routing.allocation.overload_awareness.provisioned_capacity: N
  * 
*

* and prevent allocation on the surviving nodes of the under capacity cluster - * based on a skewness limit defined as a percentage by + * based on oveload factor defined as a percentage by *

- * cluster.routing.allocation.overload_aware.limit: X
+ * cluster.routing.allocation.overload_awareness.factor: X
  * 
*/ public class NodeOverloadAwareAllocationDecider extends AllocationDecider { - public static final String NAME = "overload_aware"; + public static final String NAME = "overload_awareness"; - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING = - Setting.intSetting("cluster.routing.allocation.overload_aware.capacity", -1, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING = - Setting.intSetting("cluster.routing.allocation.overload_aware.limit", 50, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING = - Setting.boolSetting("cluster.routing.allocation.overload_aware.allow_unassigned_primaries", + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = + Setting.intSetting("cluster.routing.allocation.overload_awareness.provisioned_capacity", -1, -1, + Property.Dynamic, Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING = + Setting.intSetting("cluster.routing.allocation.overload_awareness.factor", 50, -1, Property.Dynamic, + Property.NodeScope); + public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = + Setting.boolSetting("cluster.routing.allocation.overload_awareness.allow_unassigned_primaries", true, Setting.Property.Dynamic, Property.NodeScope); - private volatile int totalCapacity; + private volatile int provisionedCapacity; - private volatile int skewnessLimit; + private volatile int overloadFactor; private volatile boolean allowUnassignedPrimaries; private static final Logger logger = LogManager.getLogger(NodeOverloadAwareAllocationDecider.class); public NodeOverloadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { - this.skewnessLimit = 
CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.get(settings); - this.totalCapacity = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.get(settings); - this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING, - this::setSkewnessLimit); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING, - this::setTotalCapacity); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING, + this.overloadFactor = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.get(settings); + this.provisionedCapacity = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.get(settings); + this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING, + this::setOverloadFactor); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, + this::setProvisionedCapacity); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING, this::setAllowUnassignedPrimaries); } @@ -71,12 +73,12 @@ private void setAllowUnassignedPrimaries(boolean allowUnassignedPrimaries) { this.allowUnassignedPrimaries = allowUnassignedPrimaries; } - private void setSkewnessLimit(int skewnessLimit) { - this.skewnessLimit = skewnessLimit; + private void setOverloadFactor(int overloadFactor) { + this.overloadFactor = overloadFactor; } - private void setTotalCapacity(int totalCapacity) { - this.totalCapacity = totalCapacity; + private void setProvisionedCapacity(int provisionedCapacity) { + 
this.provisionedCapacity = provisionedCapacity; } @Override @@ -91,26 +93,26 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, BiPredicate decider) { - if (totalCapacity <= 0 || skewnessLimit < 0 ) { + if (provisionedCapacity <= 0 || overloadFactor < 0 ) { return allocation.decision(Decision.YES, NAME, "overload awareness allocation is not enabled, set cluster setting [%s] and cluster se=tting [%s] to enable it", - CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), - CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey()); + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey()); } if (shardRouting.unassigned() && shardRouting.primary() && allowUnassignedPrimaries) { return allocation.decision(Decision.YES, NAME, "overload allocation awareness is allowed for unassigned primaries, set cluster setting [%s] to disable it", - CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey()); + CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey()); } Metadata metadata = allocation.metadata(); - float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / totalCapacity; + float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / provisionedCapacity; int nodeShardCount = node.numberOfOwningShards(); logger.debug(() -> new ParameterizedMessage("Expected shards per node {}, current node shard count {}", expectedAvgShardsPerNode, nodeShardCount)); - if (decider.test(nodeShardCount, (int) Math.ceil(expectedAvgShardsPerNode * (1 + skewnessLimit / 100.0)))) { + if (decider.test(nodeShardCount, (int) Math.ceil(expectedAvgShardsPerNode * (1 + overloadFactor / 100.0)))) { return allocation.decision(Decision.NO, NAME, 
"too many shards [%d] allocated to this node, cluster setting [%s=%d] based on capacity [%s]", - nodeShardCount, CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), skewnessLimit, totalCapacity); + nodeShardCount, CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), overloadFactor, provisionedCapacity); } return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index dd92311651d70..d4fddc84a2a3b 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -582,9 +582,9 @@ public void apply(Settings value, Settings current, Settings previous) { FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING, TransportMainAction.OVERRIDE_MAIN_RESPONSE_VERSION, IndexingPressure.MAX_INDEXING_BYTES, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING, + NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( SniffConnectionStrategy.SEARCH_REMOTE_CLUSTER_SEEDS_UPGRADER, diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java 
b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java index 5bd21f6ca5540..953ddb2c78330 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java @@ -44,10 +44,10 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -145,10 +145,10 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) 
.put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -250,9 +250,9 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) .build()); @@ -288,10 +288,10 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) .build()); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverload'"); @@ -426,11 +426,11 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 15) + 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -558,8 +558,8 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 9) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),9) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") .build()); @@ -665,9 +665,9 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { 
.put("cluster.routing.allocation.node_concurrent_recoveries", 10) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 3) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),3) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .build()); logger.info("Building initial routing table for 'testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload'"); @@ -732,10 +732,10 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_TOTAL_CAPACITY_SETTING.getKey(), 5) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_LIMIT_SETTING.getKey(), 10) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") - 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARE_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); From 537c24dc036230ba1f42549f8964c2876f73986d Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Thu, 9 Sep 2021 22:29:23 +0530 Subject: [PATCH 5/8] Revision Signed-off-by: Bukhtawar Khan --- .../org/opensearch/cluster/ClusterModule.java | 4 +- ...va => NodeLoadAwareAllocationDecider.java} | 49 +++--- .../common/settings/ClusterSettings.java | 8 +- .../cluster/ClusterModuleTests.java | 9 +- ...java => NodeLoadAwareAllocationTests.java} | 166 ++++++++++++++---- 5 files changed, 169 insertions(+), 67 deletions(-) rename server/src/main/java/org/opensearch/cluster/routing/allocation/decider/{NodeOverloadAwareAllocationDecider.java => NodeLoadAwareAllocationDecider.java} (69%) rename server/src/test/java/org/opensearch/cluster/routing/allocation/{NodeOverloadAwareAllocationTests.java => NodeLoadAwareAllocationTests.java} (82%) diff --git a/server/src/main/java/org/opensearch/cluster/ClusterModule.java b/server/src/main/java/org/opensearch/cluster/ClusterModule.java index 58ffe2c4d6ef5..fc93e1be134b9 100644 --- a/server/src/main/java/org/opensearch/cluster/ClusterModule.java +++ b/server/src/main/java/org/opensearch/cluster/ClusterModule.java @@ -63,7 +63,7 @@ import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider; -import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider; import 
org.opensearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider; @@ -257,7 +257,7 @@ public static Collection createAllocationDeciders(Settings se addAllocationDecider(deciders, new ThrottlingAllocationDecider(settings, clusterSettings)); addAllocationDecider(deciders, new ShardsLimitAllocationDecider(settings, clusterSettings)); addAllocationDecider(deciders, new AwarenessAllocationDecider(settings, clusterSettings)); - addAllocationDecider(deciders, new NodeOverloadAwareAllocationDecider(settings, clusterSettings)); + addAllocationDecider(deciders, new NodeLoadAwareAllocationDecider(settings, clusterSettings)); clusterPlugins.stream() .flatMap(p -> p.createAllocationDeciders(settings, clusterSettings).stream()) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java similarity index 69% rename from server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java rename to server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java index 6fcbdca150186..217b82a1a0947 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeOverloadAwareAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java @@ -23,30 +23,31 @@ import java.util.function.BiPredicate; /** - * This {@link NodeOverloadAwareAllocationDecider} controls shard over-allocation - * due to node failures or otherwise on the surviving nodes + * This {@link NodeLoadAwareAllocationDecider} controls shard over-allocation + * due to node failures or otherwise on the surviving nodes. 
The allocation limits + * are decided by the user provisioned capacity, to determine if there were lost nodes *
  * cluster.routing.allocation.overload_awareness.provisioned_capacity: N
  * 
*

* and prevent allocation on the surviving nodes of the under capacity cluster - * based on oveload factor defined as a percentage by + * based on overload factor defined as a percentage by *

  * cluster.routing.allocation.overload_awareness.factor: X
  * 
*/ -public class NodeOverloadAwareAllocationDecider extends AllocationDecider { +public class NodeLoadAwareAllocationDecider extends AllocationDecider { - public static final String NAME = "overload_awareness"; + public static final String NAME = "load_awareness"; - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = - Setting.intSetting("cluster.routing.allocation.overload_awareness.provisioned_capacity", -1, -1, + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = + Setting.intSetting("cluster.routing.allocation.load_awareness.provisioned_capacity", -1, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING = - Setting.intSetting("cluster.routing.allocation.overload_awareness.factor", 50, -1, Property.Dynamic, + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING = + Setting.intSetting("cluster.routing.allocation.load_awareness.factor", 50, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = - Setting.boolSetting("cluster.routing.allocation.overload_awareness.allow_unassigned_primaries", + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = + Setting.boolSetting("cluster.routing.allocation.load_awareness.allow_unassigned_primaries", true, Setting.Property.Dynamic, Property.NodeScope); private volatile int provisionedCapacity; @@ -55,17 +56,17 @@ public class NodeOverloadAwareAllocationDecider extends AllocationDecider { private volatile boolean allowUnassignedPrimaries; - private static final Logger logger = LogManager.getLogger(NodeOverloadAwareAllocationDecider.class); + private static final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationDecider.class); - public 
NodeOverloadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { - this.overloadFactor = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.get(settings); - this.provisionedCapacity = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.get(settings); - this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING, + public NodeLoadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { + this.overloadFactor = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.get(settings); + this.provisionedCapacity = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.get(settings); + this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING, this::setOverloadFactor); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, this::setProvisionedCapacity); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING, + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING, this::setAllowUnassignedPrimaries); } @@ -95,14 +96,14 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout BiPredicate decider) { if (provisionedCapacity <= 0 || overloadFactor < 0 ) { return allocation.decision(Decision.YES, NAME, - "overload awareness allocation is not enabled, set cluster setting [%s] and cluster se=tting [%s] to enable it", - 
CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), - CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey()); + "overload awareness allocation is not enabled, set cluster setting [%s] and cluster setting [%s] to enable it", + CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey()); } if (shardRouting.unassigned() && shardRouting.primary() && allowUnassignedPrimaries) { return allocation.decision(Decision.YES, NAME, "overload allocation awareness is allowed for unassigned primaries, set cluster setting [%s] to disable it", - CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey()); + CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey()); } Metadata metadata = allocation.metadata(); float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / provisionedCapacity; @@ -112,7 +113,7 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout if (decider.test(nodeShardCount, (int) Math.ceil(expectedAvgShardsPerNode * (1 + overloadFactor / 100.0)))) { return allocation.decision(Decision.NO, NAME, "too many shards [%d] allocated to this node, cluster setting [%s=%d] based on capacity [%s]", - nodeShardCount, CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), overloadFactor, provisionedCapacity); + nodeShardCount, CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), overloadFactor, provisionedCapacity); } return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index d4fddc84a2a3b..c02d40bcb53cb 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ 
b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -33,7 +33,7 @@ import org.apache.logging.log4j.LogManager; import org.opensearch.action.main.TransportMainAction; -import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction; import org.opensearch.action.admin.indices.close.TransportCloseIndexAction; @@ -582,9 +582,9 @@ public void apply(Settings value, Settings current, Settings previous) { FsHealthService.SLOW_PATH_LOGGING_THRESHOLD_SETTING, TransportMainAction.OVERRIDE_MAIN_RESPONSE_VERSION, IndexingPressure.MAX_INDEXING_BYTES, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING, - NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( SniffConnectionStrategy.SEARCH_REMOTE_CLUSTER_SEEDS_UPGRADER, diff --git a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java index 1314aa21abf6a..f51f0934fb674 100644 --- a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java +++ b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java @@ -48,7 +48,7 @@ 
import org.opensearch.cluster.routing.allocation.decider.EnableAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.FilterAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider; -import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider; @@ -70,12 +70,7 @@ import org.opensearch.gateway.GatewayAllocator; import org.opensearch.plugins.ClusterPlugin; import org.opensearch.test.gateway.TestGatewayAllocator; -import org.opensearch.cluster.ClusterInfoService; import org.opensearch.cluster.ClusterModule; -import org.opensearch.cluster.ClusterName; -import org.opensearch.cluster.ClusterState; -import org.opensearch.cluster.EmptyClusterInfoService; -import org.opensearch.cluster.RestoreInProgress; import java.util.Arrays; import java.util.Collection; @@ -232,7 +227,7 @@ public void testAllocationDeciderOrder() { ThrottlingAllocationDecider.class, ShardsLimitAllocationDecider.class, AwarenessAllocationDecider.class, - NodeOverloadAwareAllocationDecider.class); + NodeLoadAwareAllocationDecider.class); Collection deciders = ClusterModule.createAllocationDeciders(Settings.EMPTY, new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), Collections.emptyList()); Iterator iter = deciders.iterator(); diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java similarity index 82% rename from 
server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java rename to server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java index 953ddb2c78330..d9c0b3b40efb5 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeOverloadAwareAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java @@ -23,20 +23,22 @@ import org.opensearch.cluster.routing.ShardRoutingState; import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider; -import org.opensearch.cluster.routing.allocation.decider.NodeOverloadAwareAllocationDecider; +import org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider; import org.opensearch.common.settings.Settings; import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.oneOf; import static org.hamcrest.Matchers.sameInstance; import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.opensearch.cluster.routing.ShardRoutingState.STARTED; import static org.opensearch.cluster.routing.ShardRoutingState.UNASSIGNED; -public class NodeOverloadAwareAllocationTests extends OpenSearchAllocationTestCase { +public class NodeLoadAwareAllocationTests extends OpenSearchAllocationTestCase { - private final Logger logger = LogManager.getLogger(NodeOverloadAwareAllocationTests.class); + private final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationTests.class); public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { AllocationService strategy = createAllocationService(Settings.builder() @@ -44,10 +46,10 @@ public void 
testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -102,6 +104,7 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { newState = startInitializingShardsAndReroute(strategy, newState); + logger.info("no limits should be applied on newly create primaries"); assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(28)); assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(12)); for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { @@ -112,9 +115,8 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { .add(newNode("node1", singletonMap("zone", "zone_1")))) .build(); + //4 existing shards from this node's local store get started newState = strategy.reroute(newState, "reroute"); - 
assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); - newState = startInitializingShardsAndReroute(strategy, newState); assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(32)); @@ -123,9 +125,112 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { .add(newNode("node2", singletonMap("zone", "zone_1")))) .build(); newState = strategy.reroute(newState, "reroute"); - assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(36)); + + //add back node3 + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node3", singletonMap("zone", "zone_1")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + } + + public void testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedPrimaries() { + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + false) + .build()); + + logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedPrimaries'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + .add(newNode("node4", singletonMap("zone", "zone_1"))) + .add(newNode("node5", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + + logger.info("--> Remove node from zone holding primaries"); + ClusterState newState = removeNode(clusterState, "node1", strategy); + 
logger.info("--> Remove node from zone holding primaries"); + newState = removeNode(newState, "node2", strategy); + logger.info("--> Remove node from zone holding primaries"); + newState = removeNode(newState, "node3", strategy); + + logger.info("add another index with 20 shards"); + metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 20) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + )) + .build(); + RoutingTable updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test1")) + .build(); + + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); newState = startInitializingShardsAndReroute(strategy, newState); + + logger.info("no limits should be applied on newly create primaries"); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); + for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { + assertThat(shard.unassignedInfo().getReason(), is(oneOf(UnassignedInfo.Reason.NODE_LEFT, UnassignedInfo.Reason.INDEX_CREATED))); + } + + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node1", singletonMap("zone", "zone_1")))) + .build(); + + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(30)); + + //add back node2 when skewness is still breached + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node2", 
singletonMap("zone", "zone_1")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(36)); //add back node3 @@ -133,9 +238,10 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { .add(newNode("node3", singletonMap("zone", "zone_1")))) .build(); newState = strategy.reroute(newState, "reroute"); - assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(4)); - newState = startInitializingShardsAndReroute(strategy, newState); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); } @@ -145,11 +251,11 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - true) + 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + false) .build()); logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverload'"); @@ -250,9 +356,9 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) .build()); @@ -288,10 +394,10 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) .build()); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverload'"); @@ -426,11 +532,11 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -558,8 +664,8 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),9) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),9) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") .build()); @@ -665,9 +771,9 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { .put("cluster.routing.allocation.node_concurrent_recoveries", 10) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),3) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),3) .put("cluster.routing.allocation.awareness.attributes", "zone") - 
.put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .build()); logger.info("Building initial routing table for 'testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload'"); @@ -732,10 +838,10 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeOverloadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_OVERLOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); From 9262f66afd0466b80a782f06bffe738d0864772e Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Sat, 11 Sep 2021 22:49:59 +0530 Subject: [PATCH 6/8] Changes Signed-off-by: Bukhtawar Khan --- .../NodeLoadAwareAllocationDecider.java | 41 +- .../common/settings/ClusterSettings.java | 2 +- .../cluster/ClusterModuleTests.java | 5 + .../NodeLoadAwareAllocationTests.java | 369 ++++++++++++++++-- 4 files 
changed, 362 insertions(+), 55 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java index 217b82a1a0947..1250174ba66ae 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java @@ -33,7 +33,14 @@ * and prevent allocation on the surviving nodes of the under capacity cluster * based on overload factor defined as a percentage by *
- * cluster.routing.allocation.overload_awareness.factor: X
+ * cluster.routing.allocation.load_awareness.skew_factor: X
+ * 
+ * The total limit per node based on skew_factor doesn't limit primaries that previously + * existed on the disk as those shards are force allocated by + * {@link AllocationDeciders#canForceAllocatePrimary(ShardRouting, RoutingNode, RoutingAllocation)} + * however new primaries due to index creation, snapshot restore etc can be controlled via the below settings + *
+ * cluster.routing.allocation.load_awareness.allow_unassigned_primaries
  * 
*/ public class NodeLoadAwareAllocationDecider extends AllocationDecider { @@ -43,8 +50,8 @@ public class NodeLoadAwareAllocationDecider extends AllocationDecider { public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = Setting.intSetting("cluster.routing.allocation.load_awareness.provisioned_capacity", -1, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING = - Setting.intSetting("cluster.routing.allocation.load_awareness.factor", 50, -1, Property.Dynamic, + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING = + Setting.intSetting("cluster.routing.allocation.load_awareness.skew_factor", 50, -1, Property.Dynamic, Property.NodeScope); public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = Setting.boolSetting("cluster.routing.allocation.load_awareness.allow_unassigned_primaries", @@ -52,18 +59,18 @@ public class NodeLoadAwareAllocationDecider extends AllocationDecider { private volatile int provisionedCapacity; - private volatile int overloadFactor; + private volatile int skewFactor; private volatile boolean allowUnassignedPrimaries; private static final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationDecider.class); public NodeLoadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { - this.overloadFactor = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.get(settings); + this.skewFactor = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.get(settings); this.provisionedCapacity = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.get(settings); this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING, - 
this::setOverloadFactor); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING, + this::setSkewFactor); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, this::setProvisionedCapacity); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING, @@ -74,8 +81,8 @@ private void setAllowUnassignedPrimaries(boolean allowUnassignedPrimaries) { this.allowUnassignedPrimaries = allowUnassignedPrimaries; } - private void setOverloadFactor(int overloadFactor) { - this.overloadFactor = overloadFactor; + private void setSkewFactor(int skewFactor) { + this.skewFactor = skewFactor; } private void setProvisionedCapacity(int provisionedCapacity) { @@ -94,10 +101,10 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, BiPredicate decider) { - if (provisionedCapacity <= 0 || overloadFactor < 0 ) { + if (provisionedCapacity <= 0 || skewFactor < 0 ) { return allocation.decision(Decision.YES, NAME, "overload awareness allocation is not enabled, set cluster setting [%s] and cluster setting [%s] to enable it", - CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey()); } if (shardRouting.unassigned() && shardRouting.primary() && allowUnassignedPrimaries) { @@ -108,12 +115,14 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout Metadata metadata = allocation.metadata(); float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / provisionedCapacity; int nodeShardCount = node.numberOfOwningShards(); - logger.debug(() -> new ParameterizedMessage("Expected shards per node {}, 
current node shard count {}", - expectedAvgShardsPerNode, nodeShardCount)); - if (decider.test(nodeShardCount, (int) Math.ceil(expectedAvgShardsPerNode * (1 + overloadFactor / 100.0)))) { + int limit = (int) Math.ceil(expectedAvgShardsPerNode * (1 + skewFactor / 100.0)); + if (decider.test(nodeShardCount, limit)) { + logger.debug(() -> new ParameterizedMessage("Too many shards [{}] allocated to this node [{}]. Expected average shards" + + " per node [{}], overload factor [{}], node limit [{}]", nodeShardCount, node.nodeId(), expectedAvgShardsPerNode, + skewFactor, limit)); return allocation.decision(Decision.NO, NAME, - "too many shards [%d] allocated to this node, cluster setting [%s=%d] based on capacity [%s]", - nodeShardCount, CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), overloadFactor, provisionedCapacity); + "too many shards [%d] allocated to this node, limit per node [%d] for overload factor [%s] based on capacity [%d]", + nodeShardCount, limit, skewFactor, provisionedCapacity); } return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); } diff --git a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java index c02d40bcb53cb..0af78d033a2cb 100644 --- a/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/opensearch/common/settings/ClusterSettings.java @@ -583,7 +583,7 @@ public void apply(Settings value, Settings current, Settings previous) { TransportMainAction.OVERRIDE_MAIN_RESPONSE_VERSION, IndexingPressure.MAX_INDEXING_BYTES, NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, - NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING, 
NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING))); public static List> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList( diff --git a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java index f51f0934fb674..0066d6895eaa5 100644 --- a/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java +++ b/server/src/test/java/org/opensearch/cluster/ClusterModuleTests.java @@ -70,7 +70,12 @@ import org.opensearch.gateway.GatewayAllocator; import org.opensearch.plugins.ClusterPlugin; import org.opensearch.test.gateway.TestGatewayAllocator; +import org.opensearch.cluster.ClusterInfoService; import org.opensearch.cluster.ClusterModule; +import org.opensearch.cluster.ClusterName; +import org.opensearch.cluster.ClusterState; +import org.opensearch.cluster.EmptyClusterInfoService; +import org.opensearch.cluster.RestoreInProgress; import java.util.Arrays; import java.util.Collection; diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java index d9c0b3b40efb5..872f2b266111c 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java @@ -26,11 +26,11 @@ import org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider; import org.opensearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider; import org.opensearch.common.settings.Settings; +import org.opensearch.gateway.GatewayAllocator; +import org.opensearch.test.gateway.TestGatewayAllocator; import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.equalTo; -import static 
org.hamcrest.Matchers.is; -import static org.hamcrest.Matchers.oneOf; import static org.hamcrest.Matchers.sameInstance; import static org.opensearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.opensearch.cluster.routing.ShardRoutingState.STARTED; @@ -40,20 +40,19 @@ public class NodeLoadAwareAllocationTests extends OpenSearchAllocationTestCase { private final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationTests.class); - public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { + public void testNewUnassignedPrimaryAllocationOnOverload() { AllocationService strategy = createAllocationService(Settings.builder() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); - logger.info("Building initial routing table for 'testSingleZoneZeroReplicaUnassignedPrimaryAllocation'"); + logger.info("Building initial routing table for 'testNewUnassignedPrimaryAllocationOnOverload'"); Metadata metadata = Metadata.builder() 
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) @@ -104,7 +103,7 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { newState = startInitializingShardsAndReroute(strategy, newState); - logger.info("no limits should be applied on newly create primaries"); + logger.info("no limits should be applied on newly created primaries"); assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(28)); assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(12)); for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { @@ -142,20 +141,19 @@ public void testSingleZoneZeroReplicaUnassignedPrimaryAllocation() { assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); } - public void testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedPrimaries() { + public void testNoAllocationLimitsOnOverloadForDisabledLoadFactor() { AllocationService strategy = createAllocationService(Settings.builder() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), -1) .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) .build()); - logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedPrimaries'"); + logger.info("Building initial routing table for 'testNoAllocationLimitsOnOverloadForDisabledLoadFactor'"); Metadata metadata = Metadata.builder() .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) @@ -206,22 +204,23 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedP newState = startInitializingShardsAndReroute(strategy, newState); - logger.info("no limits should be applied on newly create primaries"); - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); + logger.info("no limits should be applied on newly created primaries"); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(28)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(12)); for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertThat(shard.unassignedInfo().getReason(), is(oneOf(UnassignedInfo.Reason.NODE_LEFT, UnassignedInfo.Reason.INDEX_CREATED))); + assertEquals(shard.unassignedInfo().getReason(), UnassignedInfo.Reason.NODE_LEFT); } newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) .add(newNode("node1", singletonMap("zone", "zone_1")))) .build(); + //4 existing shards from this node's local store get started newState = strategy.reroute(newState, "reroute"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = 
startInitializingShardsAndReroute(strategy, newState); } - assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(30)); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(32)); //add back node2 when skewness is still breached newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) @@ -245,18 +244,164 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverloadNoUnassignedP assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); } - public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { + public void testExistingPrimariesAllocationOnOverload() { + GatewayAllocator gatewayAllocator = new TestGatewayAllocator(); AllocationService strategy = createAllocationService(Settings.builder() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 50) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + false) + .build(), gatewayAllocator); + + logger.info("Building initial routing table for 'testExistingPrimariesAllocationOnOverload'"); + + Metadata metadata = Metadata.builder() + 
.put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding three nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone_1"))) + .add(newNode("node2", singletonMap("zone", "zone_1"))) + .add(newNode("node3", singletonMap("zone", "zone_1"))) + .add(newNode("node4", singletonMap("zone", "zone_1"))) + .add(newNode("node5", singletonMap("zone", "zone_1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); + + logger.info("--> Remove node1 from zone holding primaries"); + ClusterState newState = removeNode(clusterState, "node1", strategy); + logger.info("--> Remove node2 from zone holding primaries"); + newState = removeNode(newState, "node2", strategy); + logger.info("--> Remove node3 from zone holding primaries"); + newState = removeNode(newState, "node3", strategy); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(8)); + + logger.info("add another index with 20 shards"); + metadata = Metadata.builder(newState.metadata()) + .put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT) + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 20) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + )) + .build(); + 
RoutingTable updatedRoutingTable = RoutingTable.builder(newState.routingTable()) + .addAsNew(metadata.index("test1")) + .build(); + + newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); + newState = strategy.reroute(newState, "reroute"); + + newState = startInitializingShardsAndReroute(strategy, newState); + + logger.info("limits should be applied on newly create primaries"); + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(24)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(16)); + + assertEquals(12L, newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream(). + filter(r -> r.unassignedInfo().getReason() == UnassignedInfo.Reason.NODE_LEFT).count()); + + assertEquals(4L, newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream(). + filter(r -> r.unassignedInfo().getReason() == UnassignedInfo.Reason.INDEX_CREATED).count()); + + assertThat(newState.getRoutingNodes().node("node4").size(), equalTo(12)); + + logger.info("--> Remove node4 from zone holding primaries"); + newState = removeNode(newState, "node4", strategy); + + logger.info("--> change the overload load factor to zero and verify if unassigned primaries on disk get assigned despite overload"); + strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) .put("cluster.routing.allocation.awareness.attributes", 
"zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 0) .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) - .build()); + .build(), gatewayAllocator); + + newState = strategy.reroute(newState, "reroute"); + + logger.info("--> Add back node4 and ensure existing primaries are assigned"); + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node4", singletonMap("zone", "zone_1")))) + .build(); + + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + logger.info("--> do another reroute, make sure nothing moves"); + assertThat(strategy.reroute(newState, "reroute").routingTable(), sameInstance(newState.routingTable())); + + assertThat(newState.getRoutingNodes().node("node4").size(), equalTo(12)); + assertThat(newState.getRoutingNodes().node("node5").size(), equalTo(12)); + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(24)); + + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node1", singletonMap("zone", "zone_1")))) + .build(); + + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(32)); + + //add back node2 when skewness is still breached + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node2", singletonMap("zone", "zone_1")))) 
+ .build(); + newState = strategy.reroute(newState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(36)); + + //add back node3 + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node3", singletonMap("zone", "zone_1")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); + } + + public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { + GatewayAllocator gatewayAllocator = new TestGatewayAllocator(); + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), + true) + .build(), gatewayAllocator); logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverload'"); @@ -271,7 +416,7 @@ 
public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - logger.info("--> adding three nodes on same zone and do rerouting"); + logger.info("--> adding five nodes on same zone and do rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() .add(newNode("node1", singletonMap("zone", "zone_1"))) .add(newNode("node2", singletonMap("zone", "zone_1"))) @@ -290,18 +435,28 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { clusterState = startInitializingShardsAndReroute(strategy, clusterState); assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); - logger.info("--> Remove node from zone"); + logger.info("--> Remove node1 from zone"); ClusterState newState = removeNode(clusterState, "node1", strategy); - newState = startInitializingShardsAndReroute(strategy, newState); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); for (RoutingNode node : newState.getRoutingNodes()) { assertThat(node.size(), equalTo(10)); } - logger.info("--> Remove node from zone when the limit of overload is reached"); + logger.info("--> Remove node2 when the limit of overload is reached"); newState = removeNode(newState, "node2", strategy); - newState = startInitializingShardsAndReroute(strategy, newState); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + 
assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(30)); for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { @@ -342,25 +497,24 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { newState = ClusterState.builder(newState).metadata(metadata).routingTable(updatedRoutingTable).build(); newState = strategy.reroute(newState, "reroute"); - newState = startInitializingShardsAndReroute(strategy, newState); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(120)); assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); - for (ShardRouting shard : newState.getRoutingNodes().shardsWithState(UNASSIGNED)) { - assertFalse(shard.primary()); - } - + logger.info("change settings to allow unassigned primaries"); strategy = createAllocationService(Settings.builder() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) - .build()); + 
.build(), gatewayAllocator); for (RoutingNode node : newState.getRoutingNodes()) { assertThat(node.size(), equalTo(40)); @@ -397,7 +551,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) .build()); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverload'"); @@ -535,7 +689,7 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 20) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) .build()); @@ -665,7 +819,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),9) - 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") .build()); @@ -773,7 +927,7 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),3) .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) .build()); logger.info("Building initial routing table for 'testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload'"); @@ -839,7 +993,7 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_FACTOR_SETTING.getKey(), 10) + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) .put("cluster.routing.allocation.awareness.attributes", "zone") .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) @@ -920,6 +1074,145 @@ public 
void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { } } + public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure(){ + AllocationService strategy = createAllocationService(Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) + .put("cluster.routing.allocation.awareness.attributes", "zone") + .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) + .build()); + + logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsUnderFullZoneFailure'"); + + Metadata metadata = Metadata.builder() + .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(2)) + .build(); + + RoutingTable initialRoutingTable = RoutingTable.builder() + .addAsNew(metadata.index("test")) + .build(); + + ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING + .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); + + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() + .add(newNode("node1", singletonMap("zone", "zone1"))) + .add(newNode("node2", singletonMap("zone", "zone1"))) + .add(newNode("node3", singletonMap("zone", "zone1"))) + .add(newNode("node4", singletonMap("zone", "zone1"))) + .add(newNode("node5", singletonMap("zone", 
"zone1"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); + + logger.info("--> start the shards (primaries)"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + logger.info("--> add five new node in new zone and reroute"); + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node6", singletonMap("zone", "zone2"))) + .add(newNode("node7", singletonMap("zone", "zone2"))) + .add(newNode("node8", singletonMap("zone", "zone2"))) + .add(newNode("node9", singletonMap("zone", "zone2"))) + .add(newNode("node10", singletonMap("zone", "zone2"))) + ).build(); + clusterState = strategy.reroute(clusterState, "reroute"); + + logger.info("--> complete relocation"); + clusterState = startInitializingShardsAndReroute(strategy, clusterState); + + clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3"))) + .add(newNode("node12", singletonMap("zone", "zone3"))) + .add(newNode("node13", singletonMap("zone", "zone3"))) + .add(newNode("node14", singletonMap("zone", "zone3"))) + .add(newNode("node15", singletonMap("zone", "zone3"))) + ).build(); + ClusterState newState = strategy.reroute(clusterState, "reroute"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + 
assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); + + logger.info("--> Remove complete zone3 holding primary and replicas"); + newState = removeNode(newState, "node11", strategy); + newState = removeNode(newState, "node12", strategy); + newState = removeNode(newState, "node13", strategy); + newState = removeNode(newState, "node14", strategy); + newState = removeNode(newState, "node15", strategy); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(50)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(10)); + + for (RoutingNode node : newState.getRoutingNodes()) { + assertThat(node.size(), equalTo(5)); + } + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node11", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(5)); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node12", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); + newState = startInitializingShardsAndReroute(strategy, newState); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); + + //add the removed node + newState = 
ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node13", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node14", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + //add the removed node + newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) + .add(newNode("node15", singletonMap("zone", "zone3")))) + .build(); + newState = strategy.reroute(newState, "reroute"); + + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { + newState = startInitializingShardsAndReroute(strategy, newState); + } + assertThat(newState.getRoutingNodes().node("node13").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(4)); + assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(4)); + //ensure all shards are assigned + assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(60)); + assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); + } + private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); From 24f29eec3c7a220feeb44ba0aa5d35a9ce0ce9dd Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Fri, 17 Sep 2021 23:04:48 +0530 Subject: [PATCH 7/8] Review comments Signed-off-by: Bukhtawar Khan --- .../NodeLoadAwareAllocationDecider.java | 22 +- .../NodeLoadAwareAllocationTests.java | 500 ++++++------------ 2 files changed, 168 insertions(+), 354 deletions(-) diff --git 
a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java index 1250174ba66ae..4262a27d99d6e 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java @@ -25,7 +25,9 @@ /** * This {@link NodeLoadAwareAllocationDecider} controls shard over-allocation * due to node failures or otherwise on the surviving nodes. The allocation limits - * are decided by the user provisioned capacity, to determine if there were lost nodes + * are decided by the user provisioned capacity, to determine if there were lost nodes. + * The provisioned capacity as defined by the below settings needs to be updated on every + * cluster scale up and scale down operation. *
  * cluster.routing.allocation.load_awareness.provisioned_capacity: N
  * 
@@ -38,7 +40,10 @@ * The total limit per node based on skew_factor doesn't limit primaries that previously * existed on the disk as those shards are force allocated by * {@link AllocationDeciders#canForceAllocatePrimary(ShardRouting, RoutingNode, RoutingAllocation)} - * however new primaries due to index creation, snapshot restore etc can be controlled via the below settings + * however new primaries due to index creation, snapshot restore etc can be controlled via the below settings. + * Setting the value to true allows newly created primaries to get assigned while preventing the replica allocation + * from breaching the skew factor. + * Note that setting this to false can result in the primaries not getting assigned and the cluster turning RED *
  * cluster.routing.allocation.load_awareness.allow_unassigned_primaries
  * 
@@ -50,8 +55,8 @@ public class NodeLoadAwareAllocationDecider extends AllocationDecider { public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = Setting.intSetting("cluster.routing.allocation.load_awareness.provisioned_capacity", -1, -1, Property.Dynamic, Property.NodeScope); - public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING = - Setting.intSetting("cluster.routing.allocation.load_awareness.skew_factor", 50, -1, Property.Dynamic, + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING = + Setting.doubleSetting("cluster.routing.allocation.load_awareness.skew_factor", 50, -1, Property.Dynamic, Property.NodeScope); public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = Setting.boolSetting("cluster.routing.allocation.load_awareness.allow_unassigned_primaries", @@ -59,7 +64,7 @@ public class NodeLoadAwareAllocationDecider extends AllocationDecider { private volatile int provisionedCapacity; - private volatile int skewFactor; + private volatile double skewFactor; private volatile boolean allowUnassignedPrimaries; @@ -81,7 +86,7 @@ private void setAllowUnassignedPrimaries(boolean allowUnassignedPrimaries) { this.allowUnassignedPrimaries = allowUnassignedPrimaries; } - private void setSkewFactor(int skewFactor) { + private void setSkewFactor(double skewFactor) { this.skewFactor = skewFactor; } @@ -120,9 +125,8 @@ private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, Rout logger.debug(() -> new ParameterizedMessage("Too many shards [{}] allocated to this node [{}]. 
Expected average shards" + " per node [{}], overload factor [{}], node limit [{}]", nodeShardCount, node.nodeId(), expectedAvgShardsPerNode, skewFactor, limit)); - return allocation.decision(Decision.NO, NAME, - "too many shards [%d] allocated to this node, limit per node [%d] for overload factor [%s] based on capacity [%d]", - nodeShardCount, limit, skewFactor, provisionedCapacity); + return allocation.decision(Decision.NO, NAME, "too many shards [%d] allocated to this node, limit per node [%d] considering" + + " overload factor [%.2f] based on capacity [%d]", nodeShardCount, limit, skewFactor, provisionedCapacity); } return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); } diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java index 872f2b266111c..4ecfe3fd1cbda 100644 --- a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java +++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java @@ -29,6 +29,8 @@ import org.opensearch.gateway.GatewayAllocator; import org.opensearch.test.gateway.TestGatewayAllocator; +import java.util.Map; + import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.sameInstance; @@ -41,19 +43,13 @@ public class NodeLoadAwareAllocationTests extends OpenSearchAllocationTestCase { private final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationTests.class); public void testNewUnassignedPrimaryAllocationOnOverload() { - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - 
.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - true) - .build()); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) + ); logger.info("Building initial routing table for 'testNewUnassignedPrimaryAllocationOnOverload'"); - Metadata metadata = Metadata.builder() .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(20).numberOfReplicas(0)) .build(); @@ -65,27 +61,17 @@ public void testNewUnassignedPrimaryAllocationOnOverload() { ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - logger.info("--> adding three nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - 
.add(newNode("node3", singletonMap("zone", "zone_1"))) - .add(newNode("node4", singletonMap("zone", "zone_1"))) - .add(newNode("node5", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = addNodes(clusterState, strategy, "zone_1", "node1", "node2", "node3", "node4", "node5"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - logger.info("--> Remove node from zone holding primaries"); - ClusterState newState = removeNode(clusterState, "node1", strategy); - logger.info("--> Remove node from zone holding primaries"); - newState = removeNode(newState, "node2", strategy); - logger.info("--> Remove node from zone holding primaries"); - newState = removeNode(newState, "node3", strategy); + logger.info("--> Remove nodes from zone holding primaries"); + ClusterState newState = removeNodes(clusterState, strategy, "node1", "node2", "node3"); logger.info("add another index with 20 shards"); metadata = Metadata.builder(newState.metadata()) @@ -142,16 +128,11 @@ public void testNewUnassignedPrimaryAllocationOnOverload() { } public void testNoAllocationLimitsOnOverloadForDisabledLoadFactor() { - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - 
.put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), -1) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - false) - .build()); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), -1, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false) + ); logger.info("Building initial routing table for 'testNoAllocationLimitsOnOverloadForDisabledLoadFactor'"); @@ -166,27 +147,17 @@ public void testNoAllocationLimitsOnOverloadForDisabledLoadFactor() { ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - logger.info("--> adding three nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - .add(newNode("node3", singletonMap("zone", "zone_1"))) - .add(newNode("node4", singletonMap("zone", "zone_1"))) - .add(newNode("node5", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = addNodes(clusterState, strategy, "zone_1", "node1", "node2", 
"node3", "node4", "node5"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - logger.info("--> Remove node from zone holding primaries"); - ClusterState newState = removeNode(clusterState, "node1", strategy); - logger.info("--> Remove node from zone holding primaries"); - newState = removeNode(newState, "node2", strategy); - logger.info("--> Remove node from zone holding primaries"); - newState = removeNode(newState, "node3", strategy); + logger.info("--> Remove nodes from zone holding primaries"); + ClusterState newState = removeNodes(clusterState, strategy, "node1", "node2", "node3" ); logger.info("add another index with 20 shards"); metadata = Metadata.builder(newState.metadata()) @@ -246,16 +217,11 @@ public void testNoAllocationLimitsOnOverloadForDisabledLoadFactor() { public void testExistingPrimariesAllocationOnOverload() { GatewayAllocator gatewayAllocator = new TestGatewayAllocator(); - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 50) - 
.put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - false) - .build(), gatewayAllocator); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 50, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false), + gatewayAllocator); logger.info("Building initial routing table for 'testExistingPrimariesAllocationOnOverload'"); @@ -270,27 +236,16 @@ public void testExistingPrimariesAllocationOnOverload() { ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - logger.info("--> adding three nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - .add(newNode("node3", singletonMap("zone", "zone_1"))) - .add(newNode("node4", singletonMap("zone", "zone_1"))) - .add(newNode("node5", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + logger.info("--> adding five nodes on same zone and do rerouting"); + clusterState = addNodes(clusterState, strategy, "zone_1", "node1", "node2", "node3", "node4", "node5"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); logger.info("--> start the shards (primaries)"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(20)); - 
logger.info("--> Remove node1 from zone holding primaries"); - ClusterState newState = removeNode(clusterState, "node1", strategy); - logger.info("--> Remove node2 from zone holding primaries"); - newState = removeNode(newState, "node2", strategy); - logger.info("--> Remove node3 from zone holding primaries"); - newState = removeNode(newState, "node3", strategy); + logger.info("--> Remove nodes from zone holding primaries"); + ClusterState newState = removeNodes(clusterState, strategy, "node1", "node2", "node3"); assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(8)); @@ -323,20 +278,14 @@ public void testExistingPrimariesAllocationOnOverload() { assertThat(newState.getRoutingNodes().node("node4").size(), equalTo(12)); logger.info("--> Remove node4 from zone holding primaries"); - newState = removeNode(newState, "node4", strategy); + newState = removeNodes(newState, strategy,"node4"); logger.info("--> change the overload load factor to zero and verify if unassigned primaries on disk get assigned despite overload"); - strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 0) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), 
- false) - .build(), gatewayAllocator); + strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 0, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false), + gatewayAllocator); newState = strategy.reroute(newState, "reroute"); @@ -392,16 +341,11 @@ public void testExistingPrimariesAllocationOnOverload() { public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { GatewayAllocator gatewayAllocator = new TestGatewayAllocator(); - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - true) - .build(), gatewayAllocator); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + 
NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true), + gatewayAllocator); logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsShardAllocationOnOverload'"); @@ -417,14 +361,7 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - .add(newNode("node3", singletonMap("zone", "zone_1"))) - .add(newNode("node4", singletonMap("zone", "zone_1"))) - .add(newNode("node5", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone_1", "node1", "node2", "node3", "node4", "node5"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); logger.info("--> start the shards (primaries)"); @@ -436,8 +373,7 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { assertThat(clusterState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(40)); logger.info("--> Remove node1 from zone"); - ClusterState newState = removeNode(clusterState, "node1", strategy); - newState = strategy.reroute(newState, "reroute"); + ClusterState newState = removeNodes(clusterState, strategy, "node1"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -450,7 +386,7 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { } logger.info("--> Remove 
node2 when the limit of overload is reached"); - newState = removeNode(newState, "node2", strategy); + newState = removeNodes(newState, strategy, "node2"); newState = strategy.reroute(newState, "reroute"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { @@ -505,16 +441,11 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(20)); logger.info("change settings to allow unassigned primaries"); - strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - false) - .build(), gatewayAllocator); + strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), false), + gatewayAllocator); for (RoutingNode node : newState.getRoutingNodes()) { 
assertThat(node.size(), equalTo(40)); @@ -542,17 +473,12 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() { assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream().filter(x -> x.primary()).count(), equalTo(5L)); } - public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .build()); + public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload() { + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 15, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20, + "cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + ); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverload'"); @@ -568,14 +494,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ 
.getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone1", "node1", "node2", "node3", "node4", "node5"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); logger.info("--> start the shards (primaries)"); @@ -588,14 +507,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(40)); logger.info("--> add five new node in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node6", singletonMap("zone", "zone2"))) - .add(newNode("node7", singletonMap("zone", "zone2"))) - .add(newNode("node8", singletonMap("zone", "zone2"))) - .add(newNode("node9", singletonMap("zone", "zone2"))) - .add(newNode("node10", singletonMap("zone", "zone2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone2", "node6", "node7", "node8", "node9", "node10"); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(20)); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(20)); @@ -610,14 +522,8 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ logger.info("--> add another five node 
in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3"))) - .add(newNode("node12", singletonMap("zone", "zone3"))) - .add(newNode("node13", singletonMap("zone", "zone3"))) - .add(newNode("node14", singletonMap("zone", "zone3"))) - .add(newNode("node15", singletonMap("zone", "zone3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = addNodes(clusterState, strategy, "zone3", "node11", "node12", "node13", "node14", "node15"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); } @@ -630,11 +536,8 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); logger.info("--> Remove three node from zone3 holding primary and replicas"); - //remove one nodes in one zone to cause distribution zone1->5 , zone2->5, zone3->4 - newState = removeNode(newState, "node11", strategy); - newState = removeNode(newState, "node12", strategy); - newState = removeNode(newState, "node13", strategy); - newState = strategy.reroute(newState, "reroute"); + + newState = removeNodes(newState, strategy, "node11", "node12", "node13"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -644,30 +547,21 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); + newState = addNodes(newState, strategy, "zone3", 
"node11"); assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); newState = startInitializingShardsAndReroute(strategy, newState); assertThat(newState.getRoutingNodes().node("node11").size(), equalTo(5)); //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node12", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); + newState = addNodes(newState, strategy, "zone3", "node12"); assertThat(newState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(5)); newState = startInitializingShardsAndReroute(strategy, newState); assertThat(newState.getRoutingNodes().node("node12").size(), equalTo(5)); //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node13", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); + newState = addNodes(newState, strategy, "zone3", "node13"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -680,19 +574,13 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload(){ assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); } - public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), 
"always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - true) - .build()); + public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload() { + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 15, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true, + "cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + ); logger.info("Building initial routing table for 'testThreeZoneOneReplicaLimitsShardAllocationOnOverload'"); @@ -708,14 +596,7 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = 
addNodes(clusterState, strategy, "zone1", "node1", "node2", "node3", "node4", "node5"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(30)); logger.info("--> start the shards (primaries)"); @@ -728,14 +609,7 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(30)); logger.info("--> add five new node in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node6", singletonMap("zone", "zone2"))) - .add(newNode("node7", singletonMap("zone", "zone2"))) - .add(newNode("node8", singletonMap("zone", "zone2"))) - .add(newNode("node9", singletonMap("zone", "zone2"))) - .add(newNode("node10", singletonMap("zone", "zone2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone2", "node6", "node7", "node8", "node9", "node10"); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(30)); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(25)); @@ -750,14 +624,7 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ logger.info("--> add another five node in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3"))) - .add(newNode("node12", singletonMap("zone", "zone3"))) - .add(newNode("node13", singletonMap("zone", "zone3"))) - .add(newNode("node14", singletonMap("zone", "zone3"))) - .add(newNode("node15", singletonMap("zone", "zone3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = addNodes(clusterState, strategy, "zone3", "node11", "node12", "node13", "node14", 
"node15"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); } @@ -770,11 +637,8 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(4)); logger.info("--> Remove three node from zone3"); - //remove one nodes in one zone to cause distribution zone1->5 , zone2->5, zone3->4 - newState = removeNode(newState, "node11", strategy); - newState = removeNode(newState, "node12", strategy); - newState = removeNode(newState, "node13", strategy); - newState = strategy.reroute(newState, "reroute"); + + newState = removeNodes(newState, strategy, "node11", "node12", "node13"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -783,23 +647,8 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ assertThat(newState.getRoutingNodes().node("node14").size(), equalTo(5)); assertThat(newState.getRoutingNodes().node("node15").size(), equalTo(5)); - //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); - - //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node12", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); - - //add the removed node - newState = ClusterState.builder(newState).nodes(DiscoveryNodes.builder(newState.nodes()) - .add(newNode("node13", singletonMap("zone", "zone3")))) - .build(); - newState = strategy.reroute(newState, "reroute"); + //add the removed nodes + newState = addNodes(newState, strategy, "zone3", "node11", "node12", "node13"); while
(newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -813,16 +662,11 @@ public void testThreeZoneOneReplicaLimitsShardAllocationOnOverload(){ } public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() { - AllocationService strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 21) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 21) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 21) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),9) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put("cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") - .build()); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 9, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10, + "cluster.routing.allocation.awareness.force.zone.values", "zone_1,zone_2,zone_3") + ); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones'"); @@ -838,12 +682,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); 
logger.info("--> adding three nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone_1"))) - .add(newNode("node2", singletonMap("zone", "zone_1"))) - .add(newNode("node3", singletonMap("zone", "zone_1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone_1", "node1", "node2", "node3"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(21)); logger.info("--> start the shards (primaries)"); @@ -854,12 +693,7 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(0)); logger.info("--> add three new node with a new rack and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node4", singletonMap("zone", "zone_2"))) - .add(newNode("node5", singletonMap("zone", "zone_2"))) - .add(newNode("node6", singletonMap("zone", "zone_2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone_2", "node4", "node5", "node6"); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.STARTED).size(), equalTo(21)); assertThat(clusterState.getRoutingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(21)); @@ -876,23 +710,16 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() logger.info("--> add another node with a new rack, make sure nothing moves"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node7", singletonMap("zone", "zone_3"))) - .add(newNode("node8", singletonMap("zone", "zone_3"))) - .add(newNode("node9", singletonMap("zone", "zone_3"))) - 
).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = addNodes(clusterState, strategy, "zone_3", "node7", "node8", "node9"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); } assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(63)); - logger.info("--> Remove one node from zone1"); + logger.info("--> Remove two nodes from zones"); //remove one nodes in one zone to cause distribution zone1->2 , zone2->3, zone3->2 - newState = removeNode(newState, "node7", strategy); - logger.info("--> Remove another node from zones2"); - newState = removeNode(newState, "node2", strategy); - newState = strategy.reroute(newState, "reroute"); + newState = removeNodes(newState, strategy, "node7", "node2"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); } @@ -921,14 +748,10 @@ public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverloadAcrossZones() } public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { - AllocationService strategy = createAllocationService(Settings.builder() - .put("cluster.routing.allocation.node_concurrent_recoveries", 10) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 10) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),3) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) - .build()); + AllocationService strategy = 
createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 3, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) + ); logger.info("Building initial routing table for 'testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload'"); @@ -943,13 +766,9 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); - logger.info("--> adding two nodes on same rack and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + logger.info("--> adding three nodes on same rack and do rerouting"); + clusterState = addNodes(clusterState, strategy, "zone1", "node1", "node2", "node3"); + assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(3)); logger.info("--> start the shards (primaries)"); @@ -969,8 +788,7 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); //remove one node to make zone1 skewed - clusterState = removeNode(clusterState, randomFrom("node1", "node2", "node3"), strategy); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = removeNodes(clusterState, strategy, randomFrom("node1", "node2", "node3")); while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { 
clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -987,17 +805,11 @@ public void testSingleZoneTwoReplicaLimitsReplicaAllocationOnOverload() { } public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { - AllocationService strategy = createAllocationService(Settings.builder() - .put("cluster.routing.allocation.node_concurrent_recoveries", 10) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),5) - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), - true) - .build()); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 5, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 10, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey(), true) + ); logger.info("Building initial routing table for 'testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload'"); @@ -1013,11 +825,7 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("--> adding two nodes on same rack 
and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone1", "node1","node2"); //skewness limit doesn't apply to primary assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); @@ -1038,11 +846,7 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { assertThat(strategy.reroute(clusterState, "reroute").routingTable(), sameInstance(clusterState.routingTable())); //add the third and fourth node - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone1", "node3", "node4"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(18)); @@ -1056,10 +860,7 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { assertFalse(shard.primary()); } - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone1", "node5"); while (clusterState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { clusterState = startInitializingShardsAndReroute(strategy, clusterState); @@ -1075,15 +876,9 @@ public void testSingleZoneOneReplicaLimitsReplicaAllocationOnOverload() { } public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure(){ - AllocationService 
strategy = createAllocationService(Settings.builder() - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) - .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) - .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(),15) - .put("cluster.routing.allocation.awareness.attributes", "zone") - .put(NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20) - .build()); + AllocationService strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of( + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey(), 15, + NodeLoadAwareAllocationDecider.CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), 20)); logger.info("Building initial routing table for 'testThreeZoneTwoReplicaLimitsUnderFullZoneFailure'"); @@ -1099,13 +894,7 @@ public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure() .getDefault(Settings.EMPTY)).metadata(metadata).routingTable(initialRoutingTable).build(); logger.info("--> adding five nodes on same zone and do rerouting"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder() - .add(newNode("node1", singletonMap("zone", "zone1"))) - .add(newNode("node2", singletonMap("zone", "zone1"))) - .add(newNode("node3", singletonMap("zone", "zone1"))) - .add(newNode("node4", singletonMap("zone", "zone1"))) - .add(newNode("node5", singletonMap("zone", "zone1"))) - ).build(); + clusterState = addNodes(clusterState, strategy, "zone1", "node1", "node2", "node3", 
"node4", "node5"); clusterState = strategy.reroute(clusterState, "reroute"); assertThat(clusterState.getRoutingNodes().shardsWithState(INITIALIZING).size(), equalTo(20)); @@ -1113,26 +902,13 @@ public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure() clusterState = startInitializingShardsAndReroute(strategy, clusterState); logger.info("--> add five new node in new zone and reroute"); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node6", singletonMap("zone", "zone2"))) - .add(newNode("node7", singletonMap("zone", "zone2"))) - .add(newNode("node8", singletonMap("zone", "zone2"))) - .add(newNode("node9", singletonMap("zone", "zone2"))) - .add(newNode("node10", singletonMap("zone", "zone2"))) - ).build(); - clusterState = strategy.reroute(clusterState, "reroute"); + clusterState = addNodes(clusterState, strategy, "zone2", "node6", "node7", "node8", "node9", "node10"); logger.info("--> complete relocation"); clusterState = startInitializingShardsAndReroute(strategy, clusterState); - clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) - .add(newNode("node11", singletonMap("zone", "zone3"))) - .add(newNode("node12", singletonMap("zone", "zone3"))) - .add(newNode("node13", singletonMap("zone", "zone3"))) - .add(newNode("node14", singletonMap("zone", "zone3"))) - .add(newNode("node15", singletonMap("zone", "zone3"))) - ).build(); - ClusterState newState = strategy.reroute(clusterState, "reroute"); + ClusterState newState = addNodes(clusterState, strategy, "zone3", "node11", "node12", "node13", "node14", "node15"); + while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); } @@ -1146,12 +922,7 @@ public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure() assertThat(newState.getRoutingNodes().node("node15").size(), 
equalTo(4)); logger.info("--> Remove complete zone3 holding primary and replicas"); - newState = removeNode(newState, "node11", strategy); - newState = removeNode(newState, "node12", strategy); - newState = removeNode(newState, "node13", strategy); - newState = removeNode(newState, "node14", strategy); - newState = removeNode(newState, "node15", strategy); - newState = strategy.reroute(newState, "reroute"); + newState = removeNodes(newState, strategy, "node11", "node12", "node13", "node14", "node15"); while (newState.getRoutingNodes().shardsWithState(INITIALIZING).isEmpty() == false) { newState = startInitializingShardsAndReroute(strategy, newState); @@ -1213,8 +984,47 @@ public void testThreeZoneTwoReplicaLimitsReplicaAllocationUnderFullZoneFailure() assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(0)); } - private ClusterState removeNode(ClusterState clusterState, String nodeName, AllocationService allocationService) { + private ClusterState removeNodes(ClusterState clusterState, AllocationService allocationService, String... nodeIds) { + DiscoveryNodes.Builder nodeBuilder = DiscoveryNodes.builder(clusterState.getNodes()); + org.opensearch.common.collect.List.of(nodeIds).forEach(nodeId -> nodeBuilder.remove(nodeId)); return allocationService.disassociateDeadNodes(ClusterState.builder(clusterState) - .nodes(DiscoveryNodes.builder(clusterState.getNodes()).remove(nodeName)).build(), true, "reroute"); + .nodes(nodeBuilder).build(), true, "reroute"); + } + + private ClusterState addNodes(ClusterState clusterState, AllocationService allocationService, String zone, String... 
nodeIds) { + DiscoveryNodes.Builder nodeBuilder = DiscoveryNodes.builder(clusterState.nodes()); + org.opensearch.common.collect.List.of(nodeIds).forEach(nodeId -> nodeBuilder.add(newNode(nodeId, singletonMap("zone", zone)))); + clusterState = ClusterState.builder(clusterState).nodes(nodeBuilder).build(); + return allocationService.reroute(clusterState, "reroute"); + } + + private AllocationService createAllocationServiceWithAdditionalSettings(Map settingsValue) { + return createAllocationService(buildSettings(settingsValue)); + } + + private AllocationService createAllocationServiceWithAdditionalSettings(Map settingsValue, + GatewayAllocator gatewayAllocator) { + return createAllocationService(buildSettings(settingsValue), gatewayAllocator); + } + + private Settings buildSettings(Map settingsValue) { + Settings.Builder settingsBuilder = Settings.builder() + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING.getKey(), 20) + .put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_REPLICAS_RECOVERIES_SETTING.getKey(), 20) + .put(ClusterRebalanceAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ALLOW_REBALANCE_SETTING.getKey(), "always") + .put("cluster.routing.allocation.awareness.attributes", "zone"); + settingsValue.forEach((k, v) -> { + if (v instanceof Integer) + settingsBuilder.put(k, (Integer)(v)); + else if (v instanceof Boolean) + settingsBuilder.put(k, (Boolean)(v)); + else if (v instanceof String) + settingsBuilder.put(k, (String)(v)); + else { + throw new UnsupportedOperationException("Unsupported type for key :" + k); + } + }); + return settingsBuilder.build(); } } From 38cfe083d360027301e1074af7919bb19d155b19 Mon Sep 17 00:00:00 2001 From: Bukhtawar Khan Date: Mon, 20 Sep 2021 11:04:07 +0530 Subject: [PATCH 8/8] Nit pick Signed-off-by: Bukhtawar Khan --- 
.../allocation/decider/NodeLoadAwareAllocationDecider.java | 2 +- .../routing/allocation/NodeLoadAwareAllocationTests.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java index 4262a27d99d6e..f0318d986899d 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/NodeLoadAwareAllocationDecider.java @@ -26,7 +26,7 @@ * This {@link NodeLoadAwareAllocationDecider} controls shard over-allocation * due to node failures or otherwise on the surviving nodes. The allocation limits * are decided by the user provisioned capacity, to determine if there were lost nodes. - * The provisioned capacity as defined by the below settings needs to updated one every + * The provisioned capacity as defined by the below settings needs to be updated on every * cluster scale up and scale down operations. *
  * cluster.routing.allocation.overload_awareness.provisioned_capacity: N
diff --git a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java
index 4ecfe3fd1cbda..649625c9dfb08 100644
--- a/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java
+++ b/server/src/test/java/org/opensearch/cluster/routing/allocation/NodeLoadAwareAllocationTests.java
@@ -278,7 +278,7 @@ public void testExistingPrimariesAllocationOnOverload() {
         assertThat(newState.getRoutingNodes().node("node4").size(), equalTo(12));
 
         logger.info("--> Remove node4 from zone holding primaries");
-        newState = removeNodes(newState, strategy,"node4");
+        newState = removeNodes(newState, strategy, "node4");
 
         logger.info("--> change the overload load factor to zero and verify if unassigned primaries on disk get assigned despite overload");
         strategy = createAllocationServiceWithAdditionalSettings(org.opensearch.common.collect.Map.of(
@@ -470,7 +470,7 @@ public void testSingleZoneOneReplicaLimitsShardAllocationOnOverload() {
 
         assertThat(newState.getRoutingNodes().shardsWithState(STARTED).size(), equalTo(120));
         assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(25));
-        assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream().filter(x -> x.primary()).count(), equalTo(5L));
+        assertThat(newState.getRoutingNodes().shardsWithState(UNASSIGNED).stream().filter(ShardRouting::primary).count(), equalTo(5L));
     }
 
     public void testThreeZoneTwoReplicaLimitsShardAllocationOnOverload() {