diff --git a/docs/changelog/98550.yaml b/docs/changelog/98550.yaml new file mode 100644 index 0000000000000..30c9891b15182 --- /dev/null +++ b/docs/changelog/98550.yaml @@ -0,0 +1,5 @@ +pr: 98550 +summary: Report a node's "roles" setting in the /_cluster/allocation/explain response +area: Allocation +type: enhancement +issues: [97859] diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc index 3b079a5a4764c..d2b3649a7c15f 100644 --- a/docs/reference/cluster/allocation-explain.asciidoc +++ b/docs/reference/cluster/allocation-explain.asciidoc @@ -41,19 +41,19 @@ explanations for shard allocations in the cluster. For unassigned shards, the explain API provides an explanation for why the shard is unassigned. For assigned shards, the explain API provides an explanation for why the shard is remaining on its current node and has not moved or rebalanced to -another node. This API can be very useful when attempting to diagnose why a -shard is unassigned or why a shard continues to remain on its current node when +another node. This API can be very useful when attempting to diagnose why a +shard is unassigned or why a shard continues to remain on its current node when you might expect otherwise. [[cluster-allocation-explain-api-query-params]] ==== {api-query-parms-title} `include_disk_info`:: - (Optional, Boolean) If `true`, returns information about disk usage and + (Optional, Boolean) If `true`, returns information about disk usage and shard sizes. Defaults to `false`. - + `include_yes_decisions`:: - (Optional, Boolean) If `true`, returns 'YES' decisions in explanation. + (Optional, Boolean) If `true`, returns 'YES' decisions in explanation. Defaults to `false`. [[cluster-allocation-explain-api-request-body]] @@ -65,15 +65,15 @@ you might expect otherwise. parameter. 
`index`:: - (Optional, string) Specifies the name of the index that you would like an + (Optional, string) Specifies the name of the index that you would like an explanation for. `primary`:: - (Optional, Boolean) If `true`, returns explanation for the primary shard + (Optional, Boolean) If `true`, returns explanation for the primary shard for the given shard ID. `shard`:: - (Optional, integer) Specifies the ID of the shard that you would like an + (Optional, integer) Specifies the ID of the shard that you would like an explanation for. [[cluster-allocation-explain-api-examples]] @@ -130,6 +130,7 @@ node. "node_id" : "8qt2rY-pT6KNZB3-hGfLnw", "node_name" : "node-0", "transport_address" : "127.0.0.1:9401", + "roles" : ["data", "data_cold", "data_content", "data_frozen", "data_hot", "data_warm", "ingest", "master", "ml", "remote_cluster_client", "transform"], "node_attributes" : {}, "node_decision" : "no", <4> "weight_ranking" : 1, @@ -147,6 +148,7 @@ node. // TESTRESPONSE[s/"at" : "[^"]*"/"at" : $body.$_path/] // TESTRESPONSE[s/"node_id" : "[^"]*"/"node_id" : $body.$_path/] // TESTRESPONSE[s/"transport_address" : "[^"]*"/"transport_address" : $body.$_path/] +// TESTRESPONSE[s/"roles" : \[("[a-z_]*",\s*)*("[a-z_]*")\]/"roles" : $body.$_path/] // TESTRESPONSE[s/"node_attributes" : \{\}/"node_attributes" : $body.$_path/] <1> The current state of the shard. @@ -207,12 +209,14 @@ unassigned due to <>. "node_id" : "pmnHu_ooQWCPEFobZGbpWw", "node_name" : "node_t2", "transport_address" : "127.0.0.1:9402", + "roles" : ["data_content", "data_hot"], "node_decision" : "yes" }, { "node_id" : "3sULLVJrRneSg0EfBB-2Ew", "node_name" : "node_t0", "transport_address" : "127.0.0.1:9400", + "roles" : ["data_content", "data_hot"], "node_decision" : "no", "store" : { <3> "matching_size" : "4.2kb", @@ -251,7 +255,8 @@ and must be reallocated. 
"current_node" : { "id" : "8lWJeJ7tSoui0bxrwuNhTA", "name" : "node_t1", - "transport_address" : "127.0.0.1:9401" + "transport_address" : "127.0.0.1:9401", + "roles" : ["data_content", "data_hot"] }, "can_remain_on_current_node" : "no", <1> "can_remain_decisions" : [ <2> @@ -268,6 +273,7 @@ and must be reallocated. "node_id" : "_P8olZS8Twax9u6ioN-GGA", "node_name" : "node_t0", "transport_address" : "127.0.0.1:9400", + "roles" : ["data_content", "data_hot"], "node_decision" : "no", "weight_ranking" : 1, "deciders" : [ @@ -302,6 +308,7 @@ cluster balance. "id" : "wLzJm4N4RymDkBYxwWoJsg", "name" : "node_t0", "transport_address" : "127.0.0.1:9400", + "roles" : ["data_content", "data_hot"], "weight_ranking" : 1 }, "can_remain_on_current_node" : "yes", @@ -313,6 +320,7 @@ cluster balance. "node_id" : "oE3EGFc8QN-Tdi5FFEprIA", "node_name" : "node_t1", "transport_address" : "127.0.0.1:9401", + "roles" : ["data_content", "data_hot"], "node_decision" : "worse_balance", <3> "weight_ranking" : 1 } diff --git a/docs/reference/tab-widgets/troubleshooting/data/diagnose-unassigned-shards.asciidoc b/docs/reference/tab-widgets/troubleshooting/data/diagnose-unassigned-shards.asciidoc index 1dbc9e979a1d4..983e7b9cf5826 100644 --- a/docs/reference/tab-widgets/troubleshooting/data/diagnose-unassigned-shards.asciidoc +++ b/docs/reference/tab-widgets/troubleshooting/data/diagnose-unassigned-shards.asciidoc @@ -20,12 +20,12 @@ In order to diagnose the unassigned shards, follow the next steps: . Log in to the {ess-console}[{ecloud} console]. + -. On the **Elasticsearch Service** panel, click the name of your deployment. +. On the **Elasticsearch Service** panel, click the name of your deployment. + NOTE: If the name of your deployment is disabled your {kib} instances might be unhealthy, in which case please contact https://support.elastic.co[Elastic Support]. 
-If your deployment doesn't include {kib}, all you need to do is +If your deployment doesn't include {kib}, all you need to do is {cloud}/ec-access-kibana.html[enable it first]. . Open your deployment's side navigation menu (placed under the Elastic logo in the upper left corner) @@ -106,6 +106,7 @@ The response will look like this: "node_id" : "8qt2rY-pT6KNZB3-hGfLnw", "node_name" : "node-0", "transport_address" : "127.0.0.1:9401", + "roles": ["data_content", "data_hot"], "node_attributes" : {}, "node_decision" : "no", <4> "weight_ranking" : 1, @@ -151,7 +152,7 @@ settings>> and <> APIs to the correct values in order to allow the index to be allocated. For more guidance on fixing the most common causes for unassinged shards please follow -<> or contact https://support.elastic.co[Elastic Support]. +<> or contact https://support.elastic.co[Elastic Support]. //end::kibana-api-ex[] // end::cloud[] @@ -231,6 +232,7 @@ The response will look like this: "node_id" : "8qt2rY-pT6KNZB3-hGfLnw", "node_name" : "node-0", "transport_address" : "127.0.0.1:9401", + "roles": ["data_content", "data_hot"], "node_attributes" : {}, "node_decision" : "no", <4> "weight_ranking" : 1, @@ -276,7 +278,7 @@ settings>> and <> APIs to the correct values in order to allow the index to be allocated. For more guidance on fixing the most common causes for unassinged shards please follow -<>. +<>. 
// end::self-managed[] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.allocation_explain/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.allocation_explain/10_basic.yml index 7cab055244a6e..1f0e2b6fd727c 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.allocation_explain/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/cluster.allocation_explain/10_basic.yml @@ -92,3 +92,21 @@ - is_true: can_rebalance_cluster - is_true: can_rebalance_to_other_node - is_true: rebalance_explanation + +--- +"Cluster allocation explanation response includes node's roles": + - skip: + version: " - 8.10.99" + reason: The roles field was introduced in 8.11.0 + + - do: + indices.create: + index: test + + - match: { acknowledged: true } + + - do: + cluster.allocation_explain: + body: { "index": "test", "shard": 0, "primary": true } + + - is_true: current_node.roles diff --git a/server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java b/server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java index 1a7c0ce13eccf..1a5b984ecf34e 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainIT.java @@ -15,6 +15,7 @@ import org.elasticsearch.cluster.health.ClusterHealthStatus; import org.elasticsearch.cluster.metadata.IndexMetadata; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus; @@ -1150,6 +1151,67 @@ public void 
testCannotAllocateStaleReplicaExplanation() throws Exception { } } + public void testExplainRolesOutput() throws Exception { + logger.info("--> Starting first node with \"roles\": [\"master\", \"data_hot\", \"ingest\"]"); + List<String> firstNodeRoles = List.of("data_hot", "ingest", "master"); + Settings firstNodeSettings = Settings.builder().putList("node.roles", firstNodeRoles).build(); + internalCluster().startNode(firstNodeSettings); + + logger.info("--> Creating an index on the first node"); + prepareIndex(1, 0); + + logger.info("--> Starting a second node, which won't have the index, with \"roles\": [\"data_cold\", \"data_frozen\"]"); + List<String> secondNodeRoles = List.of("data_cold", "data_frozen"); + Settings secondNodeSettings = Settings.builder().putList("node.roles", secondNodeRoles).build(); + internalCluster().startNode(secondNodeSettings); + + boolean includeYesDecisions = randomBoolean(); + boolean includeDiskInfo = randomBoolean(); + ClusterAllocationExplanation explanation = runExplain(true, includeYesDecisions, includeDiskInfo); + + assertEquals( + Set.of(DiscoveryNodeRole.DATA_HOT_NODE_ROLE, DiscoveryNodeRole.INGEST_ROLE, DiscoveryNodeRole.MASTER_ROLE), + explanation.getCurrentNode().getRoles() + ); + + try (XContentParser parser = getParser(explanation)) { + // Fast-forward to the "current_node" object, which contains "roles". + do { + parser.nextToken(); + assertNotEquals(Token.END_OBJECT, parser.currentToken()); + // START_OBJECT has a null currentName(), so check for that before de-referencing. + } while (parser.currentName() == null || (parser.currentName().equals("current_node")) == false); + assertEquals(Token.START_OBJECT, parser.nextToken()); + + // Fast-forward to "roles" field in the "current_node" object. 
+ do { + parser.nextToken(); + assertNotEquals(Token.END_OBJECT, parser.currentToken()); + } while ((parser.currentName().equals("roles")) == false); + + // Check that the "roles" reported are those explicitly set via Settings for the first node, which possesses the shard. + // Note: list() implicitly consumes the parser START_ARRAY and END_ARRAY tokens. + assertEquals(firstNodeRoles, parser.list()); + + // Fast-forward to the "node_allocation_decisions" object, which contains "roles". + do { + parser.nextToken(); + // START_OBJECT has a null currentName(), so check for that before de-referencing. + } while (parser.currentName() == null || (parser.currentName().equals("node_allocation_decisions")) == false); + assertEquals(Token.START_ARRAY, parser.nextToken()); + assertEquals(Token.START_OBJECT, parser.nextToken()); + + // Fast-forward to "roles" field in the "node_allocation_decisions" object. + do { + parser.nextToken(); + assertNotEquals(Token.END_OBJECT, parser.currentToken()); + } while ((parser.currentName().equals("roles")) == false); + + // Check that the "roles" reported are those explicitly set via Settings for the second node, which does not possess the shard. 
+ assertEquals(secondNodeRoles, parser.list()); + } + } + private void verifyClusterInfo(ClusterInfo clusterInfo, boolean includeDiskInfo, int numNodes) { if (includeDiskInfo) { assertThat(clusterInfo.getNodeMostAvailableDiskUsages().size(), greaterThanOrEqualTo(0)); @@ -1309,8 +1371,11 @@ private void verifyShardInfo(XContentParser parser, boolean primary, boolean inc parser.currentName().equals("id") || parser.currentName().equals("name") || parser.currentName().equals("transport_address") + || parser.currentName().equals("roles") || parser.currentName().equals("weight_ranking") ); + } else if (token == Token.START_ARRAY || token == Token.END_ARRAY) { + assertEquals("roles", parser.currentName()); } else { assertTrue(token.isValue()); assertNotNull(parser.text()); @@ -1436,6 +1501,10 @@ private String verifyNodeDecisionPrologue(XContentParser parser) throws IOExcept parser.nextToken(); assertNotNull(parser.text()); parser.nextToken(); + assertEquals("roles", parser.currentName()); + parser.nextToken(); + assertNotEquals(0, parser.list().size()); + parser.nextToken(); assertEquals("node_decision", parser.currentName()); parser.nextToken(); return nodeName; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AbstractAllocationDecision.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AbstractAllocationDecision.java index 71a9501c4c623..bbfdf856cf787 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AbstractAllocationDecision.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/AbstractAllocationDecision.java @@ -9,6 +9,7 @@ package org.elasticsearch.cluster.routing.allocation; import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; import org.elasticsearch.cluster.routing.allocation.decider.Decision.Type; import org.elasticsearch.common.io.stream.StreamInput; import 
org.elasticsearch.common.io.stream.StreamOutput; @@ -110,6 +111,11 @@ public static XContentBuilder discoveryNodeToXContent(DiscoveryNode node, boolea } builder.endObject(); } + builder.startArray("roles"); + for (DiscoveryNodeRole role : node.getRoles()) { + builder.value(role.roleName()); + } + builder.endArray(); return builder; } diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainActionTests.java b/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainActionTests.java index 61599b00e293a..70df808ae1f08 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplainActionTests.java @@ -35,6 +35,7 @@ import java.time.Instant; import java.util.Collections; import java.util.Locale; +import java.util.stream.Collectors; import static org.elasticsearch.action.admin.cluster.allocation.TransportClusterAllocationExplainAction.findShardToExplain; import static org.hamcrest.Matchers.allOf; @@ -109,6 +110,7 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing cae.getCurrentNode().getId(), cae.getCurrentNode().getName(), cae.getCurrentNode().getAddress(), + cae.getCurrentNode().getRoles().stream().map(r -> '"' + r.roleName() + '"').collect(Collectors.joining(", ", "[", "]")), explanation }; assertEquals(XContentHelper.stripWhitespace(Strings.format(""" { @@ -120,7 +122,8 @@ public ShardAllocationDecision decideShardAllocation(ShardRouting shard, Routing "current_node": { "id": "%s", "name": "%s", - "transport_address": "%s" + "transport_address": "%s", + "roles": %s }, "explanation": "%s" }""", args)), Strings.toString(builder)); diff --git a/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java 
b/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java index 0e1df7643dc38..4243a943c6761 100644 --- a/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java +++ b/server/src/test/java/org/elasticsearch/action/admin/cluster/allocation/ClusterAllocationExplanationTests.java @@ -93,6 +93,7 @@ public void testExplanationToXContent() throws Exception { "id": "node-0", "name": "", "transport_address": "%s", + "roles": [], "weight_ranking": 3 }, "can_remain_on_current_node": "yes", @@ -123,6 +124,7 @@ public void testRandomShardExplanationToXContent() throws Exception { "id": "node-0", "name": "", "transport_address": "%s", + "roles": [], "weight_ranking": 3 }, "can_remain_on_current_node": "yes",