Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325)

Added node closed exception to the retryable remote exceptions as it's possible to run into this exception instead of a connect exception when the master node is just shutting down but still responding to requests.
elastic · Jan 24, 2020 · db48029 · db48029
1 parent 3322df3
commit db48029
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 2 deletions.
diff --git a/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java b/server/src/main/java/org/elasticsearch/action/support/master/TransportMasterNodeAction.java
@@ -46,6 +46,7 @@
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.ConnectTransportException;
+import org.elasticsearch.transport.RemoteTransportException;
 import org.elasticsearch.transport.TransportException;
 import org.elasticsearch.transport.TransportService;
 
@@ -178,7 +179,8 @@ protected void doStart(ClusterState clusterState) {
                                 @Override
                                 public void handleException(final TransportException exp) {
                                     Throwable cause = exp.unwrapCause();
-                                    if (cause instanceof ConnectTransportException) {
+                                    if (cause instanceof ConnectTransportException ||
+                                        (exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
                                         // we want to retry here a bit to see if a new master is elected
                                         logger.debug("connection exception while trying to forward request with action name [{}] to " +
                                                 "master node [{}], scheduling a retry. Error: [{}]",

diff --git a/...src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java b/...src/test/java/org/elasticsearch/action/support/master/TransportMasterNodeActionTests.java
@@ -45,6 +45,7 @@
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.discovery.MasterNotDiscoveredException;
+import org.elasticsearch.node.NodeClosedException;
 import org.elasticsearch.rest.RestStatus;
 import org.elasticsearch.tasks.Task;
 import org.elasticsearch.test.ESTestCase;
@@ -392,7 +393,8 @@ public void testDelegateToFailingMaster() throws ExecutionException, Interrupted
         assertThat(capturedRequest.action, equalTo("internal:testAction"));
 
         if (rejoinSameMaster) {
-            transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error"));
+            transport.handleRemoteError(capturedRequest.requestId,
+                randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode));
             assertFalse(listener.isDone());
             if (randomBoolean()) {
                 // simulate master node removal

diff --git a/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java b/server/src/test/java/org/elasticsearch/cluster/ClusterHealthIT.java
@@ -28,7 +28,10 @@
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.InternalTestCluster;
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
@@ -294,4 +297,17 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
         assertFalse(healthResponseFuture.get().isTimedOut());
     }
 
+    public void testHealthOnMasterFailover() throws Exception {
+        final String node = internalCluster().startDataOnlyNode();
+        final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
+        // Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
+        // without exceptions
+        for (int i = 0; i < 20; ++i) {
+            responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute());
+            internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
+        }
+        for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
+            assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
+        }
+    }
 }