Skip to content

Commit

Permalink
Fix TransportMasterNodeAction not Retrying NodeClosedException (#51325)
Browse files Browse the repository at this point in the history
Added node closed exception to the retryable remote exceptions as it's possible to run into this exception instead of a connect exception when the master node is just shutting down but still responding to requests.
  • Loading branch information
original-brownbear committed Jan 24, 2020
1 parent 3322df3 commit db48029
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.elasticsearch.tasks.Task;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.RemoteTransportException;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportService;

Expand Down Expand Up @@ -178,7 +179,8 @@ protected void doStart(ClusterState clusterState) {
@Override
public void handleException(final TransportException exp) {
Throwable cause = exp.unwrapCause();
if (cause instanceof ConnectTransportException) {
if (cause instanceof ConnectTransportException ||
(exp instanceof RemoteTransportException && cause instanceof NodeClosedException)) {
// we want to retry here a bit to see if a new master is elected
logger.debug("connection exception while trying to forward request with action name [{}] to " +
"master node [{}], scheduling a retry. Error: [{}]",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.MasterNotDiscoveredException;
import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.test.ESTestCase;
Expand Down Expand Up @@ -392,7 +393,8 @@ public void testDelegateToFailingMaster() throws ExecutionException, Interrupted
assertThat(capturedRequest.action, equalTo("internal:testAction"));

if (rejoinSameMaster) {
transport.handleRemoteError(capturedRequest.requestId, new ConnectTransportException(masterNode, "Fake error"));
transport.handleRemoteError(capturedRequest.requestId,
randomBoolean() ? new ConnectTransportException(masterNode, "Fake error") : new NodeClosedException(masterNode));
assertFalse(listener.isDone());
if (randomBoolean()) {
// simulate master node removal
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.InternalTestCluster;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
Expand Down Expand Up @@ -294,4 +297,17 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
assertFalse(healthResponseFuture.get().isTimedOut());
}

public void testHealthOnMasterFailover() throws Exception {
final String node = internalCluster().startDataOnlyNode();
final List<ActionFuture<ClusterHealthResponse>> responseFutures = new ArrayList<>();
// Run a few health requests concurrent to master fail-overs against a data-node to make sure master failover is handled
// without exceptions
for (int i = 0; i < 20; ++i) {
responseFutures.add(client(node).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).execute());
internalCluster().restartNode(internalCluster().getMasterName(), InternalTestCluster.EMPTY_CALLBACK);
}
for (ActionFuture<ClusterHealthResponse> responseFuture : responseFutures) {
assertSame(responseFuture.get().getStatus(), ClusterHealthStatus.GREEN);
}
}
}

0 comments on commit db48029

Please sign in to comment.