From 325d2ed8c0e5aa9e972447cde05a9a655c645646 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Fri, 15 Nov 2019 10:31:14 -0700 Subject: [PATCH] Handle failure to retrieve ILM policy step better This commit wraps the calls to retrieve the current step in a try/catch so that the exception does not bubble up. Instead, step info is added containing the exception to the existing step. Semi-related to #49128 --- .../xpack/ilm/IndexLifecycleRunner.java | 39 +++++++-- .../xpack/ilm/IndexLifecycleRunnerTests.java | 79 +++++++++++++++++++ 2 files changed, 110 insertions(+), 8 deletions(-) diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java index e29f6d6c07965..6ee59368ae7a5 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunner.java @@ -104,7 +104,14 @@ boolean isReadyToTransitionToThisPhase(final String policy, final IndexMetaData public void runPeriodicStep(String policy, IndexMetaData indexMetaData) { String index = indexMetaData.getIndex().getName(); LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData); - Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + final Step currentStep; + try { + currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + } catch (Exception e) { + markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e); + return; + } + if (currentStep == null) { if (stepRegistry.policyExists(policy) == false) { markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState); @@ -194,7 +201,13 @@ public void onFailure(String source, Exception e) { public void maybeRunAsyncAction(ClusterState currentState, IndexMetaData indexMetaData, String policy, StepKey expectedStepKey) { String index = indexMetaData.getIndex().getName(); LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData); - Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + final Step currentStep; + try { + currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + } catch (Exception e) { + markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e); + return; + } if (currentStep == null) { logger.warn("current step [{}] for index [{}] with policy [{}] is not recognized", getCurrentStepKey(lifecycleState), index, policy); @@ -237,7 +250,13 @@ public void onFailure(Exception e) { public void runPolicyAfterStateChange(String policy, IndexMetaData indexMetaData) { String index = indexMetaData.getIndex().getName(); LifecycleExecutionState lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData); - Step currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + final Step currentStep; + try { + currentStep = getCurrentStep(stepRegistry, policy, indexMetaData, lifecycleState); + } catch (Exception e) { + markPolicyRetrievalError(policy, indexMetaData.getIndex(), lifecycleState, e); + return; + } if (currentStep == null) { if (stepRegistry.policyExists(policy) == false) { markPolicyDoesNotExist(policy, indexMetaData.getIndex(), lifecycleState); @@ -596,10 +615,14 @@ private static IndexMetaData.Builder removePolicyForIndex(IndexMetaData indexMet } private void markPolicyDoesNotExist(String policyName, Index index, LifecycleExecutionState executionState) { - logger.debug("policy [{}] for index [{}] does not exist, recording this in step_info for this index", - policyName, index.getName()); - setStepInfo(index, policyName, getCurrentStepKey(executionState), - new SetStepInfoUpdateTask.ExceptionWrapper( - new IllegalArgumentException("policy [" + policyName + "] does not exist"))); + markPolicyRetrievalError(policyName, index, executionState, + new IllegalArgumentException("policy [" + policyName + "] does not exist")); + } + + private void markPolicyRetrievalError(String policyName, Index index, LifecycleExecutionState executionState, Exception e) { + logger.debug( + new ParameterizedMessage("unable to retrieve policy [{}] for index [{}], recording this in step_info for this index", + policyName, index.getName()), e); + setStepInfo(index, policyName, getCurrentStepKey(executionState), new SetStepInfoUpdateTask.ExceptionWrapper(e)); } } diff --git a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java index c285fed0cc930..20d1a0aed726a 100644 --- a/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java +++ b/x-pack/plugin/ilm/src/test/java/org/elasticsearch/xpack/ilm/IndexLifecycleRunnerTests.java @@ -72,6 +72,7 @@ import java.util.TreeMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiFunction; import java.util.function.Function; @@ -356,6 +357,84 @@ public void testRunStateChangePolicyWithNextStep() throws Exception { threadPool.shutdownNow(); } + public void testRunPeriodicPolicyWithFailureToReadPolicy() throws Exception { + doTestRunPolicyWithFailureToReadPolicy(false, true); + } + + public void testRunStateChangePolicyWithFailureToReadPolicy() throws Exception { + doTestRunPolicyWithFailureToReadPolicy(false, false); + } + + public void testRunAsyncActionPolicyWithFailureToReadPolicy() throws Exception { + doTestRunPolicyWithFailureToReadPolicy(true, false); + } + + public void doTestRunPolicyWithFailureToReadPolicy(boolean asyncAction, boolean periodicAction) throws Exception { + String policyName = "foo"; + StepKey stepKey = new StepKey("phase", "action", "cluster_state_action_step"); + StepKey nextStepKey = new StepKey("phase", "action", "next_cluster_state_action_step"); + MockClusterStateActionStep step = new MockClusterStateActionStep(stepKey, nextStepKey); + MockClusterStateActionStep nextStep = new MockClusterStateActionStep(nextStepKey, null); + MockPolicyStepsRegistry stepRegistry = createOneStepPolicyStepRegistry(policyName, step); + AtomicBoolean resolved = new AtomicBoolean(false); + stepRegistry.setResolver((i, k) -> { + resolved.set(true); + throw new IllegalArgumentException("fake failure retrieving step"); + }); + ThreadPool threadPool = new TestThreadPool("name"); + LifecycleExecutionState les = LifecycleExecutionState.builder() + .setPhase("phase") + .setAction("action") + .setStep("cluster_state_action_step") + .build(); + IndexMetaData indexMetaData = IndexMetaData.builder("test") + .settings(Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1) + .put(LifecycleSettings.LIFECYCLE_NAME, policyName)) + .putCustom(LifecycleExecutionState.ILM_CUSTOM_METADATA_KEY, les.asMap()) + .build(); + ClusterService clusterService = ClusterServiceUtils.createClusterService(threadPool); + DiscoveryNode node = clusterService.localNode(); + IndexLifecycleMetadata ilm = new IndexLifecycleMetadata(Collections.emptyMap(), OperationMode.RUNNING); + ClusterState state = ClusterState.builder(new ClusterName("cluster")) + .metaData(MetaData.builder() + .put(indexMetaData, true) + .putCustom(IndexLifecycleMetadata.TYPE, ilm)) + .nodes(DiscoveryNodes.builder() + .add(node) + .masterNodeId(node.getId()) + .localNodeId(node.getId())) + .build(); + ClusterServiceUtils.setState(clusterService, state); + long stepTime = randomLong(); + IndexLifecycleRunner runner = new IndexLifecycleRunner(stepRegistry, clusterService, threadPool, () -> stepTime); + + ClusterState before = clusterService.state(); + if (asyncAction) { + runner.maybeRunAsyncAction(before, indexMetaData, policyName, stepKey); + } else if (periodicAction) { + runner.runPeriodicStep(policyName, indexMetaData); + } else { + runner.runPolicyAfterStateChange(policyName, indexMetaData); + } + + // The cluster state can take a few extra milliseconds to update after the steps are executed + assertBusy(() -> assertNotEquals(before, clusterService.state())); + LifecycleExecutionState newExecutionState = LifecycleExecutionState + .fromIndexMetadata(clusterService.state().metaData().index(indexMetaData.getIndex())); + assertThat(newExecutionState.getPhase(), equalTo("phase")); + assertThat(newExecutionState.getAction(), equalTo("action")); + assertThat(newExecutionState.getStep(), equalTo("cluster_state_action_step")); + assertThat(step.getExecuteCount(), equalTo(0L)); + assertThat(nextStep.getExecuteCount(), equalTo(0L)); + assertThat(newExecutionState.getStepInfo(), + containsString("{\"type\":\"illegal_argument_exception\",\"reason\":\"fake failure retrieving step\"}")); + clusterService.close(); + threadPool.shutdownNow(); + } + public void testRunAsyncActionDoesNotRun() { String policyName = "foo"; StepKey stepKey = new StepKey("phase", "action", "async_action_step");