Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reset job if existing reset fails #106020

Merged
merged 5 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/106020.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 106020
summary: Fix resetting a job if the original reset task no longer exists.
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ void getFinishedTaskFromIndex(Task thisTask, GetTaskRequest request, ActionListe

client.get(get, ActionListener.wrap(r -> onGetFinishedTaskFromIndex(r, listener), e -> {
if (ExceptionsHelper.unwrap(e, IndexNotFoundException.class) != null) {
// We haven't yet created the index for the task results so it can't be found.
// We haven't yet created the index for the task results, so it can't be found.
listener.onFailure(
new ResourceNotFoundException("task [{}] isn't running and hasn't stored its results", e, request.getTaskId())
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
package org.elasticsearch.xpack.ml.integration;

import org.elasticsearch.core.TimeValue;
import org.elasticsearch.tasks.TaskId;
import org.elasticsearch.xpack.core.ml.job.config.AnalysisConfig;
import org.elasticsearch.xpack.core.ml.job.config.Blocked;
import org.elasticsearch.xpack.core.ml.job.config.DataDescription;
import org.elasticsearch.xpack.core.ml.job.config.Detector;
import org.elasticsearch.xpack.core.ml.job.config.Job;
import org.elasticsearch.xpack.core.ml.job.config.JobUpdate;
import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.DataCounts;
import org.elasticsearch.xpack.core.ml.job.results.Bucket;
import org.junit.After;
Expand All @@ -34,10 +36,18 @@ public void tearDownData() {
}

/**
 * Runs the shared reset scenario with {@code previousResetFailed} set to {@code false}, i.e. without
 * first marking the job as blocked by an earlier reset task.
 */
public void testReset() throws Exception {
testReset(false);
}

/**
 * Runs the shared reset scenario with {@code previousResetFailed} set to {@code true}: before the reset
 * is requested, the job is updated to be blocked with reason RESET by a randomly generated task id.
 */
public void testReset_previousResetFailed() throws Exception {
testReset(true);
}

private void testReset(boolean previousResetFailed) throws Exception {
TimeValue bucketSpan = TimeValue.timeValueMinutes(30);
long startTime = 1514764800000L;
final int bucketCount = 100;
Job.Builder job = createJob("test-reset", bucketSpan);
Job.Builder job = createJob("test-reset-" + previousResetFailed, bucketSpan);

openJob(job.getId());
postData(
Expand All @@ -53,6 +63,13 @@ public void testReset() throws Exception {
DataCounts dataCounts = getJobStats(job.getId()).get(0).getDataCounts();
assertThat(dataCounts.getProcessedRecordCount(), greaterThan(0L));

if (previousResetFailed) {
JobUpdate jobUpdate = new JobUpdate.Builder(job.getId()).setBlocked(
new Blocked(Blocked.Reason.RESET, new TaskId(randomIdentifier(), randomInt()))
).build();
updateJob(job.getId(), jobUpdate);
}

resetJob(job.getId());

buckets = getBuckets(job.getId());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskAction;
import org.elasticsearch.action.admin.cluster.node.tasks.get.GetTaskRequest;
Expand Down Expand Up @@ -124,7 +125,14 @@ protected void masterOperation(
waitExistingResetTaskToComplete(
job.getBlocked().getTaskId(),
request,
ActionListener.wrap(r -> resetIfJobIsStillBlockedOnReset(task, request, listener), listener::onFailure)
ActionListener.wrap(r -> resetIfJobIsStillBlockedOnReset(task, request, listener), e -> {
if (ExceptionsHelper.unwrapCause(e) instanceof ResourceNotFoundException) {
// If the task is not found then the node it was running on likely died, so try again.
resetIfJobIsStillBlockedOnReset(task, request, listener);
} else {
listener.onFailure(e);
}
})
);
} else {
ParentTaskAssigningClient taskClient = new ParentTaskAssigningClient(client, taskId);
Expand Down
Loading