From 7b80f6060f0880280c92e8863ebe6a8e5a0d65f1 Mon Sep 17 00:00:00 2001 From: Cyrille Le Clerc Date: Sun, 2 Jan 2022 23:13:54 +0100 Subject: [PATCH] Use spanStatus "unset" rather than "error" when interruption cause is parallel fail fast --- README.md | 12 +++++- ...nkinsOpenTelemetryPluginConfiguration.java | 29 +++++++++---- .../job/MonitoringPipelineListener.java | 41 +++++++++++++++++-- .../job/MonitoringRunListener.java | 6 +-- .../JenkinsOtelSemanticAttributes.java | 2 + .../JenkinsOtelPluginIntegrationTest.java | 39 ++++++++++++++++++ 6 files changed, 112 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index fb3f5f835..565c9f7d0 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ In addition, if the backends were configured then there will be an environment v #### Attributes -##### Pipeline run root span +##### Pipeline, freestyle, and matrix project build spans | Attribute | Description | Type | |----------------------------------|--------------|------| @@ -138,7 +138,14 @@ In addition, if the backends were configured then there will be an environment v | ci.pipeline.parameter.name | Name of the parameters | String[] | | ci.pipeline.parameter.value | Value of the parameters. 
"Sensitive" values are redacted | String[] | -##### Spans +##### Pipeline step spans + +| Status Code | Status Description | Description | +|-------------|--------------------|-------------| +| OK | | For step and build success | +| UNSET | Machine readable status like `FlowInterruptedException: FailFastCause: Failed in branch failingBranch` | For steps interrupted by a fail-fast parallel branch failure, by a newer build superseding the pipeline build, or by a user cancelling the pipeline build, the span status is set to `UNSET` rather than `ERROR` for readability | +| ERROR | Machine readable status like `FlowInterruptedException: ExceededTimeout: Timeout has been exceeded` | For other causes of step failure | + | Attribute | Description | Type | |----------------------------------|--------------|------| @@ -148,6 +155,7 @@ In addition, if the backends were configured then there will be an environment v | jenkins.pipeline.step.plugin.name | Jenkins plugin for that particular step | String | | jenkins.pipeline.step.plugin.version| Jenkins plugin version | String | | jenkins.pipeline.step.agent.label | Labels attached to the agent | String | +| jenkins.pipeline.step.interruption.causes | List of machine readable causes of the interruption of the step like `FailFastCause: Failed in branch failingBranch`.

Common causes of interruption: `CanceledCause: Superseded by my-pipeline#123`, `ExceededTimeout: Timeout has been exceeded`, `FailFastCause: Failed in branch the-failing-branch`, `UserInterruption: Aborted by a-user` | String[] | | git.branch | Git branch name | String | | git.repository | Git repository | String | | git.username | Git user | String | diff --git a/src/main/java/io/jenkins/plugins/opentelemetry/JenkinsOpenTelemetryPluginConfiguration.java b/src/main/java/io/jenkins/plugins/opentelemetry/JenkinsOpenTelemetryPluginConfiguration.java index 5254bd304..8df382bfc 100644 --- a/src/main/java/io/jenkins/plugins/opentelemetry/JenkinsOpenTelemetryPluginConfiguration.java +++ b/src/main/java/io/jenkins/plugins/opentelemetry/JenkinsOpenTelemetryPluginConfiguration.java @@ -18,6 +18,7 @@ import io.jenkins.plugins.opentelemetry.semconv.JenkinsOtelSemanticAttributes; import io.jenkins.plugins.opentelemetry.semconv.OTelEnvironmentVariablesConventions; import io.opentelemetry.sdk.resources.Resource; +import jenkins.model.CauseOfInterruption; import jenkins.model.GlobalConfiguration; import jenkins.model.Jenkins; import net.sf.json.JSONObject; @@ -29,6 +30,7 @@ import org.jenkinsci.plugins.workflow.cps.nodes.StepStartNode; import org.jenkinsci.plugins.workflow.graph.FlowNode; import org.jenkinsci.plugins.workflow.steps.CoreStep; +import org.jenkinsci.plugins.workflow.support.steps.StageStepExecution; import org.kohsuke.stapler.DataBoundConstructor; import org.kohsuke.stapler.DataBoundSetter; import org.kohsuke.stapler.QueryParameter; @@ -42,15 +44,7 @@ import javax.inject.Inject; import java.io.IOException; import java.io.StringReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Properties; -import java.util.Set; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import 
java.util.concurrent.ConcurrentMap; import java.util.logging.Level; @@ -95,6 +89,19 @@ public class JenkinsOpenTelemetryPluginConfiguration extends GlobalConfiguration private String serviceNamespace; + /** + * Interruption causes for which the span status is set to UNSET rather than ERROR because they are external interruptions. + * + * TODO make this list configurable and accessible through {@link io.opentelemetry.sdk.autoconfigure.spi.ConfigProperties#getList(String)} + * @see CauseOfInterruption + * @see org.jenkinsci.plugins.workflow.steps.FlowInterruptedException + */ + private List statusUnsetCausesOfInterruption = Arrays.asList( + "org.jenkinsci.plugins.workflow.cps.steps.ParallelStep$FailFastCause", + StageStepExecution.CanceledCause.class.getName(), + CauseOfInterruption.UserInterruption.class.getName() + ); + /** * The previously used configuration. Kept in memory to prevent unneeded reconfigurations. */ @@ -242,6 +249,10 @@ public void setIgnoredSteps(String ignoredSteps) { this.ignoredSteps = ignoredSteps; } + public List getStatusUnsetCausesOfInterruption() { + return statusUnsetCausesOfInterruption; + } + public String getDisabledResourceProviders() { return disabledResourceProviders; } diff --git a/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringPipelineListener.java b/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringPipelineListener.java index 7ddbdf646..b06a1f742 100644 --- a/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringPipelineListener.java +++ b/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringPipelineListener.java @@ -30,6 +30,7 @@ import io.opentelemetry.context.Context; import io.opentelemetry.context.Scope; import io.opentelemetry.semconv.resource.attributes.ResourceAttributes; +import jenkins.model.CauseOfInterruption; import org.apache.commons.compress.utils.Sets; import org.jenkinsci.plugins.structs.SymbolLookup; import org.jenkinsci.plugins.structs.describable.UninstantiatedDescribable; @@ -42,6 +43,7 @@ 
import org.jenkinsci.plugins.workflow.graph.FlowNode; import org.jenkinsci.plugins.workflow.job.WorkflowRun; import org.jenkinsci.plugins.workflow.steps.CoreStep; +import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; import org.jenkinsci.plugins.workflow.steps.Step; import org.jenkinsci.plugins.workflow.steps.StepContext; import org.jenkinsci.plugins.workflow.steps.StepDescriptor; @@ -54,6 +56,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Objects; @@ -61,6 +64,7 @@ import java.util.function.Supplier; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Collectors; import static com.google.common.base.Verify.verifyNotNull; @@ -74,9 +78,16 @@ public class MonitoringPipelineListener extends AbstractPipelineListener impleme private Set ignoredSteps; private List stepHandlers; + /** + * Interruption causes for which the span status is set to UNSET rather than ERROR because they are external interruptions. 
+ */ + Set statusUnsetCausesOfInterruption; + @PostConstruct public void postConstruct() { - this.ignoredSteps = Sets.newHashSet(JenkinsOpenTelemetryPluginConfiguration.get().getIgnoredSteps().split(",")); + final JenkinsOpenTelemetryPluginConfiguration jenkinsOpenTelemetryPluginConfiguration = JenkinsOpenTelemetryPluginConfiguration.get(); + this.ignoredSteps = Sets.newHashSet(jenkinsOpenTelemetryPluginConfiguration.getIgnoredSteps().split(",")); + this.statusUnsetCausesOfInterruption = new HashSet<>(jenkinsOpenTelemetryPluginConfiguration.getStatusUnsetCausesOfInterruption()); } @Override @@ -292,8 +303,32 @@ private void endCurrentSpan(FlowNode node, WorkflowRun run) { span.setStatus(StatusCode.OK); } else { Throwable throwable = errorAction.getError(); - span.recordException(throwable); - span.setStatus(StatusCode.ERROR, throwable.getMessage()); + if (throwable instanceof FlowInterruptedException) { + FlowInterruptedException interruptedException = (FlowInterruptedException) throwable; + List causesOfInterruption = interruptedException.getCauses(); + + List causeDescriptions = causesOfInterruption.stream().map(cause -> cause.getClass().getSimpleName() + ": " + cause.getShortDescription()).collect(Collectors.toList()); + span.setAttribute(JenkinsOtelSemanticAttributes.JENKINS_STEP_INTERRUPTION_CAUSES, causeDescriptions); + + String statusDescription = throwable.getClass().getSimpleName() + ": " + causeDescriptions.stream().collect(Collectors.joining(", ")); + + boolean suppressSpanStatusCodeError = false; + for (CauseOfInterruption causeOfInterruption: causesOfInterruption) { + if (statusUnsetCausesOfInterruption.contains(causeOfInterruption.getClass().getName())) { + suppressSpanStatusCodeError = true; + break; + } + } + if (suppressSpanStatusCodeError) { + span.setStatus(StatusCode.UNSET, statusDescription); + } else { + span.recordException(throwable); + span.setStatus(StatusCode.ERROR, statusDescription); + } + } else { + span.recordException(throwable); + 
span.setStatus(StatusCode.ERROR, throwable.getMessage()); + } } span.end(); LOGGER.log(Level.FINE, () -> run.getFullDisplayName() + " - < " + node.getDisplayFunctionName() + " - end " + OtelUtils.toDebugString(span)); diff --git a/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringRunListener.java b/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringRunListener.java index 1948cf313..3336e03a8 100644 --- a/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringRunListener.java +++ b/src/main/java/io/jenkins/plugins/opentelemetry/job/MonitoringRunListener.java @@ -327,13 +327,13 @@ public void _onFinalized(@NonNull Run run) { parentSpan.setAttribute(JenkinsOtelSemanticAttributes.CI_PIPELINE_RUN_RESULT, Objects.toString(runResult, null)); if (Result.SUCCESS.equals(runResult)) { - parentSpan.setStatus(StatusCode.OK); + parentSpan.setStatus(StatusCode.OK, runResult.toString()); } else if (Result.FAILURE.equals(runResult) || Result.UNSTABLE.equals(runResult)){ parentSpan.setAttribute(SemanticAttributes.EXCEPTION_TYPE, "PIPELINE_" + runResult); parentSpan.setAttribute(SemanticAttributes.EXCEPTION_MESSAGE, "PIPELINE_" + runResult); - parentSpan.setStatus(StatusCode.ERROR); + parentSpan.setStatus(StatusCode.ERROR, runResult.toString()); } else if (Result.ABORTED.equals(runResult) || Result.NOT_BUILT.equals(runResult)) { - parentSpan.setStatus(StatusCode.UNSET); + parentSpan.setStatus(StatusCode.UNSET, runResult.toString()); } } // NODE diff --git a/src/main/java/io/jenkins/plugins/opentelemetry/semconv/JenkinsOtelSemanticAttributes.java b/src/main/java/io/jenkins/plugins/opentelemetry/semconv/JenkinsOtelSemanticAttributes.java index de8464bce..84501368f 100644 --- a/src/main/java/io/jenkins/plugins/opentelemetry/semconv/JenkinsOtelSemanticAttributes.java +++ b/src/main/java/io/jenkins/plugins/opentelemetry/semconv/JenkinsOtelSemanticAttributes.java @@ -85,6 +85,8 @@ public final class JenkinsOtelSemanticAttributes { public static final AttributeKey 
JENKINS_STEP_AGENT_LABEL = AttributeKey.stringKey("jenkins.pipeline.step.agent.label"); + public static final AttributeKey> JENKINS_STEP_INTERRUPTION_CAUSES = AttributeKey.stringArrayKey("jenkins.pipeline.step.interruption.causes"); + public static final String JENKINS = "jenkins"; /** diff --git a/src/test/java/io/jenkins/plugins/opentelemetry/JenkinsOtelPluginIntegrationTest.java b/src/test/java/io/jenkins/plugins/opentelemetry/JenkinsOtelPluginIntegrationTest.java index 830ab0db3..d0e2ae742 100644 --- a/src/test/java/io/jenkins/plugins/opentelemetry/JenkinsOtelPluginIntegrationTest.java +++ b/src/test/java/io/jenkins/plugins/opentelemetry/JenkinsOtelPluginIntegrationTest.java @@ -14,11 +14,14 @@ import io.jenkins.plugins.opentelemetry.semconv.JenkinsOtelSemanticAttributes; import io.jenkins.plugins.opentelemetry.semconv.JenkinsSemanticMetrics; import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.trace.StatusCode; import io.opentelemetry.sdk.metrics.data.LongPointData; import io.opentelemetry.sdk.metrics.data.MetricData; import io.opentelemetry.sdk.metrics.data.MetricDataType; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricExporterProvider; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricExporterUtils; +import io.opentelemetry.sdk.trace.data.SpanData; +import io.opentelemetry.sdk.trace.data.StatusData; import org.apache.commons.lang3.SystemUtils; import org.hamcrest.CoreMatchers; import org.hamcrest.MatcherAssert; @@ -29,6 +32,7 @@ import org.junit.Test; import org.jvnet.hudson.test.recipes.WithPlugin; +import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; @@ -462,4 +466,39 @@ public void testPipelineWithoutCheckoutShallowSteps() throws Exception { MatcherAssert.assertThat(attributes.get(JenkinsOtelSemanticAttributes.GIT_CLONE_SHALLOW), CoreMatchers.is(false)); MatcherAssert.assertThat(attributes.get(JenkinsOtelSemanticAttributes.GIT_CLONE_DEPTH), CoreMatchers.is(0L)); } + 
+ @Test + public void testFailFastParallelScriptedPipelineWithException() throws Exception { + assumeFalse(SystemUtils.IS_OS_WINDOWS); + String jobName = "fail-fast-parallel-scripted-pipeline-with-failure" + jobNameSuffix.incrementAndGet(); + + String pipelineScript = "node() {\n" + + " stage('ze-parallel-stage') {\n" + + " parallel failingBranch: {\n" + + " error 'the failure that will cause the interruption of other branches'\n" + + " }, branchThatWillBeInterrupted: {\n" + + " sleep 5\n" + + " }, failFast:true\n" + + " }\n" + + "}"; + Node agent = jenkinsRule.createOnlineSlave(); + WorkflowJob pipeline = jenkinsRule.createProject(WorkflowJob.class, jobName); + pipeline.setDefinition(new CpsFlowDefinition(pipelineScript, true)); + WorkflowRun build = jenkinsRule.assertBuildStatus(Result.FAILURE, pipeline.scheduleBuild2(0)); + + Tree spans = getGeneratedSpans(); + checkChainOfSpans(spans, "sleep", "Parallel branch: branchThatWillBeInterrupted", "Stage: ze-parallel-stage", JenkinsOtelSemanticAttributes.AGENT_UI, "Phase: Run"); + + SpanData sleepSpanData = spans.breadthFirstSearchNodes(node -> "sleep".equals(node.getData().spanData.getName())).get().getData().spanData; + MatcherAssert.assertThat(sleepSpanData.getStatus().getStatusCode(), CoreMatchers.is(StatusCode.UNSET)); + + SpanData branchThatWillBeInterruptedSpanData = spans.breadthFirstSearchNodes(node -> "Parallel branch: branchThatWillBeInterrupted".equals(node.getData().spanData.getName())).get().getData().spanData; + MatcherAssert.assertThat(branchThatWillBeInterruptedSpanData.getStatus().getStatusCode(), CoreMatchers.is(StatusCode.UNSET)); + MatcherAssert.assertThat(branchThatWillBeInterruptedSpanData.getStatus().getDescription(), CoreMatchers.is("FlowInterruptedException: FailFastCause: Failed in branch failingBranch")); + MatcherAssert.assertThat(branchThatWillBeInterruptedSpanData.getAttributes().get(JenkinsOtelSemanticAttributes.JENKINS_STEP_INTERRUPTION_CAUSES), 
CoreMatchers.is(Arrays.asList("FailFastCause: Failed in branch failingBranch"))); + + SpanData failingBranchSpanData = spans.breadthFirstSearchNodes(node -> "Parallel branch: failingBranch".equals(node.getData().spanData.getName())).get().getData().spanData; + MatcherAssert.assertThat(failingBranchSpanData.getStatus().getStatusCode(), CoreMatchers.is(StatusCode.ERROR)); + MatcherAssert.assertThat(failingBranchSpanData.getStatus().getDescription(), CoreMatchers.is("the failure that will cause the interruption of other branches")); + } }