diff --git a/pom.xml b/pom.xml index eabd17e1c..3e1a17e42 100644 --- a/pom.xml +++ b/pom.xml @@ -161,6 +161,12 @@ 1.2 test + + org.jenkins-ci.plugins + mock-slave + 153.v9768799a_2294 + test + org.awaitility awaitility diff --git a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java index 4b56cab03..1a92ea374 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java @@ -367,7 +367,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab } else { LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired"); listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); - throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeCause()); + throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } } removedNodeDiscovered = 0; // something else; reset diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/pickles/ExecutorPickle.java b/src/main/java/org/jenkinsci/plugins/workflow/support/pickles/ExecutorPickle.java index bd9756edb..b1c984908 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/pickles/ExecutorPickle.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/pickles/ExecutorPickle.java @@ -122,7 +122,7 @@ protected Executor tryResolve() throws Exception { Queue.getInstance().cancel(item); owner.getListener().getLogger().printf("Killed %s after waiting for %s because we assume unknown agent %s is never going to appear%n", item.task.getDisplayName(), Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS), placeholder.getAssignedLabel()); - throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeCause()); + throw new FlowInterruptedException(Result.ABORTED, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } } } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/AgentErrorCondition.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/AgentErrorCondition.java index 129e7b092..d858ecfe7 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/AgentErrorCondition.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/AgentErrorCondition.java @@ -36,6 +36,7 @@ import java.io.IOException; import java.nio.channels.ClosedChannelException; import java.util.stream.Stream; +import jenkins.model.CauseOfInterruption; import org.jenkinsci.Symbol; import org.jenkinsci.plugins.workflow.flow.ErrorCondition; import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; @@ -57,8 +58,7 @@ public final class AgentErrorCondition extends ErrorCondition { if (t instanceof AgentOfflineException) { return true; } - if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch( - c -> c instanceof ExecutorStepExecution.RemovedNodeCause || c instanceof ExecutorStepExecution.QueueTaskCancelled)) { + if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(Retryable.class::isInstance)) { return true; } if (isClosedChannelException(t)) { @@ -90,6 +90,11 @@ private static boolean isClosedChannelException(Throwable t) { } } + /** + * A marker interface for {@link CauseOfInterruption} instances that can be retried through {@link AgentErrorCondition}. + */ + public interface Retryable {} + @Symbol("agent") @Extension public static final class DescriptorImpl extends ErrorConditionDescriptor { diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java index 5c0f6c35c..a2757c822 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java @@ -107,7 +107,7 @@ void resume(StepContext context) throws Exception { exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); } catch (TimeoutException x) { listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); - throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeCause()); + throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } catch (CancellationException x) { LOGGER.log(Level.FINE, "ceased to wait for " + node, x); throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.QueueTaskCancelled()); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index 074188328..cd1203cbb 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -26,6 +26,7 @@ import hudson.model.ResourceList; import hudson.model.Result; import hudson.model.Run; +import hudson.model.Slave; import hudson.model.TaskListener; import hudson.model.TopLevelItem; import hudson.model.User; @@ -38,6 +39,7 @@ import hudson.security.ACLContext; import hudson.security.AccessControlled; import hudson.security.Permission; +import hudson.slaves.AbstractCloudSlave; import hudson.slaves.OfflineCause; import hudson.slaves.WorkspaceList; import java.io.IOException; @@ -74,6 +76,7 @@ import org.acegisecurity.Authentication; import org.jenkinsci.plugins.durabletask.executors.ContinuableExecutable; import org.jenkinsci.plugins.durabletask.executors.ContinuedTask; +import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy; import org.jenkinsci.plugins.workflow.actions.LabelAction; import org.jenkinsci.plugins.workflow.actions.QueueItemAction; import org.jenkinsci.plugins.workflow.actions.ThreadNameAction; @@ -334,7 +337,7 @@ public void stop(@NonNull Throwable cause) throws Exception { } - public static final class QueueTaskCancelled extends CauseOfInterruption { + public static final class QueueTaskCancelled extends RetryableCauseOfInterruption { @Override public String getShortDescription() { return Messages.ExecutorStepExecution_queue_task_cancelled(); } @@ -346,44 +349,57 @@ public static final class QueueTaskCancelled extends CauseOfInterruption { return; } LOGGER.fine(() -> "received node deletion event on " + node.getNodeName()); - Timer.get().schedule(() -> { - Computer c = node.toComputer(); - if (c == null || c.isOnline()) { - LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping"); - return; - } - LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName()); - for (Executor e : c.getExecutors()) { - Queue.Executable exec = e.getCurrentExecutable(); - if (exec instanceof PlaceholderTask.PlaceholderExecutable) { - PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent(); - TaskListener listener; - try { - listener = task.context.get(TaskListener.class); - } catch (Exception x) { - LOGGER.log(Level.WARNING, null, x); - continue; - } - task.withExecution(execution -> { - BodyExecution body = execution.body; - if (body == null) { - listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel"); - return; - } - listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body"); - if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) { - body.cancel(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); - } else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed - body.cancel(new RemovedNodeCause()); - } - }); + if (isOneShotAgent(node)) { + LOGGER.fine(() -> "Cancelling owner run for one-shot agent " + node.getNodeName() + " immediately"); + cancelOwnerExecution(node, new RemovedNodeCause()); + } else { + LOGGER.fine(() -> "Will cancel owner run for agent " + node.getNodeName() + " after waiting for " + TIMEOUT_WAITING_FOR_NODE_MILLIS + "ms"); + Timer.get().schedule(() -> cancelOwnerExecution(node, new RemovedNodeCause()), TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); + } + } + + private static boolean isOneShotAgent(Node node) { + return node instanceof AbstractCloudSlave || + (node instanceof Slave && ((Slave) node).getRetentionStrategy() instanceof OnceRetentionStrategy); + } + + private static void cancelOwnerExecution(Node node, CauseOfInterruption... causes) { + Computer c = node.toComputer(); + if (c == null || c.isOnline()) { + LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping"); + return; + } + LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName()); + for (Executor e : c.getExecutors()) { + Queue.Executable exec = e.getCurrentExecutable(); + if (exec instanceof PlaceholderTask.PlaceholderExecutable) { + PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent(); + TaskListener listener; + try { + listener = task.context.get(TaskListener.class); + } catch (Exception x) { + LOGGER.log(Level.WARNING, null, x); + continue; } + task.withExecution(execution -> { + BodyExecution body = execution.body; + if (body == null) { + listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel"); + return; + } + listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body"); + if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) { + body.cancel(new FlowInterruptedException(Result.ABORTED, false, causes)); + } else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed + body.cancel(causes); + } + }); } - }, TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); + } } } - public static final class RemovedNodeCause extends CauseOfInterruption { + public static final class RemovedNodeCause extends RetryableCauseOfInterruption { @SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "deliberately mutable") public static boolean ENABLED = Boolean.parseBoolean(System.getProperty(ExecutorStepExecution.class.getName() + ".REMOVED_NODE_DETECTION", "true")); @Override public String getShortDescription() { @@ -391,6 +407,17 @@ public static final class RemovedNodeCause extends CauseOfInterruption { } } + public static final class RemovedNodeTimeoutCause extends RetryableCauseOfInterruption { + @Override public String getShortDescription() { + return "Timeout waiting for agent to come back"; + } + } + + /** + * Base class for a cause of interruption that can be retried via {@link AgentErrorCondition}. + */ + private abstract static class RetryableCauseOfInterruption extends CauseOfInterruption implements AgentErrorCondition.Retryable {} + /** Transient handle of a running executor task. */ private static final class RunningTask { /** null until placeholder executable runs */ diff --git a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java index 6db3d29df..0b05323b2 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java @@ -24,33 +24,38 @@ package org.jenkinsci.plugins.workflow.support.steps; +import static org.awaitility.Awaitility.await; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.anyOf; +import static org.hamcrest.Matchers.arrayWithSize; +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.emptyArray; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.isA; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + import hudson.model.Label; import hudson.model.Queue; import hudson.model.Result; import hudson.slaves.DumbSlave; import hudson.slaves.RetentionStrategy; import java.io.File; +import java.time.Duration; import java.util.ArrayList; import java.util.List; import java.util.logging.Level; import jenkins.model.InterruptedBuildAction; -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.contains; -import static org.hamcrest.Matchers.emptyArray; -import static org.hamcrest.Matchers.isA; -import static org.hamcrest.Matchers.anyOf; +import org.jenkinci.plugins.mock_slave.MockCloud; +import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy; import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition; import org.jenkinsci.plugins.workflow.flow.FlowExecutionList; import org.jenkinsci.plugins.workflow.job.WorkflowJob; import org.jenkinsci.plugins.workflow.job.WorkflowRun; import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; -import org.junit.rules.TemporaryFolder; import org.jvnet.hudson.test.BuildWatcher; import org.jvnet.hudson.test.Issue; import org.jvnet.hudson.test.JenkinsSessionRule; @@ -60,9 +65,12 @@ public class ExecutorStepDynamicContextTest { @ClassRule public static BuildWatcher buildWatcher = new BuildWatcher(); @Rule public JenkinsSessionRule sessions = new JenkinsSessionRule(); - @Rule public TemporaryFolder tmp = new TemporaryFolder(); @Rule public LoggerRule logging = new LoggerRule(); + private void commonSetup() { + logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); + } + @Test public void canceledQueueItem() throws Throwable { sessions.then(j -> { DumbSlave s = j.createSlave(Label.get("remote")); @@ -75,11 +83,7 @@ public class ExecutorStepDynamicContextTest { sessions.then(j -> { SemaphoreStep.success("wait/1", null); WorkflowRun b = j.jenkins.getItemByFullName("p", WorkflowJob.class).getBuildByNumber(1); - while (Queue.getInstance().getItems().length == 0) { - Thread.sleep(100); - } - Queue.Item[] items = Queue.getInstance().getItems(); - assertEquals(1, items.length); + var items = await().timeout(Duration.ofMinutes(1)).until(() -> j.jenkins.getQueue().getItems(), arrayWithSize(1)); Queue.getInstance().cancel(items[0]); j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b)); InterruptedBuildAction iba = b.getAction(InterruptedBuildAction.class); @@ -98,7 +102,7 @@ public class ExecutorStepDynamicContextTest { */ @Issue("JENKINS-36013") @Test public void normalNodeDisappearance() throws Throwable { - logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); + commonSetup(); sessions.then(j -> { // Start up a build that needs executor and then reboot and take the node offline // Starting job first ensures we don't immediately fail if Node comes from a Cloud @@ -114,20 +118,19 @@ public class ExecutorStepDynamicContextTest { sessions.then(j -> { // Start up a build and then reboot and take the node offline assertEquals(0, j.jenkins.getLabel("ghost").getNodes().size()); // Make sure test impl is correctly deleted - assertNull(j.jenkins.getNode("ghost")); // Make sure test impl is correctly deleted WorkflowRun run = j.jenkins.getItemByFullName("p", WorkflowJob.class).getLastBuild(); j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); j.assertLogContains("slave0 has been removed for ", run); assertThat(j.jenkins.getQueue().getItems(), emptyArray()); InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); assertNotNull(iba); - assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); + assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeTimeoutCause.class))); }); } @Issue("JENKINS-36013") @Test public void parallelNodeDisappearance() throws Throwable { - logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); + commonSetup(); sessions.then(j -> { WorkflowJob p = j.createProject(WorkflowJob.class, "p"); p.setDefinition(new CpsFlowDefinition("def bs = [:]; for (int _i = 0; _i < 5; _i++) {def i = _i; bs[/b$i/] = {node('remote') {semaphore(/s$i/)}}}; parallel bs", true)); @@ -207,4 +210,43 @@ public class ExecutorStepDynamicContextTest { }); } + @Test public void onceRetentionStrategyNodeDisappearance() throws Throwable { + commonSetup(); + sessions.then(j -> { + DumbSlave s = j.createSlave(Label.get("ghost")); + s.setRetentionStrategy(new OnceRetentionStrategy(0)); + WorkflowJob p = j.createProject(WorkflowJob.class, "p"); + p.setDefinition(new CpsFlowDefinition("node('ghost') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true)); + var run = p.scheduleBuild2(0).waitForStart(); + j.waitForMessage("+ sleep infinity", run); + j.jenkins.removeNode(s); + j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); + assertThat(j.jenkins.getQueue().getItems(), emptyArray()); + InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); + assertNotNull(iba); + assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); + }); + } + + @Test public void cloudNodeDisappearance() throws Throwable { + commonSetup(); + sessions.then(j -> { + var mockCloud = new MockCloud("mock"); + mockCloud.setLabels("mock"); + j.jenkins.clouds.add(mockCloud); + WorkflowJob p = j.createProject(WorkflowJob.class, "p"); + p.setDefinition(new CpsFlowDefinition("node('mock') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true)); + WorkflowRun run = p.scheduleBuild2(0).waitForStart(); + j.waitForMessage("+ sleep infinity", run); + var mockNodes = j.jenkins.getLabel("mock").getNodes(); + assertThat(mockNodes, hasSize(1)); + var mockNode = mockNodes.iterator().next(); + j.jenkins.removeNode(mockNode); + j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); + assertThat(j.jenkins.getQueue().getItems(), emptyArray()); + InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); + assertNotNull(iba); + assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); + }); + } }