-
-
Notifications
You must be signed in to change notification settings - Fork 107
Fail fast on cloud node removal #372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6014dde
a68e629
17b1e37
f23bf96
2e02220
118d49e
5753b5d
c4e7a10
04e044e
b62ed6f
5a8617e
797c108
c81e290
04e4192
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
| import hudson.model.ResourceList; | ||
| import hudson.model.Result; | ||
| import hudson.model.Run; | ||
| import hudson.model.Slave; | ||
| import hudson.model.TaskListener; | ||
| import hudson.model.TopLevelItem; | ||
| import hudson.model.User; | ||
|
|
@@ -38,6 +39,7 @@ | |
| import hudson.security.ACLContext; | ||
| import hudson.security.AccessControlled; | ||
| import hudson.security.Permission; | ||
| import hudson.slaves.AbstractCloudSlave; | ||
| import hudson.slaves.OfflineCause; | ||
| import hudson.slaves.WorkspaceList; | ||
| import java.io.IOException; | ||
|
|
@@ -74,6 +76,7 @@ | |
| import org.acegisecurity.Authentication; | ||
| import org.jenkinsci.plugins.durabletask.executors.ContinuableExecutable; | ||
| import org.jenkinsci.plugins.durabletask.executors.ContinuedTask; | ||
| import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy; | ||
| import org.jenkinsci.plugins.workflow.actions.LabelAction; | ||
| import org.jenkinsci.plugins.workflow.actions.QueueItemAction; | ||
| import org.jenkinsci.plugins.workflow.actions.ThreadNameAction; | ||
|
|
@@ -334,7 +337,7 @@ public void stop(@NonNull Throwable cause) throws Exception { | |
|
|
||
| } | ||
|
|
||
| public static final class QueueTaskCancelled extends CauseOfInterruption { | ||
| public static final class QueueTaskCancelled extends RetryableCauseOfInterruption { | ||
| @Override public String getShortDescription() { | ||
| return Messages.ExecutorStepExecution_queue_task_cancelled(); | ||
| } | ||
|
|
@@ -346,51 +349,75 @@ public static final class QueueTaskCancelled extends CauseOfInterruption { | |
| return; | ||
| } | ||
| LOGGER.fine(() -> "received node deletion event on " + node.getNodeName()); | ||
| Timer.get().schedule(() -> { | ||
| Computer c = node.toComputer(); | ||
| if (c == null || c.isOnline()) { | ||
| LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping"); | ||
| return; | ||
| } | ||
| LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName()); | ||
| for (Executor e : c.getExecutors()) { | ||
| Queue.Executable exec = e.getCurrentExecutable(); | ||
| if (exec instanceof PlaceholderTask.PlaceholderExecutable) { | ||
| PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent(); | ||
| TaskListener listener; | ||
| try { | ||
| listener = task.context.get(TaskListener.class); | ||
| } catch (Exception x) { | ||
| LOGGER.log(Level.WARNING, null, x); | ||
| continue; | ||
| } | ||
| task.withExecution(execution -> { | ||
| BodyExecution body = execution.body; | ||
| if (body == null) { | ||
| listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel"); | ||
| return; | ||
| } | ||
| listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body"); | ||
| if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) { | ||
| body.cancel(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); | ||
| } else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed | ||
| body.cancel(new RemovedNodeCause()); | ||
| } | ||
| }); | ||
| if (isOneShotAgent(node)) { | ||
| LOGGER.fine(() -> "Cancelling owner run for one-shot agent " + node.getNodeName() + " immediately"); | ||
| cancelOwnerExecution(node, new RemovedNodeCause()); | ||
| } else { | ||
| LOGGER.fine(() -> "Will cancel owner run for agent " + node.getNodeName() + " after waiting for " + TIMEOUT_WAITING_FOR_NODE_MILLIS + "ms"); | ||
| Timer.get().schedule(() -> cancelOwnerExecution(node, new RemovedNodeCause()), TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); | ||
| } | ||
| } | ||
|
|
||
| private static boolean isOneShotAgent(Node node) { | ||
| return node instanceof AbstractCloudSlave || | ||
| (node instanceof Slave && ((Slave) node).getRetentionStrategy() instanceof OnceRetentionStrategy); | ||
|
Comment on lines
+362
to
+363
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately this heuristic does not match
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we start introducing a marker interface for this usage?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe, though I think it would suffice for
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| } | ||
|
|
||
| private static void cancelOwnerExecution(Node node, CauseOfInterruption... causes) { | ||
| Computer c = node.toComputer(); | ||
| if (c == null || c.isOnline()) { | ||
| LOGGER.fine(() -> "computer for " + node.getNodeName() + " was missing or online, skipping"); | ||
| return; | ||
| } | ||
| LOGGER.fine(() -> "processing node deletion event on " + node.getNodeName()); | ||
| for (Executor e : c.getExecutors()) { | ||
| Queue.Executable exec = e.getCurrentExecutable(); | ||
| if (exec instanceof PlaceholderTask.PlaceholderExecutable) { | ||
| PlaceholderTask task = ((PlaceholderTask.PlaceholderExecutable) exec).getParent(); | ||
| TaskListener listener; | ||
| try { | ||
| listener = task.context.get(TaskListener.class); | ||
| } catch (Exception x) { | ||
| LOGGER.log(Level.WARNING, null, x); | ||
| continue; | ||
| } | ||
| task.withExecution(execution -> { | ||
| BodyExecution body = execution.body; | ||
| if (body == null) { | ||
| listener.getLogger().println("Agent " + node.getNodeName() + " was deleted, but do not have a node body to cancel"); | ||
| return; | ||
| } | ||
| listener.getLogger().println("Agent " + node.getNodeName() + " was deleted; cancelling node body"); | ||
| if (Util.isOverridden(BodyExecution.class, body.getClass(), "cancel", Throwable.class)) { | ||
| body.cancel(new FlowInterruptedException(Result.ABORTED, false, causes)); | ||
| } else { // TODO remove once https://github.com/jenkinsci/workflow-cps-plugin/pull/570 is widely deployed | ||
| body.cancel(causes); | ||
| } | ||
| }); | ||
| } | ||
| }, TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| public static final class RemovedNodeCause extends CauseOfInterruption { | ||
| public static final class RemovedNodeCause extends RetryableCauseOfInterruption { | ||
| @SuppressFBWarnings(value = "MS_SHOULD_BE_FINAL", justification = "deliberately mutable") | ||
| public static boolean ENABLED = Boolean.parseBoolean(System.getProperty(ExecutorStepExecution.class.getName() + ".REMOVED_NODE_DETECTION", "true")); | ||
| @Override public String getShortDescription() { | ||
| return "Agent was removed"; | ||
| } | ||
| } | ||
|
|
||
| public static final class RemovedNodeTimeoutCause extends RetryableCauseOfInterruption { | ||
| @Override public String getShortDescription() { | ||
| return "Timeout waiting for agent to come back"; | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Base class for a cause of interruption that can be retried via {@link AgentErrorCondition}. | ||
| */ | ||
| private abstract static class RetryableCauseOfInterruption extends CauseOfInterruption implements AgentErrorCondition.Retryable {} | ||
|
|
||
| /** Transient handle of a running executor task. */ | ||
| private static final class RunningTask { | ||
| /** null until placeholder executable runs */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,33 +24,38 @@ | |
|
|
||
| package org.jenkinsci.plugins.workflow.support.steps; | ||
|
|
||
| import static org.awaitility.Awaitility.await; | ||
| import static org.hamcrest.MatcherAssert.assertThat; | ||
| import static org.hamcrest.Matchers.anyOf; | ||
| import static org.hamcrest.Matchers.arrayWithSize; | ||
| import static org.hamcrest.Matchers.contains; | ||
| import static org.hamcrest.Matchers.emptyArray; | ||
| import static org.hamcrest.Matchers.hasSize; | ||
| import static org.hamcrest.Matchers.isA; | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertNotNull; | ||
|
|
||
| import hudson.model.Label; | ||
| import hudson.model.Queue; | ||
| import hudson.model.Result; | ||
| import hudson.slaves.DumbSlave; | ||
| import hudson.slaves.RetentionStrategy; | ||
| import java.io.File; | ||
| import java.time.Duration; | ||
| import java.util.ArrayList; | ||
| import java.util.List; | ||
| import java.util.logging.Level; | ||
| import jenkins.model.InterruptedBuildAction; | ||
| import static org.hamcrest.MatcherAssert.assertThat; | ||
| import static org.hamcrest.Matchers.contains; | ||
| import static org.hamcrest.Matchers.emptyArray; | ||
| import static org.hamcrest.Matchers.isA; | ||
| import static org.hamcrest.Matchers.anyOf; | ||
| import org.jenkinci.plugins.mock_slave.MockCloud; | ||
| import org.jenkinsci.plugins.durabletask.executors.OnceRetentionStrategy; | ||
| import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition; | ||
| import org.jenkinsci.plugins.workflow.flow.FlowExecutionList; | ||
| import org.jenkinsci.plugins.workflow.job.WorkflowJob; | ||
| import org.jenkinsci.plugins.workflow.job.WorkflowRun; | ||
| import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep; | ||
| import static org.junit.Assert.assertEquals; | ||
| import static org.junit.Assert.assertNotNull; | ||
| import static org.junit.Assert.assertNull; | ||
| import org.junit.ClassRule; | ||
| import org.junit.Rule; | ||
| import org.junit.Test; | ||
| import org.junit.rules.TemporaryFolder; | ||
| import org.jvnet.hudson.test.BuildWatcher; | ||
| import org.jvnet.hudson.test.Issue; | ||
| import org.jvnet.hudson.test.JenkinsSessionRule; | ||
|
|
@@ -60,9 +65,12 @@ public class ExecutorStepDynamicContextTest { | |
|
|
||
| @ClassRule public static BuildWatcher buildWatcher = new BuildWatcher(); | ||
| @Rule public JenkinsSessionRule sessions = new JenkinsSessionRule(); | ||
| @Rule public TemporaryFolder tmp = new TemporaryFolder(); | ||
| @Rule public LoggerRule logging = new LoggerRule(); | ||
|
|
||
| private void commonSetup() { | ||
| logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); | ||
| } | ||
|
|
||
| @Test public void canceledQueueItem() throws Throwable { | ||
| sessions.then(j -> { | ||
| DumbSlave s = j.createSlave(Label.get("remote")); | ||
|
|
@@ -75,11 +83,7 @@ public class ExecutorStepDynamicContextTest { | |
| sessions.then(j -> { | ||
| SemaphoreStep.success("wait/1", null); | ||
| WorkflowRun b = j.jenkins.getItemByFullName("p", WorkflowJob.class).getBuildByNumber(1); | ||
| while (Queue.getInstance().getItems().length == 0) { | ||
| Thread.sleep(100); | ||
| } | ||
| Queue.Item[] items = Queue.getInstance().getItems(); | ||
| assertEquals(1, items.length); | ||
| var items = await().timeout(Duration.ofMinutes(1)).until(() -> j.jenkins.getQueue().getItems(), arrayWithSize(1)); | ||
| Queue.getInstance().cancel(items[0]); | ||
| j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b)); | ||
| InterruptedBuildAction iba = b.getAction(InterruptedBuildAction.class); | ||
|
|
@@ -98,7 +102,7 @@ public class ExecutorStepDynamicContextTest { | |
| */ | ||
| @Issue("JENKINS-36013") | ||
| @Test public void normalNodeDisappearance() throws Throwable { | ||
| logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); | ||
| commonSetup(); | ||
| sessions.then(j -> { | ||
| // Start up a build that needs executor and then reboot and take the node offline | ||
| // Starting job first ensures we don't immediately fail if Node comes from a Cloud | ||
|
|
@@ -114,20 +118,19 @@ public class ExecutorStepDynamicContextTest { | |
| sessions.then(j -> { | ||
| // Start up a build and then reboot and take the node offline | ||
| assertEquals(0, j.jenkins.getLabel("ghost").getNodes().size()); // Make sure test impl is correctly deleted | ||
| assertNull(j.jenkins.getNode("ghost")); // Make sure test impl is correctly deleted | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Useless assertion, since the node would just have the label |
||
| WorkflowRun run = j.jenkins.getItemByFullName("p", WorkflowJob.class).getLastBuild(); | ||
| j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); | ||
| j.assertLogContains("slave0 has been removed for ", run); | ||
| assertThat(j.jenkins.getQueue().getItems(), emptyArray()); | ||
| InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); | ||
| assertNotNull(iba); | ||
| assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); | ||
| assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeTimeoutCause.class))); | ||
| }); | ||
| } | ||
|
|
||
| @Issue("JENKINS-36013") | ||
| @Test public void parallelNodeDisappearance() throws Throwable { | ||
| logging.recordPackage(ExecutorStepExecution.class, Level.FINE).record(FlowExecutionList.class, Level.FINE); | ||
| commonSetup(); | ||
| sessions.then(j -> { | ||
| WorkflowJob p = j.createProject(WorkflowJob.class, "p"); | ||
| p.setDefinition(new CpsFlowDefinition("def bs = [:]; for (int _i = 0; _i < 5; _i++) {def i = _i; bs[/b$i/] = {node('remote') {semaphore(/s$i/)}}}; parallel bs", true)); | ||
|
|
@@ -207,4 +210,43 @@ public class ExecutorStepDynamicContextTest { | |
| }); | ||
| } | ||
|
|
||
| @Test public void onceRetentionStrategyNodeDisappearance() throws Throwable { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Essentially a copy of |
||
| commonSetup(); | ||
| sessions.then(j -> { | ||
| DumbSlave s = j.createSlave(Label.get("ghost")); | ||
| s.setRetentionStrategy(new OnceRetentionStrategy(0)); | ||
|
Vlatombe marked this conversation as resolved.
|
||
| WorkflowJob p = j.createProject(WorkflowJob.class, "p"); | ||
| p.setDefinition(new CpsFlowDefinition("node('ghost') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true)); | ||
| var run = p.scheduleBuild2(0).waitForStart(); | ||
| j.waitForMessage("+ sleep infinity", run); | ||
| j.jenkins.removeNode(s); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC this is what |
||
| j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); | ||
| assertThat(j.jenkins.getQueue().getItems(), emptyArray()); | ||
| InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); | ||
| assertNotNull(iba); | ||
| assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); | ||
| }); | ||
| } | ||
|
|
||
| @Test public void cloudNodeDisappearance() throws Throwable { | ||
| commonSetup(); | ||
| sessions.then(j -> { | ||
| var mockCloud = new MockCloud("mock"); | ||
| mockCloud.setLabels("mock"); | ||
| j.jenkins.clouds.add(mockCloud); | ||
| WorkflowJob p = j.createProject(WorkflowJob.class, "p"); | ||
| p.setDefinition(new CpsFlowDefinition("node('mock') {if (isUnix()) {sh 'sleep infinity'} else {bat 'echo + sleep infinity && ping -n 999999 localhost'}}", true)); | ||
| WorkflowRun run = p.scheduleBuild2(0).waitForStart(); | ||
| j.waitForMessage("+ sleep infinity", run); | ||
| var mockNodes = j.jenkins.getLabel("mock").getNodes(); | ||
| assertThat(mockNodes, hasSize(1)); | ||
| var mockNode = mockNodes.iterator().next(); | ||
| j.jenkins.removeNode(mockNode); | ||
| j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(run)); | ||
| assertThat(j.jenkins.getQueue().getItems(), emptyArray()); | ||
| InterruptedBuildAction iba = run.getAction(InterruptedBuildAction.class); | ||
| assertNotNull(iba); | ||
| assertThat(iba.getCauses(), contains(isA(ExecutorStepExecution.RemovedNodeCause.class))); | ||
| }); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(The crucial part FTR.)