From 178e6f5830df03b9d6e6ae6705e03bd58e802702 Mon Sep 17 00:00:00 2001 From: stack72 Date: Tue, 5 May 2026 20:10:04 +0100 Subject: [PATCH 1/3] fix(shell): honor AbortSignal end-to-end so --timeout actually cancels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes swamp-club#247. PR #1305 added `--timeout` to `model method run` and `workflow run` but documented it as cooperative-only because built-in models did not honor the abort signal. This wires it through in two layers so the flag works in practice and surfaces the `cancelled` envelope the original PR promised. Layer 1 — built-in honoring AbortSignal: - `command/shell` now passes `context.signal` into `executeProcess` and re-throws AbortError ahead of the catch-block swallow paths so the abort is not buried as `exitCode = -1`. - `executeProcess` uses Deno.Command's native `signal` option (Deno 2.7.14 sends SIGTERM, matching the existing timeout-path semantics) and explicitly normalizes signal-driven kills to AbortError on all three execution branches (streaming, buffered+timeoutMs, simple buffered). The simple-buffered branch switches from `command.output()` to `command.spawn() + process.output()` because the former silently ignores `CommandOptions.signal` under Deno 2.7.14. Layer 2 — driver-layer error-type preservation: - `ExecutionResult` gains a `cancelled?: boolean` flag. - `RawExecutionDriver` sets it when the failure was an AbortError, so the type information survives the wire boundary that flattens exceptions to a string. - `DefaultMethodExecutionService` re-throws as `DOMException("AbortError")` when `driverResult.cancelled` is set (both call sites) so libswamp's `run.ts` handler routes it to `cancelled()` instead of `methodExecutionFailed()`. Tests: - Unit: 3 new shell_model tests (AbortError surfaces, no data record on abort, ignoreExitCode does not swallow AbortError); 2 new process_executor tests for the buffered branches; 2 new raw driver tests for the cancelled-flag contract. - Integration: new `integration/shell_timeout_test.ts` drives the CLI binary against `sleep 30` with `--timeout 1s` and asserts `code: "cancelled"` in the JSON envelope. Help text on `model method run --timeout` and `workflow run --timeout` updated to drop the "Cooperative — only honored by methods that check AbortSignal" caveat now that built-ins honor it transparently; extension-model methods still must check `ctx.signal`. Follow-up: swamp-uat's `tests/cli/adversarial/method_run_timeout_test.ts` works around this bug today by using a custom abort-aware-sleep extension fixture instead of `command/shell`. After this lands and a swamp release is tagged, that test should be updated (alongside the existing fixture- based variant, which remains useful as an extension-model regression guard) to also exercise `command/shell` directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- integration/shell_timeout_test.ts | 127 ++++++++++++++++++ src/cli/commands/model_method_run.ts | 2 +- src/cli/commands/workflow_run.ts | 2 +- src/domain/drivers/execution_driver.ts | 7 + src/domain/drivers/raw_execution_driver.ts | 3 + .../drivers/raw_execution_driver_test.ts | 58 ++++++++ .../models/command/shell/shell_model.ts | 7 + .../models/command/shell/shell_model_test.ts | 69 ++++++++++ src/domain/models/method_execution_service.ts | 23 +++- .../process/process_executor.ts | 54 ++++---- .../process/process_executor_test.ts | 68 +++++++++- 11 files changed, 385 insertions(+), 35 deletions(-) create mode 100644 integration/shell_timeout_test.ts diff --git a/integration/shell_timeout_test.ts b/integration/shell_timeout_test.ts new file mode 100644 index 00000000..5006e8a7 --- /dev/null +++ b/integration/shell_timeout_test.ts @@ -0,0 +1,127 @@ +// Swamp, an Automation Framework +// Copyright (C) 2026 System Initiative, Inc. +// +// This file is part of Swamp. +// +// Swamp is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License version 3 +// as published by the Free Software Foundation, with the Swamp +// Extension and Definition Exception (found in the "COPYING-EXCEPTION" +// file). +// +// Swamp is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with Swamp. If not, see . + +/** + * In-repo regression guard for swamp-club#247: built-in command/shell must + * honor `--timeout` end-to-end. Drives the CLI binary against a long-running + * shell command and asserts the run aborts well before the natural sleep + * duration. Catches regressions in PRs before swamp-uat picks them up on + * a separate release cadence. + */ + +import { assertEquals } from "@std/assert"; +import { initializeTestRepo, runCliCommand } from "./test_helpers.ts"; +import { YamlDefinitionRepository } from "../src/infrastructure/persistence/yaml_definition_repository.ts"; +import { Definition } from "../src/domain/definitions/definition.ts"; +import { SHELL_MODEL_TYPE } from "../src/domain/models/command/shell/shell_model.ts"; + +async function withTempDir(fn: (dir: string) => Promise): Promise { + const dir = await Deno.makeTempDir({ prefix: "swamp-shell-timeout-" }); + try { + await fn(dir); + } finally { + if (Deno.build.os === "windows") { + await Deno.remove(dir, { recursive: true }).catch(() => {}); + } else { + await Deno.remove(dir, { recursive: true }); + } + } +} + +Deno.test({ + name: + "CLI: --timeout aborts a long-running command/shell run well before completion", + // Uses POSIX `sleep`. PowerShell variant via `Start-Sleep` is plausible + // but would need a separate model definition; defer until needed. + ignore: Deno.build.os === "windows", + fn: async () => { + await withTempDir(async (repoDir) => { + await initializeTestRepo(repoDir); + + // Sleep 30s — picked so a working --timeout cannot be confused with + // natural completion under any reasonable CI jitter. + const definitionRepo = new YamlDefinitionRepository(repoDir); + const definition = Definition.create({ + name: "long-sleep", + methods: { + execute: { + arguments: { + run: "sleep 30", + workingDir: "/tmp", + }, + }, + }, + }); + await definitionRepo.save(SHELL_MODEL_TYPE, definition); + + const start = performance.now(); + const result = await runCliCommand( + [ + "model", + "method", + "run", + "long-sleep", + "execute", + "--repo-dir", + repoDir, + "--timeout", + "1s", + "--json", + ], + Deno.cwd(), + ); + const elapsed = performance.now() - start; + + // Non-zero exit confirms the run did not complete naturally. + assertEquals( + result.code !== 0, + true, + `expected non-zero exit on --timeout abort, got ${result.code}. ` + + `stdout: ${result.stdout}\nstderr: ${result.stderr}`, + ); + // Generous 10s ceiling absorbs CI jitter and CLI startup time + // (worktree may need to compile fresh deps); still well below the + // 30s natural duration that would indicate the abort never fired. + assertEquals( + elapsed < 10_000, + true, + `expected --timeout to abort within 10s, took ${elapsed.toFixed(0)}ms`, + ); + // The output should mention the abort/cancellation. The exact + // envelope shape (`code: "cancelled"` vs generic + // `method_execution_failed`) depends on libswamp's TimeoutError + // handling — match loosely so this guards the user-visible + // contract without pinning the wire format. + // The JSON envelope must carry `code: "cancelled"` so callers can + // distinguish a deadline abort from a generic execution failure. + // Without the driver-layer AbortError preservation, this surfaces + // as `method_execution_failed` and the contract regresses. + const envelope = JSON.parse(result.stdout) as { + code?: string; + error?: string; + }; + assertEquals( + envelope.code, + "cancelled", + `expected envelope.code === "cancelled", got ${envelope.code}. ` + + `full stdout: ${result.stdout}`, + ); + }); + }, +}); diff --git a/src/cli/commands/model_method_run.ts b/src/cli/commands/model_method_run.ts index b941c5ac..36c73b28 100644 --- a/src/cli/commands/model_method_run.ts +++ b/src/cli/commands/model_method_run.ts @@ -138,7 +138,7 @@ export const modelMethodRunCommand = new Command() ) .option( "--timeout ", - "Cancellation deadline — seconds (e.g. 30, 1800) or duration string (e.g. 30s, 5m, 1h). Cooperative — only honored by methods that check AbortSignal.", + "Cancels the run when reached — seconds (e.g. 30, 1800) or duration string (e.g. 30s, 5m, 1h). Extension model methods must check ctx.signal to honor cancellation.", ) .action( // @ts-expect-error - Cliffy custom type returns unknown instead of string diff --git a/src/cli/commands/workflow_run.ts b/src/cli/commands/workflow_run.ts index 07e18157..8eeee31d 100644 --- a/src/cli/commands/workflow_run.ts +++ b/src/cli/commands/workflow_run.ts @@ -129,7 +129,7 @@ export const workflowRunCommand = new Command() ) .option( "--timeout ", - "Cancellation deadline — seconds (e.g. 30, 1800) or duration string (e.g. 30s, 5m, 1h). Cooperative — only honored by methods that check AbortSignal.", + "Cancels the run when reached — seconds (e.g. 30, 1800) or duration string (e.g. 30s, 5m, 1h). Extension model methods must check ctx.signal to honor cancellation.", ) // @ts-expect-error - Cliffy custom type returns unknown instead of string .action(async function (options: AnyOptions, workflowIdOrName: string) { diff --git a/src/domain/drivers/execution_driver.ts b/src/domain/drivers/execution_driver.ts index c9ebf89f..5fd8e8d2 100644 --- a/src/domain/drivers/execution_driver.ts +++ b/src/domain/drivers/execution_driver.ts @@ -89,6 +89,13 @@ export interface ExecutionResult { status: "success" | "error"; /** Error message if status is "error". */ error?: string; + /** + * True when the error originated from an aborted AbortSignal (e.g. + * --timeout fired). Lets the caller distinguish cancellation from a + * generic execution failure even though `error` flattens the exception + * to a string. + */ + cancelled?: boolean; /** Outputs produced during execution. */ outputs: DriverOutput[]; /** Log lines captured during execution. */ diff --git a/src/domain/drivers/raw_execution_driver.ts b/src/domain/drivers/raw_execution_driver.ts index 3c438595..c38863c8 100644 --- a/src/domain/drivers/raw_execution_driver.ts +++ b/src/domain/drivers/raw_execution_driver.ts @@ -200,9 +200,12 @@ export class RawExecutionDriver implements ExecutionDriver { handle, })); + const cancelled = error instanceof DOMException && + error.name === "AbortError"; return { status: "error", error: error instanceof Error ? error.message : String(error), + cancelled, outputs, logs, durationMs, diff --git a/src/domain/drivers/raw_execution_driver_test.ts b/src/domain/drivers/raw_execution_driver_test.ts index 3e88f460..74b776dd 100644 --- a/src/domain/drivers/raw_execution_driver_test.ts +++ b/src/domain/drivers/raw_execution_driver_test.ts @@ -444,3 +444,61 @@ Deno.test("RawExecutionDriver: explicit queryData wins over dataQueryService der assertEquals(usedExplicit, true); assertEquals(usedDerived, false); }); + +// AbortError preservation across the driver boundary — guards swamp-club#247. +// The driver flattens exceptions to a string for the wire result, but it must +// also set `cancelled: true` when the failure was an AbortError so callers +// (method_execution_service) can re-throw an AbortError DOMException, which +// libswamp's run handler routes to the `cancelled` envelope. + +Deno.test( + "RawExecutionDriver: marks result.cancelled=true when method throws AbortError", + async () => { + const executor: MethodExecutor = { + execute: () => { + throw new DOMException("The operation was aborted.", "AbortError"); + }, + }; + + const context = createMockContext(); + const driver = new RawExecutionDriver( + executor, + testDefinition, + testMethod, + testModelDef, + context, + "test", + ); + + const result = await driver.execute(createMockRequest()); + + assertEquals(result.status, "error"); + assertEquals(result.cancelled, true); + }, +); + +Deno.test( + "RawExecutionDriver: leaves result.cancelled falsy for non-abort errors", + async () => { + const executor: MethodExecutor = { + execute: () => { + throw new Error("boom"); + }, + }; + + const context = createMockContext(); + const driver = new RawExecutionDriver( + executor, + testDefinition, + testMethod, + testModelDef, + context, + "test", + ); + + const result = await driver.execute(createMockRequest()); + + assertEquals(result.status, "error"); + assertEquals(result.cancelled, false); + }, +); diff --git a/src/domain/models/command/shell/shell_model.ts b/src/domain/models/command/shell/shell_model.ts index da1e20d4..287a0125 100644 --- a/src/domain/models/command/shell/shell_model.ts +++ b/src/domain/models/command/shell/shell_model.ts @@ -125,6 +125,7 @@ async function executeCommand( timeoutMs: args.timeout, logger: context.logger, redactor: context.redactor, + signal: context.signal, onOutput: context.onEvent ? (line: string, stream: "stdout" | "stderr") => context.onEvent!({ type: "output", line, stream }) @@ -136,6 +137,12 @@ async function executeCommand( exitCode = result.exitCode; durationMs = result.durationMs; } catch (error) { + // AbortError must escape ahead of the swallow paths below so that + // libswamp's run.ts handler can convert it to a `cancelled` envelope. + // Without this, --timeout aborts get buried as exit code -1. + if (error instanceof DOMException && error.name === "AbortError") { + throw error; + } // Handle execution errors (command not found, timeout, etc.) const rawError = error instanceof Error ? error.message : String(error); stderr = redact(rawError); diff --git a/src/domain/models/command/shell/shell_model_test.ts b/src/domain/models/command/shell/shell_model_test.ts index fa15fd27..afba9e26 100644 --- a/src/domain/models/command/shell/shell_model_test.ts +++ b/src/domain/models/command/shell/shell_model_test.ts @@ -533,6 +533,75 @@ posixOnlyTest( }, ); +// Abort signal propagation — guards swamp-club#247. +// shell_model must surface AbortError ahead of its silent-swallow paths so +// libswamp's run.ts can convert it to a `cancelled` envelope. + +posixOnlyTest( + "shellModel.methods.execute surfaces AbortError when ctx.signal aborts mid-run", + async () => { + const controller = new AbortController(); + const args: ShellInputAttributes = { run: "sleep 5" }; + + const { context } = createTestContext({ signal: controller.signal }); + setTimeout(() => controller.abort(), 100); + + const caught = await assertRejects(() => + shellModel.methods.execute.execute(args, context) + ); + // Must be an AbortError — NOT the generic "Command exited with code -1" + // wrapper that the catch block swallows other errors into. + assertEquals(caught instanceof DOMException, true); + assertEquals((caught as DOMException).name, "AbortError"); + }, +); + +posixOnlyTest( + "shellModel.methods.execute writes no data record on abort", + async () => { + const controller = new AbortController(); + const args: ShellInputAttributes = { run: "sleep 5" }; + + const { context, getResults } = createTestContext({ + signal: controller.signal, + }); + setTimeout(() => controller.abort(), 100); + + await assertRejects(() => + shellModel.methods.execute.execute(args, context) + ); + + // AbortError must escape before context.writeResource is called, so no + // data artifacts should have been persisted for the aborted run. + assertEquals(getResults().length, 0); + }, +); + +posixOnlyTest( + "shellModel.methods.execute does not let ignoreExitCode swallow AbortError", + async () => { + // ignoreExitCode is a data-plane signal ("don't throw on non-zero exit"). + // A timeout is a control-plane signal — the deadline elapsed. The two + // must not be conflated: even with ignoreExitCode=true, an aborted run + // must surface AbortError so the user sees a `cancelled` envelope, not + // a "successful" record with exit code 143. + const controller = new AbortController(); + const args: ShellInputAttributes = { + run: "sleep 5", + ignoreExitCode: true, + }; + + const { context } = createTestContext({ signal: controller.signal }); + setTimeout(() => controller.abort(), 100); + + const caught = await assertRejects(() => + shellModel.methods.execute.execute(args, context) + ); + assertEquals(caught instanceof DOMException, true); + assertEquals((caught as DOMException).name, "AbortError"); + }, +); + Deno.test("ShellInputAttributesSchema rejects invalid attributes", () => { const result = ShellInputAttributesSchema.safeParse({ notRun: "value" }); assertEquals(result.success, false); diff --git a/src/domain/models/method_execution_service.ts b/src/domain/models/method_execution_service.ts index 94b75d29..5b41187e 100644 --- a/src/domain/models/method_execution_service.ts +++ b/src/domain/models/method_execution_service.ts @@ -668,9 +668,16 @@ export class DefaultMethodExecutionService implements MethodExecutionService { currentHandles = await processDriverOutputs(driverResult.outputs); if (driverResult.status === "error") { - const err = new Error( - driverResult.error ?? "Method execution failed", - ); + // Preserve AbortError type so libswamp's run handler routes it to + // the `cancelled` envelope. Drivers flatten exceptions to a string + // for the wire result, so use the explicit `cancelled` flag rather + // than string-matching the message. + const err: Error = driverResult.cancelled + ? new DOMException( + driverResult.error ?? "The operation was aborted.", + "AbortError", + ) + : new Error(driverResult.error ?? "Method execution failed"); (err as unknown as Record).dataHandles = currentHandles; throw err; @@ -724,6 +731,16 @@ export class DefaultMethodExecutionService implements MethodExecutionService { })); if (driverResult.status === "error") { + // Preserve AbortError type so libswamp's run handler routes it to + // the `cancelled` envelope. Drivers flatten exceptions to a string + // for the wire result, so use the explicit `cancelled` flag rather + // than string-matching the message. + if (driverResult.cancelled) { + throw new DOMException( + driverResult.error ?? "The operation was aborted.", + "AbortError", + ); + } throw new Error(driverResult.error ?? "Driver execution failed"); } diff --git a/src/infrastructure/process/process_executor.ts b/src/infrastructure/process/process_executor.ts index 1ce658a7..0f1fcdf4 100644 --- a/src/infrastructure/process/process_executor.ts +++ b/src/infrastructure/process/process_executor.ts @@ -129,6 +129,16 @@ export async function executeProcess( commandOptions.env = options.env; } + // Wire the abort signal into Deno.Command natively. On abort, Deno sends + // SIGTERM to the child (verified under Deno 2.7.14). The native path + // resolves `process.status` / `process.output()` with `code: 143, + // signal: null` rather than rejecting — callers must check + // `options.signal?.aborted` after the await and surface AbortError so + // libswamp's run.ts handler converts it to `cancelled`. + if (options.signal) { + commandOptions.signal = options.signal; + } + const command = new Deno.Command(options.command, commandOptions); let stdout: string; @@ -152,27 +162,6 @@ export async function executeProcess( }, options.timeoutMs); } - // Kill subprocess when abort signal fires - let abortHandler: (() => void) | undefined; - if (options.signal) { - if (options.signal.aborted) { - try { - process.kill("SIGTERM"); - } catch { - // Process may have already exited - } - } else { - abortHandler = () => { - try { - process.kill("SIGTERM"); - } catch { - // Process may have already exited - } - }; - options.signal.addEventListener("abort", abortHandler, { once: true }); - } - } - try { const logger = options.logger; const redact = (line: string) => @@ -207,16 +196,16 @@ export async function executeProcess( if (timeoutId !== undefined) { clearTimeout(timeoutId); } - if (abortHandler && options.signal) { - options.signal.removeEventListener("abort", abortHandler); - } } if (timedOut) { throw new Error(`Command timed out after ${options.timeoutMs}ms`); } - // Re-throw as AbortError if signal was responsible for the kill + // Re-throw as AbortError if signal was responsible for the kill. + // Deno.Command's native signal sends SIGTERM and resolves status with + // code 143 rather than rejecting, so this normalization is load-bearing + // — without it, libswamp/models/run.ts would not recognise the abort. if (options.signal?.aborted) { throw new DOMException("The operation was aborted.", "AbortError"); } @@ -246,12 +235,23 @@ export async function executeProcess( if (timedOut) { throw new Error(`Command timed out after ${options.timeoutMs}ms`); } + + if (options.signal?.aborted) { + throw new DOMException("The operation was aborted.", "AbortError"); + } } else { - // Simple buffered execution - const output = await command.output(); + // Simple buffered execution. Use spawn() + output() rather than + // command.output() directly — the latter ignores CommandOptions.signal + // under Deno 2.7.14, while the spawned-process variant honors it. + const process = command.spawn(); + const output = await process.output(); stdout = new TextDecoder().decode(output.stdout); stderr = new TextDecoder().decode(output.stderr); exitCode = output.code; + + if (options.signal?.aborted) { + throw new DOMException("The operation was aborted.", "AbortError"); + } } const durationMs = Date.now() - startTime; diff --git a/src/infrastructure/process/process_executor_test.ts b/src/infrastructure/process/process_executor_test.ts index 9bace5f9..4af1e069 100644 --- a/src/infrastructure/process/process_executor_test.ts +++ b/src/infrastructure/process/process_executor_test.ts @@ -202,9 +202,9 @@ Deno.test("executeProcess handles timeout with logger", async () => { Deno.test({ name: "executeProcess: AbortSignal aborted mid-execution surfaces AbortError (streaming mode)", - // sleep(1) and SIGTERM via process.kill are POSIX-only contracts. The - // production code only attaches abort handling in streaming mode (when - // a logger is provided), so we exercise that path here. + // sleep(1) and SIGTERM via process.kill are POSIX-only contracts. This + // test pins the streaming-mode (logger-attached) signal path; the two + // tests below cover the buffered+timeoutMs and simple-buffered paths. ignore: Deno.build.os === "windows", fn: async () => { const mockLogger = { @@ -278,6 +278,68 @@ Deno.test({ }, }); +// AbortSignal handling in the buffered branches — guards swamp-club#247. +// The streaming-mode signal test at line 200+ already covers the +// logger-attached path. These two pin the contract for the buffered +// branches (with timeoutMs, and simple buffered) so the documented +// `signal` option works uniformly across all three execution modes. + +Deno.test({ + name: + "executeProcess: AbortSignal aborts buffered+timeoutMs run with AbortError", + ignore: Deno.build.os === "windows", + fn: async () => { + const controller = new AbortController(); + setTimeout(() => controller.abort(), 100); + + const start = performance.now(); + let caught: unknown; + try { + await executeProcess({ + command: "sleep", + args: ["5"], + timeoutMs: 10_000, + signal: controller.signal, + }); + } catch (err) { + caught = err; + } + const elapsed = performance.now() - start; + + const err = caught as { name?: string }; + assertEquals(err?.name, "AbortError"); + // Sanity: the abort short-circuited well before the 10s natural timeout. + assertEquals(elapsed < 5_000, true, `elapsed ${elapsed}ms exceeded 5s`); + }, +}); + +Deno.test({ + name: + "executeProcess: AbortSignal aborts simple buffered run with AbortError", + ignore: Deno.build.os === "windows", + fn: async () => { + const controller = new AbortController(); + setTimeout(() => controller.abort(), 100); + + const start = performance.now(); + let caught: unknown; + try { + await executeProcess({ + command: "sleep", + args: ["5"], + signal: controller.signal, + }); + } catch (err) { + caught = err; + } + const elapsed = performance.now() - start; + + const err = caught as { name?: string }; + assertEquals(err?.name, "AbortError"); + assertEquals(elapsed < 5_000, true, `elapsed ${elapsed}ms exceeded 5s`); + }, +}); + Deno.test("executeProcess redacts secrets from streamed stdout lines", async () => { const infoLines: string[] = []; const warnLines: string[] = []; From 72d7e76467fe2c08f84282d7a5b084b26533fe8e Mon Sep 17 00:00:00 2001 From: stack72 Date: Tue, 5 May 2026 20:37:00 +0100 Subject: [PATCH 2/3] fix(shell): use manual abort listener for cross-platform signal kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux CI revealed that Deno.Command's native `signal` option does not reliably propagate `AbortSignal.any([signal, AbortSignal.timeout(ms)])` — the exact pattern libswamp's `withTimeout` produces. Direct AbortControllers worked (so unit tests passed on Linux) but the integration test took the full 30s sleep instead of aborting at 1s. Replace the native option with a manual `addEventListener('abort')` + `process.kill('SIGTERM')` pattern, extracted into `attachSignalKill` and applied to all three executeProcess branches. Same kill semantics (SIGTERM), same surface contract (AbortError on signal-driven kill), but works uniformly across platforms. Add a regression test that uses the exact `AbortSignal.any + timeout` shape so a future "simplification" back to native-signal-only fails at the unit level rather than in flaky integration coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../process/process_executor.ts | 81 ++++++++++++++----- .../process/process_executor_test.ts | 43 ++++++++++ 2 files changed, 103 insertions(+), 21 deletions(-) diff --git a/src/infrastructure/process/process_executor.ts b/src/infrastructure/process/process_executor.ts index 0f1fcdf4..a99aee52 100644 --- a/src/infrastructure/process/process_executor.ts +++ b/src/infrastructure/process/process_executor.ts @@ -60,6 +60,40 @@ export interface ProcessResult { durationMs: number; } +/** + * Attaches a SIGTERM-on-abort listener to the given child process. Returns a + * cleanup function that detaches the listener — call it from a `finally` block + * to avoid leaking listeners when the process exits normally. + * + * Why a manual listener rather than `Deno.CommandOptions.signal`: the native + * option works for direct AbortControllers but does not reliably propagate + * `AbortSignal.any([..., AbortSignal.timeout(...)])` on Linux (observed in + * CI for swamp-club#247). The manual `addEventListener('abort')` path is + * proven to work uniformly across platforms. + */ +function attachSignalKill( + process: Deno.ChildProcess, + signal: AbortSignal, +): () => void { + if (signal.aborted) { + try { + process.kill("SIGTERM"); + } catch { + // Process may have already exited + } + return () => {}; + } + const handler = () => { + try { + process.kill("SIGTERM"); + } catch { + // Process may have already exited + } + }; + signal.addEventListener("abort", handler, { once: true }); + return () => signal.removeEventListener("abort", handler); +} + /** * Reads lines from a ReadableStream, calling onLine for each complete line. * Returns the full accumulated output as a string. @@ -129,16 +163,6 @@ export async function executeProcess( commandOptions.env = options.env; } - // Wire the abort signal into Deno.Command natively. On abort, Deno sends - // SIGTERM to the child (verified under Deno 2.7.14). The native path - // resolves `process.status` / `process.output()` with `code: 143, - // signal: null` rather than rejecting — callers must check - // `options.signal?.aborted` after the await and surface AbortError so - // libswamp's run.ts handler converts it to `cancelled`. - if (options.signal) { - commandOptions.signal = options.signal; - } - const command = new Deno.Command(options.command, commandOptions); let stdout: string; @@ -150,6 +174,9 @@ export async function executeProcess( const process = command.spawn(); let timeoutId: number | undefined; let timedOut = false; + const detachSignal = options.signal + ? attachSignalKill(process, options.signal) + : () => {}; if (options.timeoutMs) { timeoutId = setTimeout(() => { @@ -196,16 +223,17 @@ export async function executeProcess( if (timeoutId !== undefined) { clearTimeout(timeoutId); } + detachSignal(); } if (timedOut) { throw new Error(`Command timed out after ${options.timeoutMs}ms`); } - // Re-throw as AbortError if signal was responsible for the kill. - // Deno.Command's native signal sends SIGTERM and resolves status with - // code 143 rather than rejecting, so this normalization is load-bearing - // — without it, libswamp/models/run.ts would not recognise the abort. + // Surface AbortError if the signal aborted — the manual SIGTERM in + // attachSignalKill resolves `process.status` with `code: 143` rather + // than rejecting, so this normalization is load-bearing for libswamp's + // run.ts handler to route it to the `cancelled` envelope. if (options.signal?.aborted) { throw new DOMException("The operation was aborted.", "AbortError"); } @@ -213,6 +241,9 @@ export async function executeProcess( // Buffered with timeout const process = command.spawn(); let timedOut = false; + const detachSignal = options.signal + ? attachSignalKill(process, options.signal) + : () => {}; const timeoutId = setTimeout(() => { timedOut = true; @@ -230,6 +261,7 @@ export async function executeProcess( exitCode = output.code; } finally { clearTimeout(timeoutId); + detachSignal(); } if (timedOut) { @@ -240,14 +272,21 @@ export async function executeProcess( throw new DOMException("The operation was aborted.", "AbortError"); } } else { - // Simple buffered execution. Use spawn() + output() rather than - // command.output() directly — the latter ignores CommandOptions.signal - // under Deno 2.7.14, while the spawned-process variant honors it. + // Simple buffered execution. Use spawn() + output() so the manual + // abort listener can call process.kill() — `command.output()` does + // not expose the underlying child handle. const process = command.spawn(); - const output = await process.output(); - stdout = new TextDecoder().decode(output.stdout); - stderr = new TextDecoder().decode(output.stderr); - exitCode = output.code; + const detachSignal = options.signal + ? attachSignalKill(process, options.signal) + : () => {}; + try { + const output = await process.output(); + stdout = new TextDecoder().decode(output.stdout); + stderr = new TextDecoder().decode(output.stderr); + exitCode = output.code; + } finally { + detachSignal(); + } if (options.signal?.aborted) { throw new DOMException("The operation was aborted.", "AbortError"); diff --git a/src/infrastructure/process/process_executor_test.ts b/src/infrastructure/process/process_executor_test.ts index 4af1e069..115a5fed 100644 --- a/src/infrastructure/process/process_executor_test.ts +++ b/src/infrastructure/process/process_executor_test.ts @@ -340,6 +340,49 @@ Deno.test({ }, }); +// Regression for swamp-club#247: libswamp's `withTimeout` builds a signal as +// `AbortSignal.any([userSignal, AbortSignal.timeout(ms)])`. An earlier fix +// attempt relied on Deno.Command's native `signal` option, which propagated +// for direct AbortControllers but not reliably for `AbortSignal.any` of a +// timeout (observed on Linux CI). This test pins the actual production +// pattern so a future regression to native-signal-only fails here, not in +// flaky integration coverage. +Deno.test({ + name: + "executeProcess: AbortSignal.any + AbortSignal.timeout aborts streaming run", + ignore: Deno.build.os === "windows", + fn: async () => { + const mockLogger = { + info: () => {}, + warn: () => {}, + } as unknown as import("@logtape/logtape").Logger; + + const userController = new AbortController(); + const combined = AbortSignal.any([ + userController.signal, + AbortSignal.timeout(100), + ]); + + const start = performance.now(); + let caught: unknown; + try { + await executeProcess({ + command: "sleep", + args: ["5"], + logger: mockLogger, + signal: combined, + }); + } catch (err) { + caught = err; + } + const elapsed = performance.now() - start; + + const err = caught as { name?: string }; + assertEquals(err?.name, "AbortError"); + assertEquals(elapsed < 5_000, true, `elapsed ${elapsed}ms exceeded 5s`); + }, +}); + Deno.test("executeProcess redacts secrets from streamed stdout lines", async () => { const infoLines: string[] = []; const warnLines: string[] = []; From b5c3adb2e2f7b7bf1769cf929cd40ccd509eba75 Mon Sep 17 00:00:00 2001 From: stack72 Date: Tue, 5 May 2026 21:33:49 +0100 Subject: [PATCH 3/3] fix(shell): make streamLines abort-aware so dash-fork orphans don't hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux CI failure root cause: dash forks rather than execs for single-command `sh -c` invocations, so SIGTERM to the parent shell leaves an orphaned grandchild holding the stdout/stderr pipes open. macOS bash exec's the single command (parent PID becomes the command) so the orphan never appears — which is why the bug only surfaced on Linux CI. When the orphan keeps the write end of the pipe alive, the deno-side read loops in `streamLines` never get EOF, and the wider Promise.all over [stdout, stderr, status] blocks until the orphan exits naturally (30s for `sleep 30`). Fix: thread the abort signal into `streamLines` so each `reader.read()` races against `signal.aborted`. When the signal fires, the loop releases the reader and returns regardless of pipe state. Add a regression test that runs `/bin/dash -c 'sleep 30'` with a 500ms abort and asserts elapsed < 5s. dash is reliably present on Linux CI and on macOS too, so this exact failure mode is now caught at unit-test time rather than only in CI integration. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../process/process_executor.ts | 35 +++++++++++-- .../process/process_executor_test.ts | 51 +++++++++++++++++++ 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/src/infrastructure/process/process_executor.ts b/src/infrastructure/process/process_executor.ts index a99aee52..1889bca8 100644 --- a/src/infrastructure/process/process_executor.ts +++ b/src/infrastructure/process/process_executor.ts @@ -97,10 +97,17 @@ function attachSignalKill( /** * Reads lines from a ReadableStream, calling onLine for each complete line. * Returns the full accumulated output as a string. + * + * When `signal` is provided, the read loop releases the reader and returns + * as soon as the signal aborts. This unblocks callers when the underlying + * pipe stays open due to grandchildren that inherited it (e.g. dash on Linux + * forks `sleep 30` from `sh -c`; SIGTERM kills sh but the orphan keeps the + * write end of the pipe open, so the deno-side reader never sees EOF). */ export async function streamLines( stream: ReadableStream, onLine?: (line: string) => void, + signal?: AbortSignal, ): Promise { const reader = stream.getReader(); const decoder = new TextDecoder(); @@ -109,10 +116,25 @@ export async function streamLines( try { while (true) { - const { done, value } = await reader.read(); - if (done) break; + // Race the next read against the abort signal so an aborted run + // doesn't block on an inherited-pipe orphan that never EOFs. + const readPromise = reader.read(); + const result = signal && !signal.aborted + ? await Promise.race([ + readPromise, + new Promise<{ done: true; value: undefined }>((resolve) => { + signal.addEventListener( + "abort", + () => resolve({ done: true, value: undefined }), + { once: true }, + ); + }), + ]) + : await readPromise; + + if (signal?.aborted || result.done) break; - buffer += decoder.decode(value, { stream: true }); + buffer += decoder.decode(result.value, { stream: true }); const bufferLines = buffer.split("\n"); // Process all complete lines @@ -194,6 +216,9 @@ export async function executeProcess( const redact = (line: string) => options.redactor?.hasSecrets ? options.redactor.redact(line) : line; const onOutput = options.onOutput; + // Pass the abort signal down so the read loops bail out when an + // orphaned grandchild keeps the pipe write end open after the parent + // shell dies (dash on Linux, see streamLines docs). const [stdoutResult, stderrResult, status] = await Promise.all([ streamLines(process.stdout, (line) => { const redacted = redact(line); @@ -203,7 +228,7 @@ export async function executeProcess( } else { logger.info(redacted); } - }), + }, options.signal), streamLines(process.stderr, (line) => { const redacted = redact(line); if (onOutput) { @@ -212,7 +237,7 @@ export async function executeProcess( } else { logger.warn(redacted); } - }), + }, options.signal), process.status, ]); diff --git a/src/infrastructure/process/process_executor_test.ts b/src/infrastructure/process/process_executor_test.ts index 115a5fed..e491473d 100644 --- a/src/infrastructure/process/process_executor_test.ts +++ b/src/infrastructure/process/process_executor_test.ts @@ -383,6 +383,57 @@ Deno.test({ }, }); +// Regression for the Linux-CI failure mode of swamp-club#247: dash forks +// rather than exec'ing single-command `sh -c` invocations, so SIGTERM to +// the parent shell leaves an orphaned grandchild holding the stdout/stderr +// pipes open. Without abort-aware stream reads, Promise.all blocks until +// the orphan exits naturally. /bin/dash is reliably present on Linux CI +// runners and on macOS too (this test exercises the same dash binary on +// both platforms so the local result mirrors CI). +Deno.test({ + name: + "executeProcess: aborts even when child shell forks a long-running grandchild (dash)", + ignore: Deno.build.os === "windows", + fn: async () => { + try { + await Deno.stat("/bin/dash"); + } catch { + // dash absent — skip rather than fail; the regression this test + // guards against requires a fork-then-exec shell. + return; + } + + const mockLogger = { + info: () => {}, + warn: () => {}, + } as unknown as import("@logtape/logtape").Logger; + + const combined = AbortSignal.any([AbortSignal.timeout(500)]); + + const start = performance.now(); + let caught: unknown; + try { + await executeProcess({ + command: "/bin/dash", + args: ["-c", "sleep 30"], + logger: mockLogger, + signal: combined, + }); + } catch (err) { + caught = err; + } + const elapsed = performance.now() - start; + + const err = caught as { name?: string }; + assertEquals(err?.name, "AbortError"); + assertEquals( + elapsed < 5_000, + true, + `elapsed ${elapsed}ms exceeded 5s — orphan-pipe regression?`, + ); + }, +}); + Deno.test("executeProcess redacts secrets from streamed stdout lines", async () => { const infoLines: string[] = []; const warnLines: string[] = [];