From 6218523789b9d48543a9dc0a36951ed279265223 Mon Sep 17 00:00:00 2001 From: stack72 Date: Tue, 5 May 2026 14:38:37 +0100 Subject: [PATCH] fix(persistence): close TOCTOU window in find* walkers (review followup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Followup to swamp-club#240 / #1306. Review surfaced two related issues that landed without being addressed: 1. **TOCTOU race in find* walkers**: `Deno.stat` and `Deno.readTextFile` in `YamlWorkflowRunRepository.findRunsSinceByWorkflowId` and `YamlOutputRepository.findAllGlobalSince` were unguarded. A concurrent delete (GC, `deleteAllByWorkflowId`, output cleanup) between `readDir` and the per-file ops would throw `NotFound`, propagate to the outer catch, and silently discard runs already collected for the workflow (run repo) or abandon the rest of the model type's outputs (output repo). The pre-existing `findAllByWorkflowId` and `findAll(type)` shared the bug. Fix: wrap each per-file body in its own `try/catch` that `continue`s on `NotFound`, matching the pattern `unified_data_repository.ts:273` already uses. The outer catch keeps its role of handling "directory itself doesn't exist." All four walkers across the workflow-run and output repos now use the same shape. 2. **`YamlOutputRepository.findAllGlobalSince` and `FileSystemUnifiedDataRepository.findAllGlobalSince` lacked direct unit tests**. They were covered indirectly via service-layer mocks, so a divergence between the three implementations could ship undetected. Added tests mirroring the workflow-run repo's patterns: in-window-only filtering and the "file deleted mid-iteration" race that this PR's #1 fix prevents. Also includes two cosmetic polishes called out in review: - `--limit ` → `` to match `data_search` and `workflow_run_search` flag placeholders in `--help` output. - Added `swamp summarise --limit 10` to the command's example block so the flag shows up alongside `--since` examples. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/cli/commands/summarise.ts | 3 +- .../unified_data_repository_test.ts | 85 +++++++++++++++++ .../persistence/yaml_output_repository.ts | 82 +++++++++++------ .../yaml_output_repository_test.ts | 91 +++++++++++++++++++ .../yaml_workflow_run_repository.ts | 61 +++++++++---- .../yaml_workflow_run_repository_test.ts | 54 +++++++++++ 6 files changed, 326 insertions(+), 50 deletions(-) diff --git a/src/cli/commands/summarise.ts b/src/cli/commands/summarise.ts index bc1041b5..4ad5c9a7 100644 --- a/src/cli/commands/summarise.ts +++ b/src/cli/commands/summarise.ts @@ -48,6 +48,7 @@ export const summariseCommand = new Command() .example("Show recent activity", "swamp summarise") .example("Activity from the last day", "swamp summarise --since 1d") .example("Activity from the last hour", "swamp summarise --since 1h") + .example("Cap detail output on a large repo", "swamp summarise --limit 10") .option( "--repo-dir ", "Repository directory (env: SWAMP_REPO_DIR)", @@ -56,7 +57,7 @@ export const summariseCommand = new Command() default: "7d", }) .option( - "--limit ", + "--limit ", "Cap per-group run details (counts still reflect all matching runs)", ) .action(async function (options) { diff --git a/src/infrastructure/persistence/unified_data_repository_test.ts b/src/infrastructure/persistence/unified_data_repository_test.ts index 7cefb6b1..b62a9ee3 100644 --- a/src/infrastructure/persistence/unified_data_repository_test.ts +++ b/src/infrastructure/persistence/unified_data_repository_test.ts @@ -750,3 +750,88 @@ Deno.test("markDirty is not called on read paths", async () => { } } }); + +// ============================================================================ +// findAllGlobalSince — mirrors the workflow-run repo tests so the three +// implementations of the same two-stage filter stay in lockstep. +// ============================================================================ + +async function withDataRepo( + fn: ( + repo: FileSystemUnifiedDataRepository, + tmpDir: string, + ) => Promise, +): Promise { + const tmpDir = await Deno.makeTempDir(); + try { + const catalogStore = new CatalogStore(join(tmpDir, "_catalog.db")); + const repo = new FileSystemUnifiedDataRepository( + tmpDir, + undefined, + catalogStore, + ); + await fn(repo, tmpDir); + } finally { + if (Deno.build.os === "windows") { + await Deno.remove(tmpDir, { recursive: true }).catch(() => {}); + } else { + await Deno.remove(tmpDir, { recursive: true }); + } + } +} + +Deno.test("findAllGlobalSince: returns only in-window data items", async () => { + await withDataRepo(async (repo) => { + const old = makeData("old-data"); + await repo.save(testType, "model-1", old, new TextEncoder().encode("x")); + + const oldDate = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000); + const oldMetadataPath = repo.getMetadataPath( + testType, + "model-1", + "old-data", + 1, + ); + await Deno.utime(oldMetadataPath, oldDate, oldDate); + + const fresh = makeData("fresh-data"); + await repo.save(testType, "model-1", fresh, new TextEncoder().encode("y")); + + const cutoff = new Date(Date.now() - 60 * 60 * 1000); + const found = await repo.findAllGlobalSince(cutoff); + + assertEquals(found.length, 1); + assertEquals(found[0].data.name, "fresh-data"); + }); +}); + +Deno.test( + "findAllGlobalSince: file deleted mid-iteration is skipped, not fatal", + async () => { + await withDataRepo(async (repo) => { + const keep = makeData("keep-data"); + await repo.save(testType, "model-1", keep, new TextEncoder().encode("x")); + + const doomed = makeData("doomed-data"); + await repo.save( + testType, + "model-1", + doomed, + new TextEncoder().encode("y"), + ); + + // Concurrent deletion of the doomed item's metadata file. The data + // repo already wraps stat in per-file try/catch; this test pins + // that behavior so future refactors don't lose it. + await Deno.remove( + repo.getMetadataPath(testType, "model-1", "doomed-data", 1), + ); + + const cutoff = new Date(Date.now() - 60 * 60 * 1000); + const found = await repo.findAllGlobalSince(cutoff); + + assertEquals(found.length, 1); + assertEquals(found[0].data.name, "keep-data"); + }); + }, +); diff --git a/src/infrastructure/persistence/yaml_output_repository.ts b/src/infrastructure/persistence/yaml_output_repository.ts index c21d428f..f00758eb 100644 --- a/src/infrastructure/persistence/yaml_output_repository.ts +++ b/src/infrastructure/persistence/yaml_output_repository.ts @@ -122,24 +122,34 @@ export class YamlOutputRepository implements OutputRepository { try { // Iterate over method directories for await (const methodEntry of Deno.readDir(typeDir)) { - if (methodEntry.isDirectory) { - const methodDir = join(typeDir, methodEntry.name); - // Iterate over output files in method directory - for await (const entry of Deno.readDir(methodDir)) { - if (entry.isFile && entry.name.endsWith(".yaml")) { - const path = join(methodDir, entry.name); - const content = await Deno.readTextFile(path); - const data = parseYaml(content) as ModelOutputData; - // Convert logFile back to absolute path - if (data.logFile) { - data.logFile = toAbsolutePath(this.repoDir, data.logFile); - } - outputs.push(ModelOutput.fromData(data)); + if (!methodEntry.isDirectory) continue; + const methodDir = join(typeDir, methodEntry.name); + // Iterate over output files in method directory + for await (const entry of Deno.readDir(methodDir)) { + if (!entry.isFile || !entry.name.endsWith(".yaml")) continue; + const path = join(methodDir, entry.name); + + // Per-file try/catch closes the TOCTOU window: a concurrent + // delete (e.g. GC, output cleanup) can remove the file + // between readDir and readTextFile. NotFound on a single + // file means "skip it" — never "abandon the rest of the + // current method directory." + try { + const content = await Deno.readTextFile(path); + const data = parseYaml(content) as ModelOutputData; + if (data.logFile) { + data.logFile = toAbsolutePath(this.repoDir, data.logFile); } + outputs.push(ModelOutput.fromData(data)); + } catch (error) { + if (error instanceof Deno.errors.NotFound) continue; + throw error; } } } } catch (error) { + // Outer catch handles "type/method directory itself doesn't + // exist." Per-file NotFound is handled above. if (error instanceof Deno.errors.NotFound) { return []; } @@ -197,28 +207,40 @@ export class YamlOutputRepository implements OutputRepository { if (!entry.isFile || !entry.name.endsWith(".yaml")) continue; const path = join(methodDir, entry.name); - // Stage A: mtime pre-filter - const stat = await Deno.stat(path); - const mtimeMs = stat.mtime?.getTime(); - if (mtimeMs !== undefined && mtimeMs < cutoffMs) continue; + // Per-file try/catch closes the TOCTOU window: a concurrent + // delete can remove the file between readDir and stat or + // between stat and readTextFile. NotFound on a single file + // means "skip it" — never "abandon the rest of the current + // method directory or model type." + try { + // Stage A: mtime pre-filter + const stat = await Deno.stat(path); + const mtimeMs = stat.mtime?.getTime(); + if (mtimeMs !== undefined && mtimeMs < cutoffMs) continue; - // Stage B: parse and verify - const content = await Deno.readTextFile(path); - const data = parseYaml(content) as ModelOutputData; - if (data.logFile) { - data.logFile = toAbsolutePath(this.repoDir, data.logFile); - } - const output = ModelOutput.fromData(data); - if (output.startedAt.getTime() < cutoffMs) continue; + // Stage B: parse and verify + const content = await Deno.readTextFile(path); + const data = parseYaml(content) as ModelOutputData; + if (data.logFile) { + data.logFile = toAbsolutePath(this.repoDir, data.logFile); + } + const output = ModelOutput.fromData(data); + if (output.startedAt.getTime() < cutoffMs) continue; - results.push({ - output, - type: modelType, - method: output.methodName, - }); + results.push({ + output, + type: modelType, + method: output.methodName, + }); + } catch (error) { + if (error instanceof Deno.errors.NotFound) continue; + throw error; + } } } } catch (error) { + // Outer catch handles "type/method directory itself doesn't + // exist." Per-file NotFound is handled above. if (error instanceof Deno.errors.NotFound) continue; throw error; } diff --git a/src/infrastructure/persistence/yaml_output_repository_test.ts b/src/infrastructure/persistence/yaml_output_repository_test.ts index 0fb3c2a2..4eb31b5d 100644 --- a/src/infrastructure/persistence/yaml_output_repository_test.ts +++ b/src/infrastructure/persistence/yaml_output_repository_test.ts @@ -29,6 +29,14 @@ import { createDefinitionId } from "../../domain/definitions/definition.ts"; import { ModelType } from "../../domain/models/model_type.ts"; import { YamlOutputRepository } from "./yaml_output_repository.ts"; +// Import the models barrel so `modelRegistry.types()` returns real entries — +// `findAllGlobalSince` and `findAll` both walk the registry, so tests that +// exercise them need at least one registered type. The `command/shell` model +// is the simplest entry the barrel registers. +import "../../domain/models/models.ts"; + +const registeredType = ModelType.create("command/shell"); + async function withTempDir(fn: (dir: string) => Promise): Promise { const dir = await Deno.makeTempDir({ prefix: "swamp-test-" }); try { @@ -402,3 +410,86 @@ Deno.test("YamlOutputRepository invokes markDirty with relPath on mutations", as assertEquals(calls.length, 2); }); }); + +// findAllGlobalSince tests use a type that's actually in the model registry +// because the implementation iterates `modelRegistry.types()` to know where +// to look on disk. + +async function makeOutput( + repo: YamlOutputRepository, + startedAt: Date, +): Promise { + const output = ModelOutput.create({ + definitionId: createDefinitionId(crypto.randomUUID()), + methodName: "run", + status: "running", + startedAt, + provenance: defaultProvenance, + }); + output.markSucceeded(); + await repo.save(registeredType, "run", output); + return output; +} + +Deno.test("findAllGlobalSince: returns only in-window outputs", async () => { + await withTempDir(async (dir) => { + const repo = new YamlOutputRepository(dir); + + const oldDate = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000); + const old = await makeOutput(repo, oldDate); + + // Backdate the file so its mtime falls before the cutoff. + const oldPath = repo.getPath(registeredType, "run", old); + await Deno.utime(oldPath, oldDate, oldDate); + + const fresh = await makeOutput(repo, new Date()); + + const cutoff = new Date(Date.now() - 60 * 60 * 1000); + const found = await repo.findAllGlobalSince(cutoff); + + assertEquals(found.length, 1); + assertEquals(found[0].output.id, fresh.id); + }); +}); + +Deno.test( + "findAllGlobalSince: file deleted mid-iteration is skipped, not fatal", + async () => { + await withTempDir(async (dir) => { + const repo = new YamlOutputRepository(dir); + + const keep = await makeOutput(repo, new Date()); + const doomed = await makeOutput(repo, new Date()); + + // Simulate a concurrent deletion. Pre-fix behavior: NotFound from + // Deno.stat propagates to the model-type catch and continues past + // the entire current type, dropping `keep`. + await Deno.remove(repo.getPath(registeredType, "run", doomed)); + + const cutoff = new Date(Date.now() - 60 * 60 * 1000); + const found = await repo.findAllGlobalSince(cutoff); + + assertEquals(found.length, 1); + assertEquals(found[0].output.id, keep.id); + }); + }, +); + +Deno.test( + "findAll: file deleted mid-iteration is skipped, not fatal", + async () => { + await withTempDir(async (dir) => { + const repo = new YamlOutputRepository(dir); + + const keep = await makeOutput(repo, new Date()); + const doomed = await makeOutput(repo, new Date()); + + await Deno.remove(repo.getPath(registeredType, "run", doomed)); + + const found = await repo.findAll(registeredType); + + assertEquals(found.length, 1); + assertEquals(found[0].id, keep.id); + }); + }, +); diff --git a/src/infrastructure/persistence/yaml_workflow_run_repository.ts b/src/infrastructure/persistence/yaml_workflow_run_repository.ts index 4f71b472..ce2f0419 100644 --- a/src/infrastructure/persistence/yaml_workflow_run_repository.ts +++ b/src/infrastructure/persistence/yaml_workflow_run_repository.ts @@ -109,20 +109,32 @@ export class YamlWorkflowRunRepository implements WorkflowRunRepository { try { for await (const entry of Deno.readDir(dir)) { if ( - entry.isFile && entry.name.startsWith("workflow-run-") && - entry.name.endsWith(".yaml") + !entry.isFile || !entry.name.startsWith("workflow-run-") || + !entry.name.endsWith(".yaml") ) { - const path = join(dir, entry.name); + continue; + } + const path = join(dir, entry.name); + + // Per-file try/catch closes the TOCTOU window: a concurrent + // delete (e.g. deleteAllByWorkflowId, GC) can remove the file + // between readDir and readTextFile. NotFound on a single file + // means "skip it" — never "abandon the rest of the workflow." + try { const content = await Deno.readTextFile(path); const data = parseYaml(content) as WorkflowRunData; - // Convert logFile back to absolute path if (data.logFile) { data.logFile = toAbsolutePath(this.repoDir, data.logFile); } runs.push(WorkflowRun.fromData(data)); + } catch (error) { + if (error instanceof Deno.errors.NotFound) continue; + throw error; } } } catch (error) { + // Outer catch handles "directory itself doesn't exist" (no runs yet + // for this workflow). Per-file NotFound is handled above. if (error instanceof Deno.errors.NotFound) { return []; } @@ -247,24 +259,35 @@ export class YamlWorkflowRunRepository implements WorkflowRunRepository { } const path = join(dir, entry.name); - // Stage A: mtime pre-filter - const stat = await Deno.stat(path); - const mtimeMs = stat.mtime?.getTime(); - if (mtimeMs !== undefined && mtimeMs < cutoffMs) continue; - - // Stage B: parse and verify - const content = await Deno.readTextFile(path); - const data = parseYaml(content) as WorkflowRunData; - if (data.logFile) { - data.logFile = toAbsolutePath(this.repoDir, data.logFile); + // Per-file try/catch closes the TOCTOU window: a concurrent + // delete can remove the file between readDir and stat or between + // stat and readTextFile. NotFound on a single file means "skip + // it" — never "discard runs already collected for this workflow." + try { + // Stage A: mtime pre-filter + const stat = await Deno.stat(path); + const mtimeMs = stat.mtime?.getTime(); + if (mtimeMs !== undefined && mtimeMs < cutoffMs) continue; + + // Stage B: parse and verify + const content = await Deno.readTextFile(path); + const data = parseYaml(content) as WorkflowRunData; + if (data.logFile) { + data.logFile = toAbsolutePath(this.repoDir, data.logFile); + } + const run = WorkflowRun.fromData(data); + const startedAtMs = run.startedAt?.getTime(); + if (startedAtMs === undefined || startedAtMs < cutoffMs) continue; + + runs.push(run); + } catch (error) { + if (error instanceof Deno.errors.NotFound) continue; + throw error; } - const run = WorkflowRun.fromData(data); - const startedAtMs = run.startedAt?.getTime(); - if (startedAtMs === undefined || startedAtMs < cutoffMs) continue; - - runs.push(run); } } catch (error) { + // Outer catch handles "directory itself doesn't exist." Per-file + // NotFound is handled above. if (error instanceof Deno.errors.NotFound) { return []; } diff --git a/src/infrastructure/persistence/yaml_workflow_run_repository_test.ts b/src/infrastructure/persistence/yaml_workflow_run_repository_test.ts index ee8092b9..4a6d5778 100644 --- a/src/infrastructure/persistence/yaml_workflow_run_repository_test.ts +++ b/src/infrastructure/persistence/yaml_workflow_run_repository_test.ts @@ -525,3 +525,57 @@ Deno.test( }); }, ); + +Deno.test( + "findAllGlobalSince: file deleted mid-iteration is skipped, not fatal", + async () => { + await withTempDir(async (dir) => { + const repo = new YamlWorkflowRunRepository(dir); + const workflow = createTestWorkflow(); + + const keep = WorkflowRun.create(workflow); + keep.start(); + await repo.save(workflow.id, keep); + + const doomed = WorkflowRun.create(workflow); + doomed.start(); + await repo.save(workflow.id, doomed); + + // Simulate a concurrent deletion by removing the file. Pre-fix + // behavior: NotFound thrown by Deno.stat propagates to the + // workflow-level catch, returning [] and losing `keep` too. + await Deno.remove(repo.getPath(workflow.id, doomed.id)); + + const cutoff = new Date(Date.now() - 60 * 60 * 1000); + const found = await repo.findAllGlobalSince(cutoff); + + assertEquals(found.length, 1); + assertEquals(found[0].run.id, keep.id); + }); + }, +); + +Deno.test( + "findAllByWorkflowId: file deleted mid-iteration is skipped, not fatal", + async () => { + await withTempDir(async (dir) => { + const repo = new YamlWorkflowRunRepository(dir); + const workflow = createTestWorkflow(); + + const keep = WorkflowRun.create(workflow); + keep.start(); + await repo.save(workflow.id, keep); + + const doomed = WorkflowRun.create(workflow); + doomed.start(); + await repo.save(workflow.id, doomed); + + await Deno.remove(repo.getPath(workflow.id, doomed.id)); + + const found = await repo.findAllByWorkflowId(workflow.id); + + assertEquals(found.length, 1); + assertEquals(found[0].id, keep.id); + }); + }, +);