From 5d0e5bcbec37e433052768d01feaca612cf4f57f Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Wed, 3 Jun 2026 13:52:55 +0000 Subject: [PATCH 1/2] fix(cache): atomic writes to prevent torn-read deletion of valid entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The response cache could silently delete a valid entry on a torn read. Each cache write fires cleanupCache() fire-and-forget at 10% probability; a second write to the same key overwrote the file with a non-atomic writeFile. The first write's async cleanup could then read the file mid-overwrite, fail to JSON.parse the truncated content, and delete it as "corrupted" in collectEntryMetadata — losing a valid cache entry. This surfaced as a ~10% flaky unit test (response-cache.test.ts "--fresh round-trip"), which is why it passed on PR #1051's CI run but failed on the post-merge main run. The bug is pre-existing (cleanup code predates #1051). Fixes: - atomicWriteCacheFile(): write to a unique temp file then rename into place. rename within the same directory is atomic on POSIX and Windows, so a concurrent reader always sees a complete old or new file, never a torn one. - collectEntryMetadata(): skip files on transient parse/read failure instead of deleting them. Genuinely corrupt entries already self-heal on the read path (getCachedResponse) or are overwritten on the next write. Verified: amplified repro 300/300 pass (was ~10% failing); actual test file 20/20 runs green; full unit suite 7421 passed / 13 skipped / 0 failed. --- src/lib/response-cache.ts | 46 ++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/lib/response-cache.ts b/src/lib/response-cache.ts index c1b57a3a2..2eed7bd91 100644 --- a/src/lib/response-cache.ts +++ b/src/lib/response-cache.ts @@ -14,11 +14,12 @@ * @module */ -import { createHash } from "node:crypto"; +import { createHash, randomUUID } from "node:crypto"; import { mkdir, readdir, readFile, + rename, rm, unlink, writeFile, @@ -582,6 +583,36 @@ export async function storeCachedResponse( } } +/** + * Atomically write a cache file by writing to a unique temp file in the same + * directory and renaming it into place. + * + * A plain `writeFile` is not atomic: a concurrent reader (e.g. the probabilistic + * {@link cleanupCache} sweep fired fire-and-forget by an earlier write) can read + * the file mid-write, fail to `JSON.parse` the truncated content, and delete it + * as "corrupted" — silently losing a valid cache entry. `rename` within the same + * directory is atomic on POSIX and Windows, so a concurrent reader always sees + * either the complete old file or the complete new file, never a torn one. + * + * Best-effort cleanup of the temp file on failure; the caller treats write + * failures as non-fatal. + */ +async function atomicWriteCacheFile( + finalPath: string, + serialized: string +): Promise { + const tmpPath = `${finalPath}.${process.pid}.${randomUUID()}.tmp`; + try { + await writeFile(tmpPath, serialized, "utf-8"); + await rename(tmpPath, finalPath); + } catch (error) { + await unlink(tmpPath).catch(() => { + // Best-effort cleanup — the temp file may not have been created. + }); + throw error; + } +} + /** Inputs for {@link writeResponseToCache}, bundled to stay under useMaxParams. */ type WriteRequest = { key: string; @@ -634,7 +665,7 @@ async function writeResponseToCache(req: WriteRequest): Promise { const serialized = JSON.stringify(entry); await mkdir(getCacheDir(), { recursive: true, mode: 0o700 }); - await writeFile(cacheFilePath(key), serialized, "utf-8"); + await atomicWriteCacheFile(cacheFilePath(key), serialized); // Probabilistic cleanup to avoid unbounded cache growth if (Math.random() < CLEANUP_PROBABILITY) { @@ -779,10 +810,13 @@ async function collectEntryMetadata( FALLBACK_TTL_MS[classifyUrl(entry.url ?? "")]; entries.push({ file, createdAt: entry.createdAt, expired }); } catch { - // Unparseable file — delete it - unlink(filePath).catch(() => { - // Best-effort cleanup of corrupted file - }); + // Parse/read failure during a best-effort cleanup sweep. Do NOT delete: + // a concurrent write may be in flight and the read could be transient + // (despite atomic writes, locking/AV scanners can still cause a read to + // fail momentarily). Skip the file — a genuinely corrupt entry self-heals + // on the read path ({@link getCachedResponse}) or is overwritten on the + // next write to the same key. Deleting here was the cause of a torn-read + // data-loss bug (see {@link atomicWriteCacheFile}). } }); From 254e93a3d92abb4a1ddf3e578020f2b77586ac8c Mon Sep 17 00:00:00 2001 From: Burak Yigit Kaya Date: Wed, 3 Jun 2026 14:15:25 +0000 Subject: [PATCH 2/2] review: log suppressed cache errors, reclaim corrupt files, sweep stale temps Addresses adversarial self-review findings on the torn-read fix: - Add log.debug() to the previously-silent catch blocks in atomicWriteCacheFile and collectEntryMetadata (AGENTS.md prohibits silent catches in src/). - collectEntryMetadata now splits read vs parse failures: a transient read failure is skipped (never deleted), while a fully-read-but-unparseable file is genuine corruption and is marked expired so deleteExpiredEntries reclaims it. This keeps corrupt files visible to MAX_CACHE_ENTRIES eviction (they no longer escape the count) and is safe now that atomic writes preclude torn reads. - cleanupCache sweeps orphaned .tmp files older than 60s, so a crash between writeFile and rename can't leak temp files unbounded. - Soften the Windows atomicity claim in the JSDoc (near-atomic, same volume). - Add tests: atomic write leaves no temp file behind; repeated overwrite-read never loses the entry (torn-read regression). --- src/lib/response-cache.ts | 87 +++++++++++++++++++++++++++------ test/lib/response-cache.test.ts | 50 +++++++++++++++++++ 2 files changed, 123 insertions(+), 14 deletions(-) diff --git a/src/lib/response-cache.ts b/src/lib/response-cache.ts index 2eed7bd91..eaa025840 100644 --- a/src/lib/response-cache.ts +++ b/src/lib/response-cache.ts @@ -21,6 +21,7 @@ import { readFile, rename, rm, + stat, unlink, writeFile, } from "node:fs/promises"; @@ -31,8 +32,12 @@ import pLimit from "p-limit"; import { getIdentityFingerprint } from "./db/auth.js"; import { getConfigDir } from "./db/index.js"; import { getEnv } from "./env.js"; +import { logger } from "./logger.js"; import { recordCacheHit, withCacheSpan } from "./telemetry.js"; +/** Tagged logger for diagnostic visibility into best-effort cache operations. */ +const log = logger.withTag("response-cache"); + // --------------------------------------------------------------------------- // TTL tiers — used as fallback when the server sends no cache headers // --------------------------------------------------------------------------- @@ -590,9 +595,10 @@ export async function storeCachedResponse( * A plain `writeFile` is not atomic: a concurrent reader (e.g. the probabilistic * {@link cleanupCache} sweep fired fire-and-forget by an earlier write) can read * the file mid-write, fail to `JSON.parse` the truncated content, and delete it - * as "corrupted" — silently losing a valid cache entry. `rename` within the same - * directory is atomic on POSIX and Windows, so a concurrent reader always sees - * either the complete old file or the complete new file, never a torn one. + * as "corrupted" — silently losing a valid cache entry. `rename` into place is + * atomic on POSIX (same filesystem) and near-atomic on Windows (same volume), so + * a concurrent reader sees either the complete old file or the complete new file + * rather than a half-written one. * * Best-effort cleanup of the temp file on failure; the caller treats write * failures as non-fatal. @@ -606,8 +612,9 @@ async function atomicWriteCacheFile( await writeFile(tmpPath, serialized, "utf-8"); await rename(tmpPath, finalPath); } catch (error) { - await unlink(tmpPath).catch(() => { - // Best-effort cleanup — the temp file may not have been created. + await unlink(tmpPath).catch((cleanupError) => { + // The temp file may never have been created (e.g. writeFile failed). + log.debug("Failed to clean up cache temp file", cleanupError); }); throw error; } @@ -767,6 +774,14 @@ async function cleanupCache(): Promise { throw error; } + // Sweep orphaned temp files left by a crash between writeFile and rename in + // {@link atomicWriteCacheFile}. Done regardless of whether any .json files + // exist so leaked temp files can never accumulate unbounded. + const tmpFiles = files.filter((f) => f.endsWith(".tmp")); + if (tmpFiles.length > 0) { + await deleteStaleTempFiles(cacheDir, tmpFiles); + } + const jsonFiles = files.filter((f) => f.endsWith(".json")); if (jsonFiles.length === 0) { return; @@ -781,6 +796,36 @@ async function cleanupCache(): Promise { ]); } +/** + * Age (ms) after which an orphaned `.tmp` file is considered abandoned. + * + * A live {@link atomicWriteCacheFile} writes then renames in well under a + * second, so any `.tmp` file older than this was left by a crashed process and + * is safe to remove. The generous threshold avoids racing a concurrent write. + */ +const STALE_TEMP_FILE_MS = 60_000; + +/** Delete `.tmp` files older than {@link STALE_TEMP_FILE_MS}. Best-effort. */ +async function deleteStaleTempFiles( + cacheDir: string, + tmpFiles: string[] +): Promise { + const cutoff = Date.now() - STALE_TEMP_FILE_MS; + await cacheIO.map(tmpFiles, async (file) => { + const filePath = join(cacheDir, file); + try { + const stats = await stat(filePath); + if (stats.mtimeMs < cutoff) { + await unlink(filePath).catch(() => { + // Already gone — another sweep or the owning process removed it. + }); + } + } catch (error) { + log.debug("Failed to inspect cache temp file during sweep", error); + } + }); +} + /** Metadata for a cache entry, used for cleanup decisions */ type EntryMetadata = { file: string; createdAt: number; expired: boolean }; @@ -800,8 +845,24 @@ async function collectEntryMetadata( await cacheIO.map(jsonFiles, async (file) => { const filePath = join(cacheDir, file); + + // Read and parse are handled separately so we can distinguish a transient + // read failure (skip — never delete) from genuine corruption (parse failed + // on a fully-read file — safe to delete). Atomic writes + // ({@link atomicWriteCacheFile}) guarantee readers never see a half-written + // file, so a parse failure here means real corruption, not a torn read — + // deleting was previously a data-loss bug when writes were non-atomic. + let raw: string; + try { + raw = await readFile(filePath, "utf-8"); + } catch (error) { + // Transient read failure (locking, AV scanner, ENOENT from a concurrent + // sweep). Skip — a later sweep will reconsider the file. + log.debug("Skipping cache file with unreadable contents", error); + return; + } + try { - const raw = await readFile(filePath, "utf-8"); const entry = JSON.parse(raw) as CacheEntry; const expired = entry.expiresAt !== undefined @@ -809,14 +870,12 @@ async function collectEntryMetadata( : now - entry.createdAt > FALLBACK_TTL_MS[classifyUrl(entry.url ?? "")]; entries.push({ file, createdAt: entry.createdAt, expired }); - } catch { - // Parse/read failure during a best-effort cleanup sweep. Do NOT delete: - // a concurrent write may be in flight and the read could be transient - // (despite atomic writes, locking/AV scanners can still cause a read to - // fail momentarily). Skip the file — a genuinely corrupt entry self-heals - // on the read path ({@link getCachedResponse}) or is overwritten on the - // next write to the same key. Deleting here was the cause of a torn-read - // data-loss bug (see {@link atomicWriteCacheFile}). + } catch (error) { + // Fully read but unparseable — genuine corruption. Mark expired so + // {@link deleteExpiredEntries} reclaims it; this also keeps eviction + // counts accurate (corrupt files are not invisible to MAX_CACHE_ENTRIES). + log.debug("Reclaiming corrupt cache file during cleanup", error); + entries.push({ file, createdAt: now, expired: true }); } }); diff --git a/test/lib/response-cache.test.ts b/test/lib/response-cache.test.ts index dfc1f84bc..26f347a1c 100644 --- a/test/lib/response-cache.test.ts +++ b/test/lib/response-cache.test.ts @@ -470,6 +470,56 @@ describe("file structure", () => { expect(files.length).toBe(1); expect(files[0]).toMatch(/^[0-9a-f]{64}\.json$/); }); + + test("atomic write leaves no temp file behind on success", async () => { + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse(TEST_BODY) + ); + + const cacheDir = join(getConfigDir(), "cache", "responses"); + const files = await readdir(cacheDir); + // Exactly the final .json file — the temp file was renamed into place. + expect(files.every((f) => f.endsWith(".json"))).toBe(true); + expect(files.some((f) => f.endsWith(".tmp"))).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Atomic write / torn-read regression (getsentry/cli#1056) +// +// A write fires cleanupCache() fire-and-forget at 10% probability. Before +// atomic writes, a second write to the same key could be read mid-overwrite by +// that cleanup sweep, fail to JSON.parse, and be deleted as "corrupted" — +// losing a valid entry. This loop exercises the exact store→overwrite→read +// sequence many times so the probabilistic cleanup is virtually guaranteed to +// fire; every iteration must still serve the fresh value. +// --------------------------------------------------------------------------- + +describe("atomic write regression", () => { + test("repeated overwrite-then-read never loses the entry", async () => { + for (let i = 0; i < 50; i++) { + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse({ data: `stale-${i}` }) + ); + const freshBody = { data: `fresh-${i}` }; + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse(freshBody) + ); + + const cached = await getCachedResponse(TEST_METHOD, TEST_URL, {}); + expect(cached).toBeDefined(); + expect(await cached!.json()).toEqual(freshBody); + } + }); }); // ---------------------------------------------------------------------------