diff --git a/src/lib/response-cache.ts b/src/lib/response-cache.ts index c1b57a3a2..eaa025840 100644 --- a/src/lib/response-cache.ts +++ b/src/lib/response-cache.ts @@ -14,12 +14,14 @@ * @module */ -import { createHash } from "node:crypto"; +import { createHash, randomUUID } from "node:crypto"; import { mkdir, readdir, readFile, + rename, rm, + stat, unlink, writeFile, } from "node:fs/promises"; @@ -30,8 +32,12 @@ import pLimit from "p-limit"; import { getIdentityFingerprint } from "./db/auth.js"; import { getConfigDir } from "./db/index.js"; import { getEnv } from "./env.js"; +import { logger } from "./logger.js"; import { recordCacheHit, withCacheSpan } from "./telemetry.js"; +/** Tagged logger for diagnostic visibility into best-effort cache operations. */ +const log = logger.withTag("response-cache"); + // --------------------------------------------------------------------------- // TTL tiers — used as fallback when the server sends no cache headers // --------------------------------------------------------------------------- @@ -582,6 +588,38 @@ export async function storeCachedResponse( } } +/** + * Atomically write a cache file by writing to a unique temp file in the same + * directory and renaming it into place. + * + * A plain `writeFile` is not atomic: a concurrent reader (e.g. the probabilistic + * {@link cleanupCache} sweep fired fire-and-forget by an earlier write) can read + * the file mid-write, fail to `JSON.parse` the truncated content, and delete it + * as "corrupted" — silently losing a valid cache entry. `rename` into place is + * atomic on POSIX (same filesystem) and near-atomic on Windows (same volume), so + * a concurrent reader sees either the complete old file or the complete new file + * rather than a half-written one. + * + * Best-effort cleanup of the temp file on failure; the caller treats write + * failures as non-fatal. + */ +async function atomicWriteCacheFile( + finalPath: string, + serialized: string +): Promise { + const tmpPath = `${finalPath}.${process.pid}.${randomUUID()}.tmp`; + try { + await writeFile(tmpPath, serialized, "utf-8"); + await rename(tmpPath, finalPath); + } catch (error) { + await unlink(tmpPath).catch((cleanupError) => { + // The temp file may never have been created (e.g. writeFile failed). + log.debug("Failed to clean up cache temp file", cleanupError); + }); + throw error; + } +} + /** Inputs for {@link writeResponseToCache}, bundled to stay under useMaxParams. */ type WriteRequest = { key: string; @@ -634,7 +672,7 @@ async function writeResponseToCache(req: WriteRequest): Promise { const serialized = JSON.stringify(entry); await mkdir(getCacheDir(), { recursive: true, mode: 0o700 }); - await writeFile(cacheFilePath(key), serialized, "utf-8"); + await atomicWriteCacheFile(cacheFilePath(key), serialized); // Probabilistic cleanup to avoid unbounded cache growth if (Math.random() < CLEANUP_PROBABILITY) { @@ -736,6 +774,14 @@ async function cleanupCache(): Promise { throw error; } + // Sweep orphaned temp files left by a crash between writeFile and rename in + // {@link atomicWriteCacheFile}. Done regardless of whether any .json files + // exist so leaked temp files can never accumulate unbounded. + const tmpFiles = files.filter((f) => f.endsWith(".tmp")); + if (tmpFiles.length > 0) { + await deleteStaleTempFiles(cacheDir, tmpFiles); + } + const jsonFiles = files.filter((f) => f.endsWith(".json")); if (jsonFiles.length === 0) { return; @@ -750,6 +796,36 @@ async function cleanupCache(): Promise { ]); } +/** + * Age (ms) after which an orphaned `.tmp` file is considered abandoned. + * + * A live {@link atomicWriteCacheFile} writes then renames in well under a + * second, so any `.tmp` file older than this was left by a crashed process and + * is safe to remove. The generous threshold avoids racing a concurrent write. + */ +const STALE_TEMP_FILE_MS = 60_000; + +/** Delete `.tmp` files older than {@link STALE_TEMP_FILE_MS}. Best-effort. */ +async function deleteStaleTempFiles( + cacheDir: string, + tmpFiles: string[] +): Promise { + const cutoff = Date.now() - STALE_TEMP_FILE_MS; + await cacheIO.map(tmpFiles, async (file) => { + const filePath = join(cacheDir, file); + try { + const stats = await stat(filePath); + if (stats.mtimeMs < cutoff) { + await unlink(filePath).catch(() => { + // Already gone — another sweep or the owning process removed it. + }); + } + } catch (error) { + log.debug("Failed to inspect cache temp file during sweep", error); + } + }); +} + /** Metadata for a cache entry, used for cleanup decisions */ type EntryMetadata = { file: string; createdAt: number; expired: boolean }; @@ -769,8 +845,24 @@ async function collectEntryMetadata( await cacheIO.map(jsonFiles, async (file) => { const filePath = join(cacheDir, file); + + // Read and parse are handled separately so we can distinguish a transient + // read failure (skip — never delete) from genuine corruption (parse failed + // on a fully-read file — safe to delete). Atomic writes + // ({@link atomicWriteCacheFile}) guarantee readers never see a half-written + // file, so a parse failure here means real corruption, not a torn read — + // deleting was previously a data-loss bug when writes were non-atomic. + let raw: string; + try { + raw = await readFile(filePath, "utf-8"); + } catch (error) { + // Transient read failure (locking, AV scanner, ENOENT from a concurrent + // sweep). Skip — a later sweep will reconsider the file. + log.debug("Skipping cache file with unreadable contents", error); + return; + } + try { - const raw = await readFile(filePath, "utf-8"); const entry = JSON.parse(raw) as CacheEntry; const expired = entry.expiresAt !== undefined @@ -778,11 +870,12 @@ async function collectEntryMetadata( : now - entry.createdAt > FALLBACK_TTL_MS[classifyUrl(entry.url ?? "")]; entries.push({ file, createdAt: entry.createdAt, expired }); - } catch { - // Unparseable file — delete it - unlink(filePath).catch(() => { - // Best-effort cleanup of corrupted file - }); + } catch (error) { + // Fully read but unparseable — genuine corruption. Mark expired so + // {@link deleteExpiredEntries} reclaims it; this also keeps eviction + // counts accurate (corrupt files are not invisible to MAX_CACHE_ENTRIES). + log.debug("Reclaiming corrupt cache file during cleanup", error); + entries.push({ file, createdAt: now, expired: true }); } }); diff --git a/test/lib/response-cache.test.ts b/test/lib/response-cache.test.ts index dfc1f84bc..26f347a1c 100644 --- a/test/lib/response-cache.test.ts +++ b/test/lib/response-cache.test.ts @@ -470,6 +470,56 @@ describe("file structure", () => { expect(files.length).toBe(1); expect(files[0]).toMatch(/^[0-9a-f]{64}\.json$/); }); + + test("atomic write leaves no temp file behind on success", async () => { + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse(TEST_BODY) + ); + + const cacheDir = join(getConfigDir(), "cache", "responses"); + const files = await readdir(cacheDir); + // Exactly the final .json file — the temp file was renamed into place. + expect(files.every((f) => f.endsWith(".json"))).toBe(true); + expect(files.some((f) => f.endsWith(".tmp"))).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Atomic write / torn-read regression (getsentry/cli#1056) +// +// A write fires cleanupCache() fire-and-forget at 10% probability. Before +// atomic writes, a second write to the same key could be read mid-overwrite by +// that cleanup sweep, fail to JSON.parse, and be deleted as "corrupted" — +// losing a valid entry. This loop exercises the exact store→overwrite→read +// sequence many times so the probabilistic cleanup is virtually guaranteed to +// fire; every iteration must still serve the fresh value. +// --------------------------------------------------------------------------- + +describe("atomic write regression", () => { + test("repeated overwrite-then-read never loses the entry", async () => { + for (let i = 0; i < 50; i++) { + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse({ data: `stale-${i}` }) + ); + const freshBody = { data: `fresh-${i}` }; + await storeCachedResponse( + TEST_METHOD, + TEST_URL, + {}, + mockResponse(freshBody) + ); + + const cached = await getCachedResponse(TEST_METHOD, TEST_URL, {}); + expect(cached).toBeDefined(); + expect(await cached!.json()).toEqual(freshBody); + } + }); }); // ---------------------------------------------------------------------------