Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 101 additions & 8 deletions src/lib/response-cache.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@
* @module
*/

import { createHash } from "node:crypto";
import { createHash, randomUUID } from "node:crypto";
import {
mkdir,
readdir,
readFile,
rename,
rm,
stat,
unlink,
writeFile,
} from "node:fs/promises";
Expand All @@ -30,8 +32,12 @@ import pLimit from "p-limit";
import { getIdentityFingerprint } from "./db/auth.js";
import { getConfigDir } from "./db/index.js";
import { getEnv } from "./env.js";
import { logger } from "./logger.js";
import { recordCacheHit, withCacheSpan } from "./telemetry.js";

/** Tagged logger for diagnostic visibility into best-effort cache operations. */
const log = logger.withTag("response-cache");

// ---------------------------------------------------------------------------
// TTL tiers — used as fallback when the server sends no cache headers
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -582,6 +588,38 @@ export async function storeCachedResponse(
}
}

/**
* Atomically write a cache file by writing to a unique temp file in the same
* directory and renaming it into place.
*
* A plain `writeFile` is not atomic: a concurrent reader (e.g. the probabilistic
* {@link cleanupCache} sweep fired fire-and-forget by an earlier write) can read
* the file mid-write, fail to `JSON.parse` the truncated content, and delete it
* as "corrupted" — silently losing a valid cache entry. `rename` into place is
* atomic on POSIX (same filesystem) and near-atomic on Windows (same volume), so
* a concurrent reader sees either the complete old file or the complete new file
* rather than a half-written one.
*
* Best-effort cleanup of the temp file on failure; the caller treats write
* failures as non-fatal.
*/
async function atomicWriteCacheFile(
finalPath: string,
serialized: string
): Promise<void> {
const tmpPath = `${finalPath}.${process.pid}.${randomUUID()}.tmp`;
try {
await writeFile(tmpPath, serialized, "utf-8");
await rename(tmpPath, finalPath);
} catch (error) {
await unlink(tmpPath).catch((cleanupError) => {
// The temp file may never have been created (e.g. writeFile failed).
log.debug("Failed to clean up cache temp file", cleanupError);
});
throw error;
}
}

/** Inputs for {@link writeResponseToCache}, bundled to stay under useMaxParams. */
type WriteRequest = {
key: string;
Expand Down Expand Up @@ -634,7 +672,7 @@ async function writeResponseToCache(req: WriteRequest): Promise<number> {

const serialized = JSON.stringify(entry);
await mkdir(getCacheDir(), { recursive: true, mode: 0o700 });
await writeFile(cacheFilePath(key), serialized, "utf-8");
await atomicWriteCacheFile(cacheFilePath(key), serialized);

// Probabilistic cleanup to avoid unbounded cache growth
if (Math.random() < CLEANUP_PROBABILITY) {
Expand Down Expand Up @@ -736,6 +774,14 @@ async function cleanupCache(): Promise<void> {
throw error;
}

// Sweep orphaned temp files left by a crash between writeFile and rename in
// {@link atomicWriteCacheFile}. Done regardless of whether any .json files
// exist so leaked temp files can never accumulate unbounded.
const tmpFiles = files.filter((f) => f.endsWith(".tmp"));
if (tmpFiles.length > 0) {
await deleteStaleTempFiles(cacheDir, tmpFiles);
}

const jsonFiles = files.filter((f) => f.endsWith(".json"));
if (jsonFiles.length === 0) {
return;
Expand All @@ -750,6 +796,36 @@ async function cleanupCache(): Promise<void> {
]);
}

/**
* Age (ms) after which an orphaned `.tmp` file is considered abandoned.
*
* A live {@link atomicWriteCacheFile} writes then renames in well under a
* second, so any `.tmp` file older than this was left by a crashed process and
* is safe to remove. The generous threshold avoids racing a concurrent write.
*/
const STALE_TEMP_FILE_MS = 60_000;

/** Delete `.tmp` files older than {@link STALE_TEMP_FILE_MS}. Best-effort. */
async function deleteStaleTempFiles(
cacheDir: string,
tmpFiles: string[]
): Promise<void> {
const cutoff = Date.now() - STALE_TEMP_FILE_MS;
await cacheIO.map(tmpFiles, async (file) => {
const filePath = join(cacheDir, file);
try {
const stats = await stat(filePath);
if (stats.mtimeMs < cutoff) {
await unlink(filePath).catch(() => {
// Already gone — another sweep or the owning process removed it.
});
}
} catch (error) {
log.debug("Failed to inspect cache temp file during sweep", error);
}
});
}

/** Metadata for a cache entry, used for cleanup decisions */
type EntryMetadata = { file: string; createdAt: number; expired: boolean };

Expand All @@ -769,20 +845,37 @@ async function collectEntryMetadata(

await cacheIO.map(jsonFiles, async (file) => {
const filePath = join(cacheDir, file);

// Read and parse are handled separately so we can distinguish a transient
// read failure (skip — never delete) from genuine corruption (parse failed
// on a fully-read file — safe to delete). Atomic writes
// ({@link atomicWriteCacheFile}) guarantee readers never see a half-written
// file, so a parse failure here means real corruption, not a torn read —
// deleting was previously a data-loss bug when writes were non-atomic.
let raw: string;
try {
raw = await readFile(filePath, "utf-8");
} catch (error) {
// Transient read failure (locking, AV scanner, ENOENT from a concurrent
// sweep). Skip — a later sweep will reconsider the file.
log.debug("Skipping cache file with unreadable contents", error);
return;
}

try {
const raw = await readFile(filePath, "utf-8");
const entry = JSON.parse(raw) as CacheEntry;
const expired =
entry.expiresAt !== undefined
? now >= entry.expiresAt
: now - entry.createdAt >
FALLBACK_TTL_MS[classifyUrl(entry.url ?? "")];
entries.push({ file, createdAt: entry.createdAt, expired });
} catch {
// Unparseable file — delete it
unlink(filePath).catch(() => {
// Best-effort cleanup of corrupted file
});
} catch (error) {
// Fully read but unparseable — genuine corruption. Mark expired so
// {@link deleteExpiredEntries} reclaims it; this also keeps eviction
// counts accurate (corrupt files are not invisible to MAX_CACHE_ENTRIES).
log.debug("Reclaiming corrupt cache file during cleanup", error);
entries.push({ file, createdAt: now, expired: true });
}
});

Expand Down
50 changes: 50 additions & 0 deletions test/lib/response-cache.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,56 @@ describe("file structure", () => {
expect(files.length).toBe(1);
expect(files[0]).toMatch(/^[0-9a-f]{64}\.json$/);
});

test("atomic write leaves no temp file behind on success", async () => {
await storeCachedResponse(
TEST_METHOD,
TEST_URL,
{},
mockResponse(TEST_BODY)
);

const cacheDir = join(getConfigDir(), "cache", "responses");
const files = await readdir(cacheDir);
// Exactly the final .json file — the temp file was renamed into place.
expect(files.every((f) => f.endsWith(".json"))).toBe(true);
expect(files.some((f) => f.endsWith(".tmp"))).toBe(false);
});
});

// ---------------------------------------------------------------------------
// Atomic write / torn-read regression (getsentry/cli#1056)
//
// A write fires cleanupCache() fire-and-forget at 10% probability. Before
// atomic writes, a second write to the same key could be read mid-overwrite by
// that cleanup sweep, fail to JSON.parse, and be deleted as "corrupted" —
// losing a valid entry. This loop exercises the exact store→overwrite→read
// sequence many times so the probabilistic cleanup is virtually guaranteed to
// fire; every iteration must still serve the fresh value.
// ---------------------------------------------------------------------------

describe("atomic write regression", () => {
test("repeated overwrite-then-read never loses the entry", async () => {
for (let i = 0; i < 50; i++) {
await storeCachedResponse(
TEST_METHOD,
TEST_URL,
{},
mockResponse({ data: `stale-${i}` })
);
const freshBody = { data: `fresh-${i}` };
await storeCachedResponse(
TEST_METHOD,
TEST_URL,
{},
mockResponse(freshBody)
);

const cached = await getCachedResponse(TEST_METHOD, TEST_URL, {});
expect(cached).toBeDefined();
expect(await cached!.json()).toEqual(freshBody);
}
});
});

// ---------------------------------------------------------------------------
Expand Down
Loading