diff --git a/src/lib/dif/scan.ts b/src/lib/dif/scan.ts index 06c35d961..8634b8e1c 100644 --- a/src/lib/dif/scan.ts +++ b/src/lib/dif/scan.ts @@ -18,10 +18,11 @@ * `--type macho` are equivalent here, and the feature filters narrow further. */ -import { open, readdir, readFile, realpath, stat } from "node:fs/promises"; -import { join } from "node:path"; +import { open, readFile, stat } from "node:fs/promises"; +import { resolve } from "node:path"; import { ValidationError } from "../errors.js"; import { logger } from "../logger.js"; +import { type WalkEntry, walkFiles } from "../scan/index.js"; import { type DifArchiveInfo, type DifObjectInfo, @@ -243,94 +244,121 @@ export function objectPassesFilters( } /** - * Recursively collect every regular file under a path, with cycle-safe - * symlink-following via visited realpath tracking. + * Recursively scan paths for candidate files. + * + * Each argument may be a file (kept as-is) or a directory. Directories are + * walked recursively by the shared, well-tested {@link walkFiles} walker — + * the same traversal foundation the DSN code-scanner uses. The walker is + * configured to preserve this command's contract, which differs from the + * walker's DSN-tuned defaults: debug files are large binaries that typically + * live in `.gitignore`d build output (`build/`, `target/`, Xcode + * `DerivedData`), so gitignore handling and the default build-output + * skip-dirs are disabled, the 256 KB default size cap is lifted (size gating + * happens later in {@link prepareDifs}, after the cheap format peek, so it can + * accurately drive `oversizedCount`), and the unused `isBinary` classification + * is turned off to avoid an extra head-read per file. Symlinks are followed; + * cycles are detected by the walker via a visited inode set. + * + * Duplicate file paths (e.g. overlapping directory arguments, or a file passed + * both explicitly and reached via a directory walk) are collapsed by absolute + * path so each file is read and parsed at most once. + * + * A path that does not exist (or is unreadable) throws {@link ValidationError}. + * Failures deep inside a directory tree are swallowed by the walker (a scanned + * tree legitimately contains many unreadable/odd entries). + * + * @param paths - Files and/or directories to scan. + * @returns File paths in scan order (absolute for walked entries; as-given for + * explicit file arguments), de-duplicated by absolute path. + * @throws {ValidationError} If an explicit path does not exist or cannot be + * accessed. */ -async function collectFiles( - path: string, - out: string[], - visited: Set -): Promise { - let info: Awaited>; - try { - info = await stat(path); - } catch (err) { - log.debug(`Skipping unreadable path: ${path}`, err); - return; - } - - if (info.isDirectory()) { - let real: string; - try { - real = await realpath(path); - } catch { - log.debug(`Could not resolve real path for directory: ${path}`); - return; - } - if (visited.has(real)) { - log.debug(`Skipping already-visited directory (symlink cycle?): ${path}`); - return; +export async function scanPaths(paths: string[]): Promise { + const files: string[] = []; + const seen = new Set(); + const add = (filePath: string): void => { + const key = resolve(filePath); + if (!seen.has(key)) { + seen.add(key); + files.push(filePath); } - visited.add(real); + }; - let entries: string[]; - try { - entries = await readdir(path); - } catch (err) { - log.debug(`Skipping unreadable directory: ${path}`, err); - return; - } - for (const entry of entries) { - await collectFiles(join(path, entry), out, visited); + for (const path of paths) { + const info = await statExplicitPath(path); + if (info.isFile()) { + add(path); + } else if (info.isDirectory()) { + for await (const entry of walkDebugFileDir(path)) { + add(entry.absolutePath); + } + } else { + // Sockets, FIFOs, devices, etc. are not debug files — skip them. + log.debug(`Skipping non-file, non-directory path: ${path}`); } - return; } - if (info.isFile()) { - out.push(path); - } + return files; } /** - * Recursively scan paths for candidate files. - * - * Each argument may be a file (kept as-is) or a directory (walked - * recursively). Unreadable entries are skipped with a debug log. Symlinks - * are followed; cycle-safe via visited-realpath tracking. A path that does - * not exist at all throws {@link ValidationError} (unlike a walk failure - * inside a tree, which is silently skipped). + * `stat` an explicit, user-supplied path argument, translating filesystem + * errors into a {@link ValidationError}. Unlike entries discovered deep inside + * a directory walk (which are silently skipped), a path the user named + * explicitly must exist and be accessible. * - * @param paths - Files and/or directories to scan. - * @returns Absolute-or-relative file paths in scan order. - * @throws {ValidationError} If an explicit path does not exist or cannot be - * accessed. Walk-internal failures (entries deep inside a tree) are still - * silently skipped. + * @param path - The explicit path argument. + * @returns The path's `Stats`. + * @throws {ValidationError} If the path does not exist or cannot be accessed. */ -export async function scanPaths(paths: string[]): Promise { - const files: string[] = []; - const visited = new Set(); - for (const path of paths) { - try { - await stat(path); - } catch (err) { - const code = (err as NodeJS.ErrnoException).code; - if (code === "ENOENT") { - throw new ValidationError(`Path '${path}' does not exist.`, "path"); - } - if (code === "EACCES" || code === "EPERM") { - throw new ValidationError( - `Path '${path}' is not readable: ${code}.`, - "path" - ); - } +async function statExplicitPath( + path: string +): Promise>> { + try { + return await stat(path); + } catch (err) { + const code = (err as NodeJS.ErrnoException).code; + if (code === "ENOENT") { + throw new ValidationError(`Path '${path}' does not exist.`, "path"); + } + if (code === "EACCES" || code === "EPERM") { throw new ValidationError( - `Cannot access path '${path}': ${code ?? "unknown error"}.`, + `Path '${path}' is not readable: ${code}.`, "path" ); } - await collectFiles(path, files, visited); + throw new ValidationError( + `Cannot access path '${path}': ${code ?? "unknown error"}.`, + "path" + ); } - return files; +} + +/** + * Walk a directory for every regular file, using the shared {@link walkFiles} + * walker configured for debug-file scanning. + * + * The options intentionally diverge from the walker's DSN-tuned defaults: + * debug files are large binaries that typically live in `.gitignore`d build + * output (`build/`, `target/`, Xcode `DerivedData`), so gitignore handling and + * the default build-output skip-dirs are disabled, the 256 KB default size cap + * is lifted (size gating happens later in {@link prepareDifs}, after the cheap + * format peek, so it can accurately drive `oversizedCount`), and the unused + * `isBinary` classification is turned off to avoid an extra head-read per file. + * Symlinks are followed; the walker detects cycles via a visited inode set. + * + * @param dir - Absolute or relative directory path to walk. + * @returns An async iterable of every file found beneath `dir`. + */ +function walkDebugFileDir(dir: string): AsyncIterable { + return walkFiles({ + cwd: resolve(dir), + respectGitignore: false, + alwaysSkipDirs: [], + maxFileSize: Number.POSITIVE_INFINITY, + followSymlinks: true, + classifyBinary: false, + }); } const PEEK_HEADER_BYTES = 4096; diff --git a/src/lib/scan/types.ts b/src/lib/scan/types.ts index 1125a4fba..69b03124a 100644 --- a/src/lib/scan/types.ts +++ b/src/lib/scan/types.ts @@ -179,6 +179,20 @@ export type WalkOptions = { * DSN-style cache invalidation callers pass `true`. */ recordMtimes?: boolean; + /** + * Classify each yielded file as binary, populating `WalkEntry.isBinary`. + * Default: `true`. + * + * When `true` (and no `extensions` allowlist is set), files with an + * unknown extension are sniffed by reading their first 8 KB and looking + * for a NUL byte. Consumers that ignore `isBinary` and walk large + * binary-heavy trees (e.g. the debug-information-file scanner) should + * pass `false` to skip that per-file head-read entirely — every entry is + * then reported with `isBinary: false` without touching file contents. + * Has no effect when `extensions` is set (classification is already + * skipped for allowlisted files). + */ + classifyBinary?: boolean; /** * Observer invoked once per directory the walker enters, with the * directory's absolute path and its floored `stat.mtimeMs`. Fires diff --git a/src/lib/scan/walker.ts b/src/lib/scan/walker.ts index 1da5789eb..679b9fa85 100644 --- a/src/lib/scan/walker.ts +++ b/src/lib/scan/walker.ts @@ -235,6 +235,18 @@ async function* walkFilesImpl(opts: WalkOptions): AsyncGenerator { }; const startedAt = cfg.clock(); const visitedInodes = new Set(); + // Seed the root's inode so a descendant symlink pointing back at the scan + // root is treated as a cycle. The root frame is pushed directly (below) + // rather than through `maybeDescend`, which is the only place inodes are + // otherwise recorded — so without this seed a symlink → root would re-list + // the root's entire subtree under a second path prefix. Only relevant when + // following symlinks; the default walk never re-enters a directory. + if (cfg.followSymlinks) { + const rootKey = await inodeKey(cfg.cwd); + if (rootKey) { + visitedInodes.add(rootKey); + } + } const stack: DirFrame[] = [{ absDir: cfg.cwd, depth: 0 }]; const ctx: WalkContext = { cfg, @@ -742,23 +754,29 @@ async function tryYieldFile( * Classify a file as binary via extension fast-path or 8 KB NUL sniff. * * Skip the sniff for: - * (a) callers that provided an `extensions` allowlist — the file + * (a) callers that opted out via `classifyBinary: false` — they ignore + * `isBinary`, so no read is worth doing; + * (b) callers that provided an `extensions` allowlist — the file * already passed it, so by construction the caller considers * its extension text-bearing; - * (b) known text extensions (`TEXT_EXTENSIONS`); - * (c) empty files. + * (c) known text extensions (`TEXT_EXTENSIONS`); + * (d) empty files. * - * Ordering matters: (a) runs first so we never re-compute - * `path.extname` + `Set.has` for the hot DSN-scan path where - * `processEntry` has already matched the extension. + * Ordering matters: (a)/(b) run first so we never read file contents (or + * even re-compute `path.extname` + `Set.has`) for the hot DSN-scan path + * where `processEntry` has already matched the extension. */ async function classifyFile( absPath: string, size: number, cfg: NormalizedOptions ): Promise { + if (!cfg.classifyBinary) { + // (a) caller ignores `isBinary` — skip all classification work. + return false; + } if (cfg.extensions !== undefined) { - // (a) caller filtered — skip re-classification. + // (b) caller filtered — skip re-classification. return false; } const byExt = classifyByExtension(absPath, TEXT_EXTENSIONS); @@ -911,6 +929,7 @@ type NormalizedOptions = { timeBudgetMs: number; clock: () => number; recordMtimes: boolean; + classifyBinary: boolean; onDirectoryVisit: ((absDir: string, mtimeMs: number) => void) | undefined; }; @@ -939,6 +958,7 @@ function normalizeOptions(opts: WalkOptions): NormalizedOptions { timeBudgetMs: opts.timeBudgetMs ?? Number.POSITIVE_INFINITY, clock: opts.clock ?? (() => performance.now()), recordMtimes: opts.recordMtimes ?? false, + classifyBinary: opts.classifyBinary ?? true, onDirectoryVisit: opts.onDirectoryVisit, }; } diff --git a/test/lib/dif/scan.test.ts b/test/lib/dif/scan.test.ts index 74d032ee1..2626c19a5 100644 --- a/test/lib/dif/scan.test.ts +++ b/test/lib/dif/scan.test.ts @@ -7,9 +7,9 @@ * `prepareDifs` size gate. */ -import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { mkdir, mkdtemp, rm, symlink, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; -import { join } from "node:path"; +import { basename, join } from "node:path"; import { afterEach, beforeEach, describe, expect, test } from "vitest"; import { buildDifFilters, @@ -105,3 +105,71 @@ describe("prepareDifs size gate", () => { expect(oversizedCount).toBe(1); }); }); + +describe("scanPaths traversal", () => { + /** Map a list of file paths to their basenames for order-independent checks. */ + const names = (paths: string[]): string[] => paths.map((p) => basename(p)); + + test("walks a directory recursively and returns nested files", async () => { + await mkdir(join(tempDir, "sub"), { recursive: true }); + await writeFile(join(tempDir, "top.sym"), BREAKPAD_FIXTURE); + await writeFile(join(tempDir, "sub", "nested.sym"), BREAKPAD_FIXTURE); + + const files = await scanPaths([tempDir]); + + expect(names(files).sort()).toEqual(["nested.sym", "top.sym"]); + }); + + test("keeps an explicit file argument as-is", async () => { + const path = join(tempDir, "explicit.sym"); + await writeFile(path, BREAKPAD_FIXTURE); + + const files = await scanPaths([path]); + + expect(files).toEqual([path]); + }); + + test("discovers files under .gitignore'd and node_modules directories", async () => { + // The walker's DSN-tuned defaults (respectGitignore + build-output + // skip-dirs) would hide these; the DIF scanner must disable both because + // debug files routinely live in gitignored build output. + await writeFile(join(tempDir, ".gitignore"), "build/\nnode_modules/\n"); + await mkdir(join(tempDir, "build"), { recursive: true }); + await mkdir(join(tempDir, "node_modules"), { recursive: true }); + await writeFile(join(tempDir, "build", "ignored.sym"), BREAKPAD_FIXTURE); + await writeFile(join(tempDir, "node_modules", "dep.sym"), BREAKPAD_FIXTURE); + + const found = names(await scanPaths([tempDir])); + + expect(found).toContain("ignored.sym"); + expect(found).toContain("dep.sym"); + }); + + test("dedupes files reachable via overlapping arguments", async () => { + await mkdir(join(tempDir, "sub"), { recursive: true }); + await writeFile(join(tempDir, "sub", "dup.sym"), BREAKPAD_FIXTURE); + + // The same file is reachable both via the parent dir walk and the explicit + // nested-dir walk; it must appear exactly once. + const files = await scanPaths([tempDir, join(tempDir, "sub")]); + + expect(names(files).filter((n) => n === "dup.sym")).toHaveLength(1); + }); + + test("is cycle-safe with a directory symlink loop", async () => { + await mkdir(join(tempDir, "a"), { recursive: true }); + await writeFile(join(tempDir, "a", "real.sym"), BREAKPAD_FIXTURE); + // A self-referential symlink would loop forever without cycle detection. + await symlink(join(tempDir, "a"), join(tempDir, "a", "loop")); + + const found = names(await scanPaths([tempDir])); + + expect(found).toContain("real.sym"); + }); + + test("throws ValidationError for a non-existent path", async () => { + await expect(scanPaths([join(tempDir, "does-not-exist")])).rejects.toThrow( + /does not exist/ + ); + }); +}); diff --git a/test/lib/scan/walker.test.ts b/test/lib/scan/walker.test.ts index 09dc70db2..9b93f43ab 100644 --- a/test/lib/scan/walker.test.ts +++ b/test/lib/scan/walker.test.ts @@ -279,6 +279,37 @@ describe("walkFiles — binary detection", () => { cleanup(); } }); + + test("classifyBinary: false yields files without sniffing (isBinary=false)", async () => { + const { cwd, cleanup } = makeSandbox({}); + try { + // A NUL-containing blob that the default sniff classifies as binary. + const bin = new Uint8Array(256); + bin[10] = 0; + writeFileSync(join(cwd, "blob.bin"), bin); + + const findBlob = async ( + opts: Parameters[0] + ): Promise => { + for await (const e of walkFiles(opts)) { + if (e.relativePath === "blob.bin") { + return e; + } + } + return; + }; + + // By default the blob sniffs as binary; with classification disabled the + // same file is still yielded but reported non-binary (the 8 KB head-read + // is skipped entirely). + expect((await findBlob({ cwd }))?.isBinary).toBe(true); + expect((await findBlob({ cwd, classifyBinary: false }))?.isBinary).toBe( + false + ); + } finally { + cleanup(); + } + }); }); describe("walkFiles — mtime recording", () => { @@ -662,6 +693,24 @@ describe("walkFiles — followSymlinks", () => { cleanup(); } }); + + test("followSymlinks: true does not re-list the tree via a symlink to the scan root", async () => { + const { cwd, cleanup } = makeSandbox({ + "top.ts": "a", + "sub/nested.ts": "b", + }); + try { + // A symlink pointing back at the scan root. The root frame is pushed + // directly (not via maybeDescend), so unless its inode is seeded into + // the visited set the walker would re-list the whole tree under + // `sub/root-link/...`. Asserting the exact set guards that seeding. + symlinkSync(cwd, join(cwd, "sub", "root-link")); + const files = await collect({ cwd, followSymlinks: true }); + expect(files.sort()).toEqual(["sub/nested.ts", "top.ts"]); + } finally { + cleanup(); + } + }); }); describe("walkFiles — parallel walker (concurrency > 1)", () => {