Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 102 additions & 74 deletions src/lib/dif/scan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@
* `--type macho` are equivalent here, and the feature filters narrow further.
*/

import { open, readdir, readFile, realpath, stat } from "node:fs/promises";
import { join } from "node:path";
import { open, readFile, stat } from "node:fs/promises";
import { resolve } from "node:path";
import { ValidationError } from "../errors.js";
import { logger } from "../logger.js";
import { type WalkEntry, walkFiles } from "../scan/index.js";
import {
type DifArchiveInfo,
type DifObjectInfo,
Expand Down Expand Up @@ -243,94 +244,121 @@ export function objectPassesFilters(
}

/**
* Recursively collect every regular file under a path, with cycle-safe
* symlink-following via visited realpath tracking.
* Recursively scan paths for candidate files.
*
* Each argument may be a file (kept as-is) or a directory. Directories are
* walked recursively by the shared, well-tested {@link walkFiles} walker —
* the same traversal foundation the DSN code-scanner uses. The walker is
* configured to preserve this command's contract, which differs from the
* walker's DSN-tuned defaults: debug files are large binaries that typically
* live in `.gitignore`d build output (`build/`, `target/`, Xcode
* `DerivedData`), so gitignore handling and the default build-output
* skip-dirs are disabled, the 256 KB default size cap is lifted (size gating
* happens later in {@link prepareDifs}, after the cheap format peek, so it can
* accurately drive `oversizedCount`), and the unused `isBinary` classification
* is turned off to avoid an extra head-read per file. Symlinks are followed;
* cycles are detected by the walker via a visited inode set.
*
* Duplicate file paths (e.g. overlapping directory arguments, or a file passed
* both explicitly and reached via a directory walk) are collapsed by absolute
* path so each file is read and parsed at most once.
*
* A path that does not exist (or is unreadable) throws {@link ValidationError}.
* Failures deep inside a directory tree are swallowed by the walker (a scanned
* tree legitimately contains many unreadable/odd entries).
*
* @param paths - Files and/or directories to scan.
* @returns File paths in scan order (absolute for walked entries; as-given for
* explicit file arguments), de-duplicated by absolute path.
* @throws {ValidationError} If an explicit path does not exist or cannot be
* accessed.
*/
async function collectFiles(
path: string,
out: string[],
visited: Set<string>
): Promise<void> {
let info: Awaited<ReturnType<typeof stat>>;
try {
info = await stat(path);
} catch (err) {
log.debug(`Skipping unreadable path: ${path}`, err);
return;
}

if (info.isDirectory()) {
let real: string;
try {
real = await realpath(path);
} catch {
log.debug(`Could not resolve real path for directory: ${path}`);
return;
}
if (visited.has(real)) {
log.debug(`Skipping already-visited directory (symlink cycle?): ${path}`);
return;
export async function scanPaths(paths: string[]): Promise<string[]> {
const files: string[] = [];
const seen = new Set<string>();
const add = (filePath: string): void => {
const key = resolve(filePath);
if (!seen.has(key)) {
seen.add(key);
files.push(filePath);
}
visited.add(real);
};

let entries: string[];
try {
entries = await readdir(path);
} catch (err) {
log.debug(`Skipping unreadable directory: ${path}`, err);
return;
}
for (const entry of entries) {
await collectFiles(join(path, entry), out, visited);
for (const path of paths) {
const info = await statExplicitPath(path);
if (info.isFile()) {
add(path);
} else if (info.isDirectory()) {
for await (const entry of walkDebugFileDir(path)) {
add(entry.absolutePath);
}
} else {
// Sockets, FIFOs, devices, etc. are not debug files — skip them.
log.debug(`Skipping non-file, non-directory path: ${path}`);
}
return;
}

if (info.isFile()) {
out.push(path);
}
return files;
}

/**
* Recursively scan paths for candidate files.
*
* Each argument may be a file (kept as-is) or a directory (walked
* recursively). Unreadable entries are skipped with a debug log. Symlinks
* are followed; cycle-safe via visited-realpath tracking. A path that does
* not exist at all throws {@link ValidationError} (unlike a walk failure
* inside a tree, which is silently skipped).
* `stat` an explicit, user-supplied path argument, translating filesystem
* errors into a {@link ValidationError}. Unlike entries discovered deep inside
* a directory walk (which are silently skipped), a path the user named
* explicitly must exist and be accessible.
*
* @param paths - Files and/or directories to scan.
* @returns Absolute-or-relative file paths in scan order.
* @throws {ValidationError} If an explicit path does not exist or cannot be
* accessed. Walk-internal failures (entries deep inside a tree) are still
* silently skipped.
* @param path - The explicit path argument.
* @returns The path's `Stats`.
* @throws {ValidationError} If the path does not exist or cannot be accessed.
*/
export async function scanPaths(paths: string[]): Promise<string[]> {
const files: string[] = [];
const visited = new Set<string>();
for (const path of paths) {
try {
await stat(path);
} catch (err) {
const code = (err as NodeJS.ErrnoException).code;
if (code === "ENOENT") {
throw new ValidationError(`Path '${path}' does not exist.`, "path");
}
if (code === "EACCES" || code === "EPERM") {
throw new ValidationError(
`Path '${path}' is not readable: ${code}.`,
"path"
);
}
async function statExplicitPath(
path: string
): Promise<Awaited<ReturnType<typeof stat>>> {
try {
return await stat(path);
} catch (err) {
const code = (err as NodeJS.ErrnoException).code;
if (code === "ENOENT") {
throw new ValidationError(`Path '${path}' does not exist.`, "path");
}
if (code === "EACCES" || code === "EPERM") {
throw new ValidationError(
`Cannot access path '${path}': ${code ?? "unknown error"}.`,
`Path '${path}' is not readable: ${code}.`,
"path"
);
}
await collectFiles(path, files, visited);
throw new ValidationError(
`Cannot access path '${path}': ${code ?? "unknown error"}.`,
"path"
);
}
return files;
}

/**
* Walk a directory for every regular file, using the shared {@link walkFiles}
* walker configured for debug-file scanning.
*
* The options intentionally diverge from the walker's DSN-tuned defaults:
* debug files are large binaries that typically live in `.gitignore`d build
* output (`build/`, `target/`, Xcode `DerivedData`), so gitignore handling and
* the default build-output skip-dirs are disabled, the 256 KB default size cap
* is lifted (size gating happens later in {@link prepareDifs}, after the cheap
* format peek, so it can accurately drive `oversizedCount`), and the unused
* `isBinary` classification is turned off to avoid an extra head-read per file.
* Symlinks are followed; the walker detects cycles via a visited inode set.
*
* @param dir - Absolute or relative directory path to walk.
* @returns An async iterable of every file found beneath `dir`.
*/
function walkDebugFileDir(dir: string): AsyncIterable<WalkEntry> {
return walkFiles({
cwd: resolve(dir),
respectGitignore: false,
alwaysSkipDirs: [],
maxFileSize: Number.POSITIVE_INFINITY,
followSymlinks: true,
classifyBinary: false,
});
Comment thread
cursor[bot] marked this conversation as resolved.
}

const PEEK_HEADER_BYTES = 4096;
Expand Down
14 changes: 14 additions & 0 deletions src/lib/scan/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,20 @@ export type WalkOptions = {
* DSN-style cache invalidation callers pass `true`.
*/
recordMtimes?: boolean;
/**
* Classify each yielded file as binary, populating `WalkEntry.isBinary`.
* Default: `true`.
*
* When `true` (and no `extensions` allowlist is set), files with an
* unknown extension are sniffed by reading their first 8 KB and looking
* for a NUL byte. Consumers that ignore `isBinary` and walk large
* binary-heavy trees (e.g. the debug-information-file scanner) should
* pass `false` to skip that per-file head-read entirely — every entry is
* then reported with `isBinary: false` without touching file contents.
* Has no effect when `extensions` is set (classification is already
* skipped for allowlisted files).
*/
classifyBinary?: boolean;
/**
* Observer invoked once per directory the walker enters, with the
* directory's absolute path and its floored `stat.mtimeMs`. Fires
Expand Down
34 changes: 27 additions & 7 deletions src/lib/scan/walker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,18 @@ async function* walkFilesImpl(opts: WalkOptions): AsyncGenerator<WalkEntry> {
};
const startedAt = cfg.clock();
const visitedInodes = new Set<string>();
// Seed the root's inode so a descendant symlink pointing back at the scan
// root is treated as a cycle. The root frame is pushed directly (below)
// rather than through `maybeDescend`, which is the only place inodes are
// otherwise recorded — so without this seed a symlink → root would re-list
// the root's entire subtree under a second path prefix. Only relevant when
// following symlinks; the default walk never re-enters a directory.
if (cfg.followSymlinks) {
const rootKey = await inodeKey(cfg.cwd);
if (rootKey) {
visitedInodes.add(rootKey);
}
}
const stack: DirFrame[] = [{ absDir: cfg.cwd, depth: 0 }];
const ctx: WalkContext = {
cfg,
Expand Down Expand Up @@ -742,23 +754,29 @@ async function tryYieldFile(
* Classify a file as binary via extension fast-path or 8 KB NUL sniff.
*
* Skip the sniff for:
* (a) callers that provided an `extensions` allowlist — the file
* (a) callers that opted out via `classifyBinary: false` — they ignore
* `isBinary`, so no read is worth doing;
* (b) callers that provided an `extensions` allowlist — the file
* already passed it, so by construction the caller considers
* its extension text-bearing;
* (b) known text extensions (`TEXT_EXTENSIONS`);
* (c) empty files.
* (c) known text extensions (`TEXT_EXTENSIONS`);
* (d) empty files.
*
* Ordering matters: (a) runs first so we never re-compute
* `path.extname` + `Set.has` for the hot DSN-scan path where
* `processEntry` has already matched the extension.
* Ordering matters: (a)/(b) run first so we never read file contents (or
* even re-compute `path.extname` + `Set.has`) for the hot DSN-scan path
* where `processEntry` has already matched the extension.
*/
async function classifyFile(
absPath: string,
size: number,
cfg: NormalizedOptions
): Promise<boolean> {
if (!cfg.classifyBinary) {
// (a) caller ignores `isBinary` — skip all classification work.
return false;
}
if (cfg.extensions !== undefined) {
// (a) caller filtered — skip re-classification.
// (b) caller filtered — skip re-classification.
return false;
}
const byExt = classifyByExtension(absPath, TEXT_EXTENSIONS);
Expand Down Expand Up @@ -911,6 +929,7 @@ type NormalizedOptions = {
timeBudgetMs: number;
clock: () => number;
recordMtimes: boolean;
classifyBinary: boolean;
onDirectoryVisit: ((absDir: string, mtimeMs: number) => void) | undefined;
};

Expand Down Expand Up @@ -939,6 +958,7 @@ function normalizeOptions(opts: WalkOptions): NormalizedOptions {
timeBudgetMs: opts.timeBudgetMs ?? Number.POSITIVE_INFINITY,
clock: opts.clock ?? (() => performance.now()),
recordMtimes: opts.recordMtimes ?? false,
classifyBinary: opts.classifyBinary ?? true,
onDirectoryVisit: opts.onDirectoryVisit,
};
}
Expand Down
72 changes: 70 additions & 2 deletions test/lib/dif/scan.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
* `prepareDifs` size gate.
*/

import { mkdtemp, rm, writeFile } from "node:fs/promises";
import { mkdir, mkdtemp, rm, symlink, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { basename, join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import {
buildDifFilters,
Expand Down Expand Up @@ -105,3 +105,71 @@ describe("prepareDifs size gate", () => {
expect(oversizedCount).toBe(1);
});
});

describe("scanPaths traversal", () => {
/** Map a list of file paths to their basenames for order-independent checks. */
const names = (paths: string[]): string[] => paths.map((p) => basename(p));

test("walks a directory recursively and returns nested files", async () => {
await mkdir(join(tempDir, "sub"), { recursive: true });
await writeFile(join(tempDir, "top.sym"), BREAKPAD_FIXTURE);
await writeFile(join(tempDir, "sub", "nested.sym"), BREAKPAD_FIXTURE);

const files = await scanPaths([tempDir]);

expect(names(files).sort()).toEqual(["nested.sym", "top.sym"]);
});

test("keeps an explicit file argument as-is", async () => {
const path = join(tempDir, "explicit.sym");
await writeFile(path, BREAKPAD_FIXTURE);

const files = await scanPaths([path]);

expect(files).toEqual([path]);
});

test("discovers files under .gitignore'd and node_modules directories", async () => {
// The walker's DSN-tuned defaults (respectGitignore + build-output
// skip-dirs) would hide these; the DIF scanner must disable both because
// debug files routinely live in gitignored build output.
await writeFile(join(tempDir, ".gitignore"), "build/\nnode_modules/\n");
await mkdir(join(tempDir, "build"), { recursive: true });
await mkdir(join(tempDir, "node_modules"), { recursive: true });
await writeFile(join(tempDir, "build", "ignored.sym"), BREAKPAD_FIXTURE);
await writeFile(join(tempDir, "node_modules", "dep.sym"), BREAKPAD_FIXTURE);

const found = names(await scanPaths([tempDir]));

expect(found).toContain("ignored.sym");
expect(found).toContain("dep.sym");
});

test("dedupes files reachable via overlapping arguments", async () => {
await mkdir(join(tempDir, "sub"), { recursive: true });
await writeFile(join(tempDir, "sub", "dup.sym"), BREAKPAD_FIXTURE);

// The same file is reachable both via the parent dir walk and the explicit
// nested-dir walk; it must appear exactly once.
const files = await scanPaths([tempDir, join(tempDir, "sub")]);

expect(names(files).filter((n) => n === "dup.sym")).toHaveLength(1);
});

test("is cycle-safe with a directory symlink loop", async () => {
await mkdir(join(tempDir, "a"), { recursive: true });
await writeFile(join(tempDir, "a", "real.sym"), BREAKPAD_FIXTURE);
// A self-referential symlink would loop forever without cycle detection.
await symlink(join(tempDir, "a"), join(tempDir, "a", "loop"));

const found = names(await scanPaths([tempDir]));

expect(found).toContain("real.sym");
});

test("throws ValidationError for a non-existent path", async () => {
await expect(scanPaths([join(tempDir, "does-not-exist")])).rejects.toThrow(
/does not exist/
);
});
});
Loading
Loading