From 44a9933d20884d7ba4dad9164601d3228681e5ab Mon Sep 17 00:00:00 2001 From: kipeum86 Date: Fri, 17 Apr 2026 00:42:19 +0900 Subject: [PATCH 1/6] fix(detection): recover original bytes in heuristic candidates --- .../_framework/recover-bytes.test.ts | 55 +++++++++++++++++++ src/detection/_framework/recover-bytes.ts | 10 ++++ src/detection/_framework/runner.ts | 15 ++++- src/detection/_framework/types.ts | 6 ++ .../heuristics/capitalization-cluster.test.ts | 28 +++++++++- .../heuristics/capitalization-cluster.ts | 15 ++++- .../heuristics/email-domain-inference.test.ts | 53 ++++++++++++++---- .../heuristics/email-domain-inference.ts | 21 ++++++- .../rules/heuristics/quoted-term.test.ts | 27 ++++++++- src/detection/rules/heuristics/quoted-term.ts | 14 ++++- .../rules/heuristics/repeatability.test.ts | 36 +++++++++++- .../rules/heuristics/repeatability.ts | 20 ++++++- 12 files changed, 279 insertions(+), 21 deletions(-) create mode 100644 src/detection/_framework/recover-bytes.test.ts create mode 100644 src/detection/_framework/recover-bytes.ts diff --git a/src/detection/_framework/recover-bytes.test.ts b/src/detection/_framework/recover-bytes.test.ts new file mode 100644 index 0000000..c61640c --- /dev/null +++ b/src/detection/_framework/recover-bytes.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from "vitest"; + +import { normalizeForMatching } from "../normalize.js"; + +import { recoverOriginalSlice } from "./recover-bytes.js"; + +describe("recoverOriginalSlice", () => { + it("preserves smart quotes from the original text", () => { + const original = `prefix \u201CAcme Corp\u201D suffix`; + const map = normalizeForMatching(original); + const start = map.text.indexOf(`"Acme Corp"`); + const end = start + `"Acme Corp"`.length; + expect(recoverOriginalSlice(original, map, start, end)).toBe( + `\u201CAcme Corp\u201D`, + ); + }); + + it("preserves fullwidth digits from the original text", () => { + const original = "Call \uFF10\uFF11\uFF12\uFF13 now"; + const map = normalizeForMatching(original); + const start = map.text.indexOf("0123"); + const end = start + "0123".length; + expect(recoverOriginalSlice(original, map, start, end)).toBe( + "\uFF10\uFF11\uFF12\uFF13", + ); + }); + + it("passes ASCII slices through unchanged", () => { + const original = "Acme Corp"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 0, map.text.length)).toBe( + "Acme Corp", + ); + }); + + it("supports startNorm = 0", () => { + const original = "\uFF21BC"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 0, 1)).toBe("\uFF21"); + }); + + it("supports endNorm = text.length", () => { + const original = `\u201CAcme\u201D`; + const map = normalizeForMatching(original); + expect( + recoverOriginalSlice(original, map, 0, map.text.length), + ).toBe(`\u201CAcme\u201D`); + }); + + it("returns an empty string for an empty slice", () => { + const original = "Acme"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 2, 2)).toBe(""); + }); +}); diff --git a/src/detection/_framework/recover-bytes.ts b/src/detection/_framework/recover-bytes.ts new file mode 100644 index 0000000..eae504e --- /dev/null +++ b/src/detection/_framework/recover-bytes.ts @@ -0,0 +1,10 @@ +import type { PositionMap } from "../normalize.js"; + +export function recoverOriginalSlice( + originalText: string, + map: PositionMap, + startNorm: number, + endNorm: number, +): string { + return originalText.slice(map.origOffsets[startNorm], map.origOffsets[endNorm]); +} diff --git a/src/detection/_framework/runner.ts b/src/detection/_framework/runner.ts index 9337ce1..1b302b3 100644 --- a/src/detection/_framework/runner.ts +++ b/src/detection/_framework/runner.ts @@ -320,7 +320,18 @@ export function runHeuristicPhase( if (text.length === 0) return []; const map = normalizeForMatching(text); if (map.text.length === 0) return []; - return runHeuristicPhaseOnMap(map, level, heuristics, context, opts); + const heuristicContext: HeuristicContext = { + ...context, + originalText: text, + map, + }; + return runHeuristicPhaseOnMap( + map, + level, + heuristics, + heuristicContext, + opts, + ); } /** @@ -462,6 +473,8 @@ export function runAllPhases(text: string, opts: RunAllOptions): RunAllResult { structuralDefinitions, priorCandidates: regexCandidates, documentLanguage, + originalText: text, + map, }; const heuristicCandidates = runHeuristicPhaseOnMap( map, diff --git a/src/detection/_framework/types.ts b/src/detection/_framework/types.ts index 06775b7..78ae59f 100644 --- a/src/detection/_framework/types.ts +++ b/src/detection/_framework/types.ts @@ -1,3 +1,5 @@ +import type { PositionMap } from "../normalize.js"; + /** * Rule framework types — Phase 0. * @@ -112,11 +114,15 @@ export interface Candidate { * - structuralDefinitions (from structural phase) to skip D9 defined labels * - priorCandidates (from regex phase) to avoid double-counting * - documentLanguage (from runner) to filter role blacklists + * - originalText + map (from runner) to recover original bytes for emitted + * candidates without re-normalizing */ export interface HeuristicContext { readonly structuralDefinitions: readonly StructuralDefinition[]; readonly priorCandidates: readonly Candidate[]; readonly documentLanguage: "ko" | "en" | "mixed"; + readonly originalText?: string; + readonly map?: PositionMap; } /** diff --git a/src/detection/rules/heuristics/capitalization-cluster.test.ts b/src/detection/rules/heuristics/capitalization-cluster.test.ts index 9ce0a0d..d2960c8 100644 --- a/src/detection/rules/heuristics/capitalization-cluster.test.ts +++ b/src/detection/rules/heuristics/capitalization-cluster.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { HeuristicContext } from "../../_framework/types.js"; import { CAPITALIZATION_CLUSTER } from "./capitalization-cluster.js"; @@ -16,7 +17,12 @@ function makeContext( } function detect(text: string, ctx: HeuristicContext = makeContext()) { - return CAPITALIZATION_CLUSTER.detect(text, ctx); + const map = normalizeForMatching(text); + return CAPITALIZATION_CLUSTER.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -100,6 +106,26 @@ describe("heuristics.capitalization-cluster", () => { ]); }); + it("recovers original bytes from smart-quoted input", () => { + expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.capitalization-cluster", + confidence: 0.7, + }, + ]); + }); + + it("preserves fullwidth ASCII letters in candidate.text", () => { + expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53 approved.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53", + ruleId: "heuristics.capitalization-cluster", + confidence: 0.7, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`${"A".repeat(5000)} ${"B".repeat(5000)}`); }); diff --git a/src/detection/rules/heuristics/capitalization-cluster.ts b/src/detection/rules/heuristics/capitalization-cluster.ts index b50a654..6c14987 100644 --- a/src/detection/rules/heuristics/capitalization-cluster.ts +++ b/src/detection/rules/heuristics/capitalization-cluster.ts @@ -9,8 +9,7 @@ * 2. Prior candidate skip — already-found strings excluded * 3. Role blacklist — generic legal roles excluded * 4. Confidence 0.7 (moderate — caps clusters are common in English prose) - * 5. Returns normalized text as candidate.text (ASCII letters are - * normalized losslessly, so normalized = original for this heuristic) + * 5. Recovers original bytes for candidate.text via HeuristicContext.map * * See docs/phases/phase-1-rulebook.md § 14.4.1 */ @@ -20,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; export const CAPITALIZATION_CLUSTER: Heuristic = { @@ -46,8 +46,17 @@ export const CAPITALIZATION_CLUSTER: Heuristic = { if (ROLE_BLACKLIST_EN.has(candidate.toLowerCase())) continue; const words = candidate.split(/\s+/); if (words.some((w) => ROLE_BLACKLIST_EN.has(w.toLowerCase()))) continue; + const original = + ctx.originalText && ctx.map + ? recoverOriginalSlice( + ctx.originalText, + ctx.map, + m.index, + m.index + candidate.length, + ) + : candidate; out.push({ - text: candidate, + text: original, ruleId: "heuristics.capitalization-cluster", confidence: 0.7, }); diff --git a/src/detection/rules/heuristics/email-domain-inference.test.ts b/src/detection/rules/heuristics/email-domain-inference.test.ts index ca6fe5d..8430040 100644 --- a/src/detection/rules/heuristics/email-domain-inference.test.ts +++ b/src/detection/rules/heuristics/email-domain-inference.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { Candidate, HeuristicContext } from "../../_framework/types.js"; import { EMAIL_DOMAIN_INFERENCE } from "./email-domain-inference.js"; @@ -16,13 +17,18 @@ function makeContext( }; } -function detect(ctx: HeuristicContext) { - return EMAIL_DOMAIN_INFERENCE.detect("", ctx); +function detect(text: string, ctx: HeuristicContext) { + const map = normalizeForMatching(text); + return EMAIL_DOMAIN_INFERENCE.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(ctx: HeuristicContext, budgetMs = 100): void { const start = performance.now(); - void detect(ctx); + void detect("", ctx); const elapsed = performance.now() - start; expect(elapsed).toBeLessThan(budgetMs); } @@ -51,7 +57,7 @@ describe("heuristics.email-domain-inference", () => { [{ text: "Samsung", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }], ], ])("%s", (_name, ctx, expected) => { - expect(detect(ctx)).toEqual(expected); + expect(detect("", ctx)).toEqual(expected); }); it.each([ @@ -78,7 +84,7 @@ describe("heuristics.email-domain-inference", () => { [{ text: "Northwind", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }], ], ])("%s", (_name, ctx, expected) => { - expect(detect(ctx)).toEqual(expected); + expect(detect("", ctx)).toEqual(expected); }); it.each([ @@ -101,7 +107,7 @@ describe("heuristics.email-domain-inference", () => { ]), ], ])("%s", (_name, ctx) => { - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips inferred names that match structural-definition labels (D9)", () => { @@ -117,7 +123,7 @@ describe("heuristics.email-domain-inference", () => { ], }, ); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips inferred names already present in priorCandidates", () => { @@ -125,25 +131,26 @@ describe("heuristics.email-domain-inference", () => { { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, { text: "Acme Corp", ruleId: "entities.en-corp-suffix", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips blacklisted inferred names like Party", () => { const ctx = makeContext([ { text: "legal@party.com", ruleId: "identifiers.email", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips blacklisted inferred names like Company", () => { const ctx = makeContext([ { text: "legal@company.com", ruleId: "identifiers.email", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("emits 0.8 for corporate prefixes and 0.6 for personal prefixes", () => { const result = detect( + "", makeContext([ { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, { text: "john@beta.com", ruleId: "identifiers.email", confidence: 1.0 }, @@ -163,6 +170,32 @@ describe("heuristics.email-domain-inference", () => { ]); }); + it("recovers original bytes from smart-quoted document occurrences", () => { + const ctx = makeContext([ + { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, + ]); + expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D is the counterparty.", ctx)).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.email-domain-inference", + confidence: 0.8, + }, + ]); + }); + + it("preserves fullwidth digits when recovering inferred names from text", () => { + const ctx = makeContext([ + { text: "legal@acme-123.com", ruleId: "identifiers.email", confidence: 1.0 }, + ]); + expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13 responded.", ctx)).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13", + ruleId: "heuristics.email-domain-inference", + confidence: 0.8, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast( makeContext([ diff --git a/src/detection/rules/heuristics/email-domain-inference.ts b/src/detection/rules/heuristics/email-domain-inference.ts index 8f5137a..f139e36 100644 --- a/src/detection/rules/heuristics/email-domain-inference.ts +++ b/src/detection/rules/heuristics/email-domain-inference.ts @@ -19,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; /** Common TLDs to strip. */ @@ -39,6 +40,22 @@ function titleCase(s: string): string { return s[0]!.toUpperCase() + s.slice(1).toLowerCase(); } +function recoverInferredText( + normalizedText: string, + inferred: string, + ctx: HeuristicContext, +): string { + if (!ctx.originalText || !ctx.map) return inferred; + const startNorm = normalizedText.indexOf(inferred); + if (startNorm < 0) return inferred; + return recoverOriginalSlice( + ctx.originalText, + ctx.map, + startNorm, + startNorm + inferred.length, + ); +} + export const EMAIL_DOMAIN_INFERENCE: Heuristic = { id: "heuristics.email-domain-inference", category: "heuristics", @@ -47,7 +64,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = { levels: ["paranoid"], description: "Infer company name from email domain (legal@acme-corp.com → 'Acme Corp')", - detect(_text: string, ctx: HeuristicContext): readonly Candidate[] { + detect(text: string, ctx: HeuristicContext): readonly Candidate[] { const definedLabels = new Set( ctx.structuralDefinitions.map((d) => d.label), ); @@ -93,7 +110,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = { const confidence = CORPORATE_PREFIXES.has(localPart) ? 0.8 : 0.6; out.push({ - text: inferred, + text: recoverInferredText(text, inferred, ctx), ruleId: "heuristics.email-domain-inference", confidence, }); diff --git a/src/detection/rules/heuristics/quoted-term.test.ts b/src/detection/rules/heuristics/quoted-term.test.ts index 561a091..495a411 100644 --- a/src/detection/rules/heuristics/quoted-term.test.ts +++ b/src/detection/rules/heuristics/quoted-term.test.ts @@ -17,7 +17,12 @@ function makeContext( } function detectRaw(text: string, ctx: HeuristicContext = makeContext()) { - return QUOTED_TERM.detect(normalizeForMatching(text).text, ctx); + const map = normalizeForMatching(text); + return QUOTED_TERM.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -88,6 +93,26 @@ describe("heuristics.quoted-term", () => { ]); }); + it("recovers original inner bytes from smart-quoted input", () => { + expect(detectRaw("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13\u201D shall survive.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13", + ruleId: "heuristics.quoted-term", + confidence: 0.6, + }, + ]); + }); + + it("preserves fullwidth digits in candidate.text", () => { + expect(detectRaw('"\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13" shall survive.')).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13", + ruleId: "heuristics.quoted-term", + confidence: 0.6, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`"${"A".repeat(10000)}"`); }); diff --git a/src/detection/rules/heuristics/quoted-term.ts b/src/detection/rules/heuristics/quoted-term.ts index 118d650..f817e5b 100644 --- a/src/detection/rules/heuristics/quoted-term.ts +++ b/src/detection/rules/heuristics/quoted-term.ts @@ -18,6 +18,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js"; @@ -43,8 +44,19 @@ export const QUOTED_TERM: Heuristic = { if (priorTexts.has(inner)) continue; if (ROLE_BLACKLIST_EN.has(inner.toLowerCase())) continue; if (ROLE_BLACKLIST_KO.has(inner)) continue; + const innerStartNorm = m.index + 1; + const innerEndNorm = innerStartNorm + inner.length; + const original = + ctx.originalText && ctx.map + ? recoverOriginalSlice( + ctx.originalText, + ctx.map, + innerStartNorm, + innerEndNorm, + ) + : inner; out.push({ - text: inner, + text: original, ruleId: "heuristics.quoted-term", confidence: 0.6, }); diff --git a/src/detection/rules/heuristics/repeatability.test.ts b/src/detection/rules/heuristics/repeatability.test.ts index b36ea1b..c2a611c 100644 --- a/src/detection/rules/heuristics/repeatability.test.ts +++ b/src/detection/rules/heuristics/repeatability.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { HeuristicContext } from "../../_framework/types.js"; import { REPEATABILITY } from "./repeatability.js"; @@ -16,7 +17,12 @@ function makeContext( } function detect(text: string, ctx: HeuristicContext = makeContext()) { - return REPEATABILITY.detect(text, ctx); + const map = normalizeForMatching(text); + return REPEATABILITY.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -123,6 +129,34 @@ describe("heuristics.repeatability", () => { ]); }); + it("recovers original bytes from repeated smart-quoted input", () => { + expect( + detect( + "\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D approved. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D closed.", + ), + ).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.repeatability", + confidence: 0.5, + }, + ]); + }); + + it("preserves fullwidth ASCII letters in repeated candidate.text", () => { + expect( + detect( + "\uFF21\uFF43\uFF4D\uFF45 signed. \uFF21\uFF43\uFF4D\uFF45 approved. \uFF21\uFF43\uFF4D\uFF45 closed.", + ), + ).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45", + ruleId: "heuristics.repeatability", + confidence: 0.5, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`${"Acme ".repeat(2000)}${"삼성전자 ".repeat(1000)}`); }); diff --git a/src/detection/rules/heuristics/repeatability.ts b/src/detection/rules/heuristics/repeatability.ts index db01c45..6ff29ed 100644 --- a/src/detection/rules/heuristics/repeatability.ts +++ b/src/detection/rules/heuristics/repeatability.ts @@ -19,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js"; @@ -40,6 +41,7 @@ export const REPEATABILITY: Heuristic = { const priorTexts = new Set(ctx.priorCandidates.map((c) => c.text)); const counts = new Map(); + const firstSpans = new Map(); const enPattern = /(? Date: Fri, 17 Apr 2026 23:45:43 +0900 Subject: [PATCH 2/6] fix(security): add ZIP size guard via loadDocxZip wrapper --- src/docx/limits.ts | 2 ++ src/docx/load.test.ts | 55 +++++++++++++++++++++++++++++ src/docx/load.ts | 20 +++++++++++ src/finalize/guided-recovery.ts | 5 ++- src/finalize/preflight-expansion.ts | 5 +-- src/ui/DocumentPreview.svelte | 10 ++---- src/ui/engine.ts | 7 ++-- 7 files changed, 88 insertions(+), 16 deletions(-) create mode 100644 src/docx/limits.ts create mode 100644 src/docx/load.test.ts create mode 100644 src/docx/load.ts diff --git a/src/docx/limits.ts b/src/docx/limits.ts new file mode 100644 index 0000000..9a70e8a --- /dev/null +++ b/src/docx/limits.ts @@ -0,0 +1,2 @@ +export const MAX_INPUT_BYTES = 50 * 1024 * 1024; // 50 MB +export const MAX_ENTRY_BYTES = 20 * 1024 * 1024; // 20 MB diff --git a/src/docx/load.test.ts b/src/docx/load.test.ts new file mode 100644 index 0000000..69c7811 --- /dev/null +++ b/src/docx/load.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it, vi, afterEach } from "vitest"; +import JSZip from "jszip"; + +import { MAX_INPUT_BYTES } from "./limits.js"; +import { FileTooLargeError, loadDocxZip } from "./load.js"; + +async function makeZipBytes(): Promise { + const zip = new JSZip(); + zip.file( + "word/document.xml", + `Hello`, + ); + return zip.generateAsync({ type: "uint8array" }); +} + +afterEach(() => { + vi.restoreAllMocks(); +}); + +describe("loadDocxZip", () => { + it("rejects empty bytes", async () => { + await expect(loadDocxZip(new Uint8Array(0))).rejects.toBeInstanceOf( + FileTooLargeError, + ); + }); + + it("accepts a valid docx-sized zip", async () => { + const bytes = await makeZipBytes(); + const zip = await loadDocxZip(bytes); + expect(zip.file("word/document.xml")).not.toBeNull(); + }); + + it("accepts the exact MAX_INPUT_BYTES boundary and delegates to JSZip", async () => { + const zip = new JSZip(); + const spy = vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip); + const bytes = new Uint8Array(MAX_INPUT_BYTES); + + await expect(loadDocxZip(bytes)).resolves.toBe(zip); + expect(spy).toHaveBeenCalledTimes(1); + }); + + it("rejects bytes larger than MAX_INPUT_BYTES", async () => { + const spy = vi.spyOn(JSZip, "loadAsync"); + const bytes = new Uint8Array(MAX_INPUT_BYTES + 1); + + await expect(loadDocxZip(bytes)).rejects.toBeInstanceOf(FileTooLargeError); + expect(spy).not.toHaveBeenCalled(); + }); + + it("propagates JSZip errors for corrupt ZIP data", async () => { + await expect(loadDocxZip(new Uint8Array([1, 2, 3, 4]))).rejects.not.toBeInstanceOf( + FileTooLargeError, + ); + }); +}); diff --git a/src/docx/load.ts b/src/docx/load.ts new file mode 100644 index 0000000..9488477 --- /dev/null +++ b/src/docx/load.ts @@ -0,0 +1,20 @@ +import JSZip from "jszip"; + +import { MAX_INPUT_BYTES } from "./limits.js"; + +export class FileTooLargeError extends Error { + constructor(size: number, limit: number) { + super(`File size ${size} bytes exceeds limit of ${limit} bytes`); + this.name = "FileTooLargeError"; + } +} + +export async function loadDocxZip(bytes: Uint8Array): Promise { + if (bytes.length === 0) { + throw new FileTooLargeError(0, MAX_INPUT_BYTES); + } + if (bytes.length > MAX_INPUT_BYTES) { + throw new FileTooLargeError(bytes.length, MAX_INPUT_BYTES); + } + return JSZip.loadAsync(bytes.slice()); +} diff --git a/src/finalize/guided-recovery.ts b/src/finalize/guided-recovery.ts index b16acc9..eee31d1 100644 --- a/src/finalize/guided-recovery.ts +++ b/src/finalize/guided-recovery.ts @@ -1,5 +1,4 @@ -import JSZip from "jszip"; - +import { loadDocxZip } from "../docx/load.js"; import { finalizeRedaction, type FinalizeOptions, @@ -261,7 +260,7 @@ async function defaultRepairPass( repairPlan: RepairPlan, options: GuidedRecoveryOptions, ): Promise { - const zip = await JSZip.loadAsync(bytes.slice()); + const zip = await loadDocxZip(bytes); await applyRelsRepairsToZip( zip, repairPlan.relsRepairs, diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts index b088610..345c464 100644 --- a/src/finalize/preflight-expansion.ts +++ b/src/finalize/preflight-expansion.ts @@ -1,5 +1,6 @@ -import JSZip from "jszip"; +import type JSZip from "jszip"; +import { loadDocxZip } from "../docx/load.js"; import { collectVerifySurfaces } from "../docx/verify-surfaces.js"; import type { ResolvedRedactionTarget } from "../selection-targets.js"; @@ -29,7 +30,7 @@ export async function buildPreflightExpansionPlan( }; } - const zip = await JSZip.loadAsync(bytes.slice()); + const zip = await loadDocxZip(bytes); const surfaces = await collectVerifySurfaces(zip); const extraLiterals = new Map>(); const relsRepairs = new Map>(); diff --git a/src/ui/DocumentPreview.svelte b/src/ui/DocumentPreview.svelte index 68bf34a..31f3311 100644 --- a/src/ui/DocumentPreview.svelte +++ b/src/ui/DocumentPreview.svelte @@ -14,12 +14,8 @@ have that can come later. -->