diff --git a/README.ko.md b/README.ko.md index 537e29f..76fe949 100644 --- a/README.ko.md +++ b/README.ko.md @@ -13,7 +13,7 @@ document-redactor.html 다운로드
- HTML 한 파일 · ~277 KB · 더블클릭으로 실행 + HTML 한 파일 · ~276 KB · 더블클릭으로 실행 @@ -53,7 +53,7 @@ CI Apache 2.0 license single HTML distribution - 277 KB artifact + 276 KB artifact zero network requests rule-based engine AI none @@ -148,7 +148,7 @@ flowchart TD 현재 확인된 크기
- 277 KB + 276 KB 무결성 sidecar
@@ -160,14 +160,14 @@ flowchart TD 자동화 테스트
- 1,739 tests + 1,774 tests -2026년 4월 14일 기준으로 확인한 현재 빌드: +2026년 4월 18일 기준으로 확인한 현재 빌드: -- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9` +- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e` - `shasum -a 256 -c document-redactor.html.sha256` 로 로컬 검증 완료 ## 현재 릴리즈가 실제로 하는 일 diff --git a/README.md b/README.md index 0dadb8b..161abdc 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Download document-redactor.html

- Single HTML · ~277 KB · open locally + Single HTML · ~276 KB · open locally @@ -53,7 +53,7 @@ CI Apache 2.0 license single HTML distribution - 277 KB artifact + 276 KB artifact zero network requests rule-based engine AI none @@ -149,7 +149,7 @@ flowchart TD Current checked size
- 277 KB + 276 KB Integrity sidecar
@@ -161,14 +161,14 @@ flowchart TD Automated coverage
- 1,739 tests + 1,774 tests -Current checked release artifact on April 14, 2026: +Current checked release artifact on April 18, 2026: -- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9` +- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e` - Verified locally with `shasum -a 256 -c document-redactor.html.sha256` ## What The Current Release Does diff --git a/release-notes/2026-04-18-draft.md b/release-notes/2026-04-18-draft.md new file mode 100644 index 0000000..633427a --- /dev/null +++ b/release-notes/2026-04-18-draft.md @@ -0,0 +1,68 @@ +# document-redactor v1.1.1 + +_Drafted 2026-04-18_ + +## Summary + +This update focuses on two things: + +- making DOCX intake and XML parsing safer against oversized files and ZIP-bomb-style payloads, +- closing a few quiet leak paths that could survive redaction even when the visible document looked clean. + +It also fixes a detection correctness bug where heuristic candidates could return normalized text instead of the original DOCX bytes, which matters for smart quotes, fullwidth ASCII, and ideographic spaces. + +## What's new + +### Safer DOCX loading + +- Added a `loadDocxZip()` guard for all main UI and finalize entry points. +- Files larger than **50 MB** are now rejected before JSZip fully loads them. +- This reduces the chance that a very large or malicious input file freezes or crashes the browser tab. + +### Safer ZIP entry reads + +- Added a `readZipEntry()` guard for XML entry reads. +- Individual ZIP entries larger than **20 MB** are now rejected before downstream XML handling continues. +- This hardens scope walking, metadata scrubbing, verifier surface collection, and preflight repair against oversized decompressed entries. + +### External URL stripping in `.rels` + +- Redacted output now strips surviving external `http://` and `https://` targets from `.rels` files. 
+- Both `Target="..."` and `Target='...'` forms are covered. +- `mailto:` links and relative package paths are preserved. +- Verification now explicitly fails if any external `http/https` relationship target survives in the output. + +### `docProps/custom.xml` removal + +- Metadata scrubbing now removes `docProps/custom.xml` entirely instead of leaving custom properties behind. +- `[Content_Types].xml` is updated at the same time so the removed custom-properties part does not leave a stale override entry. +- This closes a metadata channel where author email, project names, tracking IDs, or other arbitrary custom fields could survive redaction. + +### Heuristic original-byte recovery + +- Heuristic candidates now recover the original source slice from the normalization offset map before emitting `Candidate.text`. +- This improves real-world redaction reliability when DOCX content uses: + - smart quotes, + - fullwidth ASCII or digits, + - ideographic spaces. +- Result: fewer cases where detection appears correct but the later literal redaction step fails to find the exact original bytes. + +## User-visible impact + +- Oversized or suspicious DOCX inputs fail earlier and more clearly. +- Redacted files are less likely to retain hidden metadata or tracking URLs. +- Documents using smart quotes / fullwidth characters should redact more reliably when the match came from a heuristic. +- No workflow changes were introduced in the UI; this is mainly a safety and correctness release. + +## Validation + +- Automated tests: **1,774 passing** +- Full suite: `bun run test` passed +- Local ReDoS gate: `SKIP_REDOS_FUZZ=0 bun run test` passed +- Production build: `bun run build` passed +- Built artifact size: **276 KB** (`dist/document-redactor.html`) + +## Notes + +- This draft summarizes the work completed on **2026-04-18**. +- Download links and release asset hashes can be filled in once the release is cut. 
diff --git a/src/app-version.ts b/src/app-version.ts index 9ebef24..0c6aad4 100644 --- a/src/app-version.ts +++ b/src/app-version.ts @@ -1 +1 @@ -export const APP_VERSION = "v.1.05"; +export const APP_VERSION = "v1.1.1"; diff --git a/src/detection/_framework/recover-bytes.test.ts b/src/detection/_framework/recover-bytes.test.ts new file mode 100644 index 0000000..c61640c --- /dev/null +++ b/src/detection/_framework/recover-bytes.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from "vitest"; + +import { normalizeForMatching } from "../normalize.js"; + +import { recoverOriginalSlice } from "./recover-bytes.js"; + +describe("recoverOriginalSlice", () => { + it("preserves smart quotes from the original text", () => { + const original = `prefix \u201CAcme Corp\u201D suffix`; + const map = normalizeForMatching(original); + const start = map.text.indexOf(`"Acme Corp"`); + const end = start + `"Acme Corp"`.length; + expect(recoverOriginalSlice(original, map, start, end)).toBe( + `\u201CAcme Corp\u201D`, + ); + }); + + it("preserves fullwidth digits from the original text", () => { + const original = "Call \uFF10\uFF11\uFF12\uFF13 now"; + const map = normalizeForMatching(original); + const start = map.text.indexOf("0123"); + const end = start + "0123".length; + expect(recoverOriginalSlice(original, map, start, end)).toBe( + "\uFF10\uFF11\uFF12\uFF13", + ); + }); + + it("passes ASCII slices through unchanged", () => { + const original = "Acme Corp"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 0, map.text.length)).toBe( + "Acme Corp", + ); + }); + + it("supports startNorm = 0", () => { + const original = "\uFF21BC"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 0, 1)).toBe("\uFF21"); + }); + + it("supports endNorm = text.length", () => { + const original = `\u201CAcme\u201D`; + const map = normalizeForMatching(original); + expect( + recoverOriginalSlice(original, map, 0, 
map.text.length), + ).toBe(`\u201CAcme\u201D`); + }); + + it("returns an empty string for an empty slice", () => { + const original = "Acme"; + const map = normalizeForMatching(original); + expect(recoverOriginalSlice(original, map, 2, 2)).toBe(""); + }); +}); diff --git a/src/detection/_framework/recover-bytes.ts b/src/detection/_framework/recover-bytes.ts new file mode 100644 index 0000000..eae504e --- /dev/null +++ b/src/detection/_framework/recover-bytes.ts @@ -0,0 +1,10 @@ +import type { PositionMap } from "../normalize.js"; + +export function recoverOriginalSlice( + originalText: string, + map: PositionMap, + startNorm: number, + endNorm: number, +): string { + return originalText.slice(map.origOffsets[startNorm], map.origOffsets[endNorm]); +} diff --git a/src/detection/_framework/runner.ts b/src/detection/_framework/runner.ts index 9337ce1..1b302b3 100644 --- a/src/detection/_framework/runner.ts +++ b/src/detection/_framework/runner.ts @@ -320,7 +320,18 @@ export function runHeuristicPhase( if (text.length === 0) return []; const map = normalizeForMatching(text); if (map.text.length === 0) return []; - return runHeuristicPhaseOnMap(map, level, heuristics, context, opts); + const heuristicContext: HeuristicContext = { + ...context, + originalText: text, + map, + }; + return runHeuristicPhaseOnMap( + map, + level, + heuristics, + heuristicContext, + opts, + ); } /** @@ -462,6 +473,8 @@ export function runAllPhases(text: string, opts: RunAllOptions): RunAllResult { structuralDefinitions, priorCandidates: regexCandidates, documentLanguage, + originalText: text, + map, }; const heuristicCandidates = runHeuristicPhaseOnMap( map, diff --git a/src/detection/_framework/types.ts b/src/detection/_framework/types.ts index 06775b7..78ae59f 100644 --- a/src/detection/_framework/types.ts +++ b/src/detection/_framework/types.ts @@ -1,3 +1,5 @@ +import type { PositionMap } from "../normalize.js"; + /** * Rule framework types — Phase 0. 
* @@ -112,11 +114,15 @@ export interface Candidate { * - structuralDefinitions (from structural phase) to skip D9 defined labels * - priorCandidates (from regex phase) to avoid double-counting * - documentLanguage (from runner) to filter role blacklists + * - originalText + map (from runner) to recover original bytes for emitted + * candidates without re-normalizing */ export interface HeuristicContext { readonly structuralDefinitions: readonly StructuralDefinition[]; readonly priorCandidates: readonly Candidate[]; readonly documentLanguage: "ko" | "en" | "mixed"; + readonly originalText?: string; + readonly map?: PositionMap; } /** diff --git a/src/detection/rules/heuristics/capitalization-cluster.test.ts b/src/detection/rules/heuristics/capitalization-cluster.test.ts index 9ce0a0d..d2960c8 100644 --- a/src/detection/rules/heuristics/capitalization-cluster.test.ts +++ b/src/detection/rules/heuristics/capitalization-cluster.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { HeuristicContext } from "../../_framework/types.js"; import { CAPITALIZATION_CLUSTER } from "./capitalization-cluster.js"; @@ -16,7 +17,12 @@ function makeContext( } function detect(text: string, ctx: HeuristicContext = makeContext()) { - return CAPITALIZATION_CLUSTER.detect(text, ctx); + const map = normalizeForMatching(text); + return CAPITALIZATION_CLUSTER.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -100,6 +106,26 @@ describe("heuristics.capitalization-cluster", () => { ]); }); + it("recovers original bytes from smart-quoted input", () => { + expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.capitalization-cluster", + confidence: 0.7, + }, + ]); + }); + + it("preserves 
fullwidth ASCII letters in candidate.text", () => { + expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53 approved.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53", + ruleId: "heuristics.capitalization-cluster", + confidence: 0.7, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`${"A".repeat(5000)} ${"B".repeat(5000)}`); }); diff --git a/src/detection/rules/heuristics/capitalization-cluster.ts b/src/detection/rules/heuristics/capitalization-cluster.ts index b50a654..6c14987 100644 --- a/src/detection/rules/heuristics/capitalization-cluster.ts +++ b/src/detection/rules/heuristics/capitalization-cluster.ts @@ -9,8 +9,7 @@ * 2. Prior candidate skip — already-found strings excluded * 3. Role blacklist — generic legal roles excluded * 4. Confidence 0.7 (moderate — caps clusters are common in English prose) - * 5. Returns normalized text as candidate.text (ASCII letters are - * normalized losslessly, so normalized = original for this heuristic) + * 5. Recovers original bytes for candidate.text via HeuristicContext.map * * See docs/phases/phase-1-rulebook.md § 14.4.1 */ @@ -20,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; export const CAPITALIZATION_CLUSTER: Heuristic = { @@ -46,8 +46,17 @@ export const CAPITALIZATION_CLUSTER: Heuristic = { if (ROLE_BLACKLIST_EN.has(candidate.toLowerCase())) continue; const words = candidate.split(/\s+/); if (words.some((w) => ROLE_BLACKLIST_EN.has(w.toLowerCase()))) continue; + const original = + ctx.originalText && ctx.map + ? 
recoverOriginalSlice( + ctx.originalText, + ctx.map, + m.index, + m.index + candidate.length, + ) + : candidate; out.push({ - text: candidate, + text: original, ruleId: "heuristics.capitalization-cluster", confidence: 0.7, }); diff --git a/src/detection/rules/heuristics/email-domain-inference.test.ts b/src/detection/rules/heuristics/email-domain-inference.test.ts index ca6fe5d..8430040 100644 --- a/src/detection/rules/heuristics/email-domain-inference.test.ts +++ b/src/detection/rules/heuristics/email-domain-inference.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { Candidate, HeuristicContext } from "../../_framework/types.js"; import { EMAIL_DOMAIN_INFERENCE } from "./email-domain-inference.js"; @@ -16,13 +17,18 @@ function makeContext( }; } -function detect(ctx: HeuristicContext) { - return EMAIL_DOMAIN_INFERENCE.detect("", ctx); +function detect(text: string, ctx: HeuristicContext) { + const map = normalizeForMatching(text); + return EMAIL_DOMAIN_INFERENCE.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(ctx: HeuristicContext, budgetMs = 100): void { const start = performance.now(); - void detect(ctx); + void detect("", ctx); const elapsed = performance.now() - start; expect(elapsed).toBeLessThan(budgetMs); } @@ -51,7 +57,7 @@ describe("heuristics.email-domain-inference", () => { [{ text: "Samsung", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }], ], ])("%s", (_name, ctx, expected) => { - expect(detect(ctx)).toEqual(expected); + expect(detect("", ctx)).toEqual(expected); }); it.each([ @@ -78,7 +84,7 @@ describe("heuristics.email-domain-inference", () => { [{ text: "Northwind", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }], ], ])("%s", (_name, ctx, expected) => { - expect(detect(ctx)).toEqual(expected); + expect(detect("", ctx)).toEqual(expected); }); it.each([ @@ -101,7 +107,7 @@ 
describe("heuristics.email-domain-inference", () => { ]), ], ])("%s", (_name, ctx) => { - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips inferred names that match structural-definition labels (D9)", () => { @@ -117,7 +123,7 @@ describe("heuristics.email-domain-inference", () => { ], }, ); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips inferred names already present in priorCandidates", () => { @@ -125,25 +131,26 @@ describe("heuristics.email-domain-inference", () => { { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, { text: "Acme Corp", ruleId: "entities.en-corp-suffix", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips blacklisted inferred names like Party", () => { const ctx = makeContext([ { text: "legal@party.com", ruleId: "identifiers.email", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("skips blacklisted inferred names like Company", () => { const ctx = makeContext([ { text: "legal@company.com", ruleId: "identifiers.email", confidence: 1.0 }, ]); - expect(detect(ctx)).toEqual([]); + expect(detect("", ctx)).toEqual([]); }); it("emits 0.8 for corporate prefixes and 0.6 for personal prefixes", () => { const result = detect( + "", makeContext([ { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, { text: "john@beta.com", ruleId: "identifiers.email", confidence: 1.0 }, @@ -163,6 +170,32 @@ describe("heuristics.email-domain-inference", () => { ]); }); + it("recovers original bytes from smart-quoted document occurrences", () => { + const ctx = makeContext([ + { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 }, + ]); + expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D is the counterparty.", ctx)).toEqual([ + { + text: 
"\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.email-domain-inference", + confidence: 0.8, + }, + ]); + }); + + it("preserves fullwidth digits when recovering inferred names from text", () => { + const ctx = makeContext([ + { text: "legal@acme-123.com", ruleId: "identifiers.email", confidence: 1.0 }, + ]); + expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13 responded.", ctx)).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13", + ruleId: "heuristics.email-domain-inference", + confidence: 0.8, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast( makeContext([ diff --git a/src/detection/rules/heuristics/email-domain-inference.ts b/src/detection/rules/heuristics/email-domain-inference.ts index 8f5137a..f139e36 100644 --- a/src/detection/rules/heuristics/email-domain-inference.ts +++ b/src/detection/rules/heuristics/email-domain-inference.ts @@ -19,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; /** Common TLDs to strip. 
*/ @@ -39,6 +40,22 @@ function titleCase(s: string): string { return s[0]!.toUpperCase() + s.slice(1).toLowerCase(); } +function recoverInferredText( + normalizedText: string, + inferred: string, + ctx: HeuristicContext, +): string { + if (!ctx.originalText || !ctx.map) return inferred; + const startNorm = normalizedText.indexOf(inferred); + if (startNorm < 0) return inferred; + return recoverOriginalSlice( + ctx.originalText, + ctx.map, + startNorm, + startNorm + inferred.length, + ); +} + export const EMAIL_DOMAIN_INFERENCE: Heuristic = { id: "heuristics.email-domain-inference", category: "heuristics", @@ -47,7 +64,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = { levels: ["paranoid"], description: "Infer company name from email domain (legal@acme-corp.com → 'Acme Corp')", - detect(_text: string, ctx: HeuristicContext): readonly Candidate[] { + detect(text: string, ctx: HeuristicContext): readonly Candidate[] { const definedLabels = new Set( ctx.structuralDefinitions.map((d) => d.label), ); @@ -93,7 +110,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = { const confidence = CORPORATE_PREFIXES.has(localPart) ? 
0.8 : 0.6; out.push({ - text: inferred, + text: recoverInferredText(text, inferred, ctx), ruleId: "heuristics.email-domain-inference", confidence, }); diff --git a/src/detection/rules/heuristics/quoted-term.test.ts b/src/detection/rules/heuristics/quoted-term.test.ts index 561a091..495a411 100644 --- a/src/detection/rules/heuristics/quoted-term.test.ts +++ b/src/detection/rules/heuristics/quoted-term.test.ts @@ -17,7 +17,12 @@ function makeContext( } function detectRaw(text: string, ctx: HeuristicContext = makeContext()) { - return QUOTED_TERM.detect(normalizeForMatching(text).text, ctx); + const map = normalizeForMatching(text); + return QUOTED_TERM.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -88,6 +93,26 @@ describe("heuristics.quoted-term", () => { ]); }); + it("recovers original inner bytes from smart-quoted input", () => { + expect(detectRaw("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13\u201D shall survive.")).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13", + ruleId: "heuristics.quoted-term", + confidence: 0.6, + }, + ]); + }); + + it("preserves fullwidth digits in candidate.text", () => { + expect(detectRaw('"\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13" shall survive.')).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13", + ruleId: "heuristics.quoted-term", + confidence: 0.6, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`"${"A".repeat(10000)}"`); }); diff --git a/src/detection/rules/heuristics/quoted-term.ts b/src/detection/rules/heuristics/quoted-term.ts index 118d650..f817e5b 100644 --- a/src/detection/rules/heuristics/quoted-term.ts +++ b/src/detection/rules/heuristics/quoted-term.ts @@ -18,6 +18,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } 
from "../role-blacklist-en.js"; import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js"; @@ -43,8 +44,19 @@ export const QUOTED_TERM: Heuristic = { if (priorTexts.has(inner)) continue; if (ROLE_BLACKLIST_EN.has(inner.toLowerCase())) continue; if (ROLE_BLACKLIST_KO.has(inner)) continue; + const innerStartNorm = m.index + 1; + const innerEndNorm = innerStartNorm + inner.length; + const original = + ctx.originalText && ctx.map + ? recoverOriginalSlice( + ctx.originalText, + ctx.map, + innerStartNorm, + innerEndNorm, + ) + : inner; out.push({ - text: inner, + text: original, ruleId: "heuristics.quoted-term", confidence: 0.6, }); diff --git a/src/detection/rules/heuristics/repeatability.test.ts b/src/detection/rules/heuristics/repeatability.test.ts index b36ea1b..c2a611c 100644 --- a/src/detection/rules/heuristics/repeatability.test.ts +++ b/src/detection/rules/heuristics/repeatability.test.ts @@ -1,5 +1,6 @@ import { describe, expect, it } from "vitest"; +import { normalizeForMatching } from "../../normalize.js"; import type { HeuristicContext } from "../../_framework/types.js"; import { REPEATABILITY } from "./repeatability.js"; @@ -16,7 +17,12 @@ function makeContext( } function detect(text: string, ctx: HeuristicContext = makeContext()) { - return REPEATABILITY.detect(text, ctx); + const map = normalizeForMatching(text); + return REPEATABILITY.detect(map.text, { + ...ctx, + originalText: text, + map, + }); } function expectFast(input: string, budgetMs = 100): void { @@ -123,6 +129,34 @@ describe("heuristics.repeatability", () => { ]); }); + it("recovers original bytes from repeated smart-quoted input", () => { + expect( + detect( + "\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D approved. 
\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D closed.", + ), + ).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50", + ruleId: "heuristics.repeatability", + confidence: 0.5, + }, + ]); + }); + + it("preserves fullwidth ASCII letters in repeated candidate.text", () => { + expect( + detect( + "\uFF21\uFF43\uFF4D\uFF45 signed. \uFF21\uFF43\uFF4D\uFF45 approved. \uFF21\uFF43\uFF4D\uFF45 closed.", + ), + ).toEqual([ + { + text: "\uFF21\uFF43\uFF4D\uFF45", + ruleId: "heuristics.repeatability", + confidence: 0.5, + }, + ]); + }); + it("is ReDoS-safe on a 10KB pathological input", () => { expectFast(`${"Acme ".repeat(2000)}${"삼성전자 ".repeat(1000)}`); }); diff --git a/src/detection/rules/heuristics/repeatability.ts b/src/detection/rules/heuristics/repeatability.ts index db01c45..6ff29ed 100644 --- a/src/detection/rules/heuristics/repeatability.ts +++ b/src/detection/rules/heuristics/repeatability.ts @@ -19,6 +19,7 @@ import type { Heuristic, HeuristicContext, } from "../../_framework/types.js"; +import { recoverOriginalSlice } from "../../_framework/recover-bytes.js"; import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js"; import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js"; @@ -40,6 +41,7 @@ export const REPEATABILITY: Heuristic = { const priorTexts = new Set(ctx.priorCandidates.map((c) => c.text)); const counts = new Map(); + const firstSpans = new Map(); const enPattern = /(? 
{ + const zip = new JSZip(); + zip.file( + "word/document.xml", + `Hello`, + ); + return zip.generateAsync({ type: "uint8array" }); +} + +afterEach(() => { + vi.restoreAllMocks(); +}); + +describe("loadDocxZip", () => { + it("rejects empty bytes", async () => { + await expect(loadDocxZip(new Uint8Array(0))).rejects.toBeInstanceOf( + FileTooLargeError, + ); + }); + + it("accepts a valid docx-sized zip", async () => { + const bytes = await makeZipBytes(); + const zip = await loadDocxZip(bytes); + expect(zip.file("word/document.xml")).not.toBeNull(); + }); + + it("accepts the exact MAX_INPUT_BYTES boundary and delegates to JSZip", async () => { + const zip = new JSZip(); + const spy = vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip); + const bytes = new Uint8Array(MAX_INPUT_BYTES); + + await expect(loadDocxZip(bytes)).resolves.toBe(zip); + expect(spy).toHaveBeenCalledTimes(1); + }); + + it("rejects bytes larger than MAX_INPUT_BYTES", async () => { + const spy = vi.spyOn(JSZip, "loadAsync"); + const bytes = new Uint8Array(MAX_INPUT_BYTES + 1); + + await expect(loadDocxZip(bytes)).rejects.toBeInstanceOf(FileTooLargeError); + expect(spy).not.toHaveBeenCalled(); + }); + + it("propagates JSZip errors for corrupt ZIP data", async () => { + await expect(loadDocxZip(new Uint8Array([1, 2, 3, 4]))).rejects.not.toBeInstanceOf( + FileTooLargeError, + ); + }); +}); + +describe("readZipEntry", () => { + it("returns a string for a normal entry", async () => { + const zip = new JSZip(); + zip.file("word/document.xml", ""); + + await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe( + "", + ); + }); + + it("throws when the entry does not exist", async () => { + const zip = new JSZip(); + await expect(readZipEntry(zip, "missing.xml")).rejects.toThrow( + "ZIP entry not found: missing.xml", + ); + }); + + it("throws EntryTooLargeError for oversized decompressed content", async () => { + const zip = { + file(path: string) { + if (path !== "word/document.xml") return null; + 
return { + async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES + 1)), + }; + }, + } as unknown as JSZip; + + await expect(readZipEntry(zip, "word/document.xml")).rejects.toBeInstanceOf( + EntryTooLargeError, + ); + }); + + it("accepts content at the exact MAX_ENTRY_BYTES boundary", async () => { + const zip = { + file(path: string) { + if (path !== "word/document.xml") return null; + return { + async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES)), + }; + }, + } as unknown as JSZip; + + await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe( + "x".repeat(MAX_ENTRY_BYTES), + ); + }); +}); diff --git a/src/docx/load.ts b/src/docx/load.ts new file mode 100644 index 0000000..b6afade --- /dev/null +++ b/src/docx/load.ts @@ -0,0 +1,42 @@ +import JSZip from "jszip"; + +import { MAX_ENTRY_BYTES, MAX_INPUT_BYTES } from "./limits.js"; + +export class FileTooLargeError extends Error { + constructor(size: number, limit: number) { + super(`File size ${size} bytes exceeds limit of ${limit} bytes`); + this.name = "FileTooLargeError"; + } +} + +export class EntryTooLargeError extends Error { + constructor(path: string, size: number, limit: number) { + super(`ZIP entry "${path}" decompressed to ${size} bytes, exceeds limit of ${limit} bytes`); + this.name = "EntryTooLargeError"; + } +} + +export async function loadDocxZip(bytes: Uint8Array): Promise { + if (bytes.length === 0) { + throw new FileTooLargeError(0, MAX_INPUT_BYTES); + } + if (bytes.length > MAX_INPUT_BYTES) { + throw new FileTooLargeError(bytes.length, MAX_INPUT_BYTES); + } + return JSZip.loadAsync(bytes.slice()); +} + +export async function readZipEntry( + zip: JSZip, + path: string, +): Promise { + const file = zip.file(path); + if (file === null) { + throw new Error(`ZIP entry not found: ${path}`); + } + const content = await file.async("string"); + if (content.length > MAX_ENTRY_BYTES) { + throw new EntryTooLargeError(path, content.length, MAX_ENTRY_BYTES); + } + return content; +} diff 
--git a/src/docx/scopes.ts b/src/docx/scopes.ts index 8f1f6c3..b94e014 100644 --- a/src/docx/scopes.ts +++ b/src/docx/scopes.ts @@ -12,6 +12,7 @@ import type JSZip from "jszip"; +import { readZipEntry } from "./load.js"; import { SCOPE_PATTERNS, type Scope, type ScopeKind } from "./types.js"; /** @@ -73,11 +74,7 @@ export async function readScopeXml( zip: JSZip, scope: Scope, ): Promise { - const file = zip.file(scope.path); - if (file === null) { - throw new Error(`scope ${scope.path} not found in zip`); - } - return file.async("string"); + return readZipEntry(zip, scope.path); } /** diff --git a/src/docx/scrub-metadata.test.ts b/src/docx/scrub-metadata.test.ts index fd090d1..c281b36 100644 --- a/src/docx/scrub-metadata.test.ts +++ b/src/docx/scrub-metadata.test.ts @@ -24,6 +24,18 @@ const APP_XML = ` Microsoft Word `; +const CONTENT_TYPES_XML = ` + + + + +`; + +const CUSTOM_XML = ` + + kim@example.com +`; + describe("scrubMetadataXml", () => { it("zeroes out dc:creator", () => { const out = scrubMetadataXml(CORE_XML, ["creator"]); @@ -117,4 +129,41 @@ describe("scrubDocxMetadata", () => { const newCore = await zip.file("docProps/core.xml")!.async("string"); expect(newCore).not.toContain("Kim Chul-Soo"); }); + + it("removes docProps/custom.xml entirely when present", async () => { + const zip = new JSZip(); + zip.file("docProps/custom.xml", CUSTOM_XML); + zip.file("[Content_Types].xml", CONTENT_TYPES_XML); + + await scrubDocxMetadata(zip); + + expect(zip.file("docProps/custom.xml")).toBeNull(); + }); + + it("removes the custom.xml override from [Content_Types].xml", async () => { + const zip = new JSZip(); + zip.file("docProps/custom.xml", CUSTOM_XML); + zip.file("[Content_Types].xml", CONTENT_TYPES_XML); + + await scrubDocxMetadata(zip); + + const contentTypes = await zip.file("[Content_Types].xml")!.async("string"); + expect(contentTypes).not.toContain(`/docProps/custom.xml`); + expect(contentTypes).toContain(`/word/document.xml`); + }); + + it("leaves 
[Content_Types].xml alone when no custom override exists", async () => { + const zip = new JSZip(); + const contentTypes = CONTENT_TYPES_XML.replace( + /\s*]*PartName="\/docProps\/custom\.xml"[^>]*\/>/, + "", + ); + zip.file("[Content_Types].xml", contentTypes); + + await scrubDocxMetadata(zip); + + expect(await zip.file("[Content_Types].xml")!.async("string")).toBe( + contentTypes, + ); + }); }); diff --git a/src/docx/scrub-metadata.ts b/src/docx/scrub-metadata.ts index 4768bcd..2d563c3 100644 --- a/src/docx/scrub-metadata.ts +++ b/src/docx/scrub-metadata.ts @@ -14,6 +14,7 @@ import type JSZip from "jszip"; +import { readZipEntry } from "./load.js"; import { METADATA_SENSITIVE_FIELDS } from "./types.js"; /** @@ -42,19 +43,36 @@ export function scrubMetadataXml(xml: string, fields: ReadonlyArray): st /** * Apply the standard scrub policy to a DOCX zip in place. Reads * `docProps/core.xml` and `docProps/app.xml`, scrubs each, and writes them - * back. Idempotent. + * back. Removes `docProps/custom.xml` entirely because its schema is + * free-form and can hide arbitrary metadata payloads. Idempotent. 
*/ export async function scrubDocxMetadata(zip: JSZip): Promise { const targets = ["docProps/core.xml", "docProps/app.xml"]; for (const path of targets) { const file = zip.file(path); if (file === null) continue; - const xml = await file.async("string"); + const xml = await readZipEntry(zip, path); const cleaned = scrubMetadataXml(xml, METADATA_SENSITIVE_FIELDS); zip.file(path, cleaned); } + + if (zip.file("docProps/custom.xml") !== null) { + zip.remove("docProps/custom.xml"); + } + + if (zip.file("[Content_Types].xml") !== null) { + const xml = await readZipEntry(zip, "[Content_Types].xml"); + zip.file("[Content_Types].xml", removeCustomPropsOverride(xml)); + } } function escapeRegex(s: string): string { return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); } + +function removeCustomPropsOverride(xml: string): string { + return xml.replace( + /\s*]*PartName=["']\/docProps\/custom\.xml["'][^>]*\/>/g, + "", + ); +} diff --git a/src/docx/verify-surfaces.test.ts b/src/docx/verify-surfaces.test.ts index 1c10655..a20bc56 100644 --- a/src/docx/verify-surfaces.test.ts +++ b/src/docx/verify-surfaces.test.ts @@ -32,4 +32,13 @@ describe("verify-surfaces", () => { "https://example.com/second", ]); }); + + it("extracts relationship Target values from single-quoted attributes too", () => { + const rels = ``; + + expect(extractRelationshipTargets(rels)).toEqual([ + "https://example.com/first", + "mailto:second@example.com", + ]); + }); }); diff --git a/src/docx/verify-surfaces.ts b/src/docx/verify-surfaces.ts index 94aac53..32b7d38 100644 --- a/src/docx/verify-surfaces.ts +++ b/src/docx/verify-surfaces.ts @@ -1,6 +1,7 @@ import type JSZip from "jszip"; import { extractScopeText } from "../detection/extract-text.js"; +import { readZipEntry } from "./load.js"; import { listScopes, readScopeXml } from "./scopes.js"; import type { Scope } from "./types.js"; @@ -50,7 +51,7 @@ export async function collectVerifySurfaces(zip: JSZip): Promise const relsTargetSurfaces: RelsTargetSurface[] = 
[]; for (const path of listRelsPaths(zip)) { - const xml = await zip.file(path)!.async("string"); + const xml = await readZipEntry(zip, path); for (const text of extractRelationshipTargets(xml)) { relsTargetSurfaces.push({ kind: "rels-target", path, text }); } @@ -86,7 +87,7 @@ export function extractFldSimpleInstrValues(xml: string): readonly string[] { export function extractRelationshipTargets(relsXml: string): readonly string[] { const out: string[] = []; - const re = /]*\bTarget="([^"]*)"/g; + const re = /]*\bTarget=["']([^"']*)["']/g; let match: RegExpExecArray | null; while ((match = re.exec(relsXml)) !== null) { out.push(decodeXml(match[1] ?? "")); diff --git a/src/docx/verify.test.ts b/src/docx/verify.test.ts index c9f4cc3..e31cd48 100644 --- a/src/docx/verify.test.ts +++ b/src/docx/verify.test.ts @@ -179,6 +179,36 @@ describe("verifyRedaction", () => { expect(result.survived).toEqual([]); }); + it("fails verification when an external http URL survives in rels", async () => { + const zip = await syntheticDocx({ + "word/document.xml": bodyWith("[REDACTED]"), + "word/_rels/document.xml.rels": ``, + }); + const result = await verifyRedaction(zip, resolved("unrelated@example.com")); + expect(result.isClean).toBe(false); + expect(result.survived).toEqual([ + expect.objectContaining({ + text: "http://evil.example/track", + surface: "rels", + }), + ]); + }); + + it("fails verification when a single-quoted external https URL survives in rels", async () => { + const zip = await syntheticDocx({ + "word/document.xml": bodyWith("[REDACTED]"), + "word/_rels/document.xml.rels": ``, + }); + const result = await verifyRedaction(zip, resolved("unrelated@example.com")); + expect(result.isClean).toBe(false); + expect(result.survived).toEqual([ + expect.objectContaining({ + text: "https://evil.example/track", + surface: "rels", + }), + ]); + }); + it("enumerates multiple rels files in sorted path order", async () => { const zip = await syntheticDocx({ "word/document.xml": 
bodyWith("[REDACTED]"), diff --git a/src/docx/verify.ts b/src/docx/verify.ts index 3c973ac..bcfbe48 100644 --- a/src/docx/verify.ts +++ b/src/docx/verify.ts @@ -36,6 +36,7 @@ import { collectVerifySurfaces } from "./verify-surfaces.js"; import type { Scope } from "./types.js"; export type VerifySurfaceKind = "text" | "field" | "rels"; +const EXTERNAL_URL_TARGET_ID = "security:external-url"; /** One sensitive string that survived in one scope. */ export interface SurvivedString { @@ -121,6 +122,22 @@ export async function verifyRedaction( for (const surface of surfaces.relsTargetSurfaces) { const scope = { kind: "rels", path: surface.path } as unknown as Scope; + if (isExternalHttpUrl(surface.text)) { + mergeSurvival( + survivedByKey, + { + id: `${EXTERNAL_URL_TARGET_ID}:${surface.text}`, + displayText: surface.text, + redactionLiterals: [surface.text], + verificationLiterals: [surface.text], + scopes: [scope], + }, + scope, + "rels", + 1, + surface.text, + ); + } for (const target of activeTargets) { for (const literal of target.verificationLiterals) { const count = countOccurrences(surface.text, literal); @@ -207,3 +224,7 @@ function mergeSurvival( matchedLiteral: existing.matchedLiteral ?? 
/**
 * Report whether a relationship `Target` value is an absolute `http` or
 * `https` URL.
 *
 * URI schemes are case-insensitive, so `HTTP://` and `Https://` are treated
 * the same as their lowercase forms; leading whitespace is ignored because
 * URL consumers commonly trim it. Matching more loosely here makes the check
 * fail closed: a target that might still be fetched is reported rather than
 * silently accepted.
 *
 * @param text - Target value to inspect.
 * @returns `true` when the value starts (after optional whitespace) with
 *   `http://` or `https://` in any letter case; `false` otherwise, including
 *   for the empty string, relative paths and other schemes such as `mailto:`.
 */
function isExternalHttpUrl(text: string): boolean {
  return /^\s*https?:\/\//i.test(text);
}
const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + expect(rels).toContain(`Target=""`); + expect(rels).not.toContain("http://evil.example/pixel"); + }); + + it("strips double-quoted https URLs from rels", async () => { + const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + expect(rels).toContain(`Target=""`); + expect(rels).not.toContain("https://track.example/pixel"); + }); + + it("strips single-quoted https URLs from rels", async () => { + const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + expect(rels).toContain(`Target=''`); + expect(rels).not.toContain("https://track.example/pixel"); + }); + + it("preserves mailto targets in rels", async () => { + const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + expect(rels).toContain(`mailto:legal@example.com`); + }); + + it("preserves relative targets in rels", async () => { + const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + expect(rels).toContain(`Target="media/image1.png"`); + }); + + it("strips only external URLs in mixed rels content", async () => { + const zip = syntheticZip({ + "word/_rels/document.xml.rels": ``, + }); + + await applyRelsRepairsToZip(zip, new Map()); + + const rels = await zip.file("word/_rels/document.xml.rels")!.async("string"); + 
expect(rels).toContain(`Target=""`); + expect(rels).toContain(`Target="media/image1.png"`); + expect(rels).toContain(`Target="mailto:legal@example.com"`); + }); }); diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts index b088610..2d7e1eb 100644 --- a/src/finalize/preflight-expansion.ts +++ b/src/finalize/preflight-expansion.ts @@ -1,5 +1,6 @@ -import JSZip from "jszip"; +import type JSZip from "jszip"; +import { loadDocxZip, readZipEntry } from "../docx/load.js"; import { collectVerifySurfaces } from "../docx/verify-surfaces.js"; import type { ResolvedRedactionTarget } from "../selection-targets.js"; @@ -29,7 +30,7 @@ export async function buildPreflightExpansionPlan( }; } - const zip = await JSZip.loadAsync(bytes.slice()); + const zip = await loadDocxZip(bytes); const surfaces = await collectVerifySurfaces(zip); const extraLiterals = new Map>(); const relsRepairs = new Map>(); @@ -120,11 +121,12 @@ export async function applyRelsRepairsToZip( relsRepairs: ReadonlyMap, placeholder = "[REDACTED]", ): Promise { - for (const [path, literals] of relsRepairs) { - const file = zip.file(path); - if (file === null) continue; - const xml = await file.async("string"); - const repaired = repairRelationshipTargets(xml, literals, placeholder); + for (const path of listRelsPaths(zip)) { + const literals = relsRepairs.get(path) ?? 
/**
 * Blank out every `http://` / `https://` `Target` on `<Relationship>`
 * elements in a `.rels` document.
 *
 * Only the attribute value is emptied; the attribute itself, its original
 * quote character, and every other attribute are kept so the element stays
 * well-formed. Non-HTTP targets (relative part paths, `mailto:` links) and
 * `Target` attributes on any other element are left untouched.
 *
 * The match is case-insensitive because URI schemes are, and it tolerates
 * whitespace around `=` and before the URL so that such spellings cannot
 * slip past the strip step.
 *
 * @param relsXml - Text of a relationships part.
 * @returns The text with each external HTTP(S) target replaced by an empty
 *   value (`Target=""` or `Target=''`, matching the original quoting).
 */
function stripExternalUrls(relsXml: string): string {
  return relsXml.replace(
    // $1 = everything up to and including `Target=`, $2 = the opening quote.
    // `(?:(?!\2)[\s\S])*` consumes the value up to the matching close quote.
    /(<Relationship\b[^>]*\bTarget\s*=\s*)(["'])\s*https?:\/\/(?:(?!\2)[\s\S])*\2/gi,
    "$1$2$2",
  );
}