From 44a9933d20884d7ba4dad9164601d3228681e5ab Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Fri, 17 Apr 2026 00:42:19 +0900
Subject: [PATCH 1/6] fix(detection): recover original bytes in heuristic
 candidates

---
 .../_framework/recover-bytes.test.ts          | 55 +++++++++++++++++++
 src/detection/_framework/recover-bytes.ts     | 10 ++++
 src/detection/_framework/runner.ts            | 15 ++++-
 src/detection/_framework/types.ts             |  6 ++
 .../heuristics/capitalization-cluster.test.ts | 28 +++++++++-
 .../heuristics/capitalization-cluster.ts      | 15 ++++-
 .../heuristics/email-domain-inference.test.ts | 53 ++++++++++++++----
 .../heuristics/email-domain-inference.ts      | 21 ++++++-
 .../rules/heuristics/quoted-term.test.ts      | 27 ++++++++-
 src/detection/rules/heuristics/quoted-term.ts | 14 ++++-
 .../rules/heuristics/repeatability.test.ts    | 36 +++++++++++-
 .../rules/heuristics/repeatability.ts         | 20 ++++++-
 12 files changed, 279 insertions(+), 21 deletions(-)
 create mode 100644 src/detection/_framework/recover-bytes.test.ts
 create mode 100644 src/detection/_framework/recover-bytes.ts

diff --git a/src/detection/_framework/recover-bytes.test.ts b/src/detection/_framework/recover-bytes.test.ts
new file mode 100644
index 0000000..c61640c
--- /dev/null
+++ b/src/detection/_framework/recover-bytes.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from "vitest";
+
+import { normalizeForMatching } from "../normalize.js";
+
+import { recoverOriginalSlice } from "./recover-bytes.js";
+
+describe("recoverOriginalSlice", () => {
+  it("preserves smart quotes from the original text", () => {
+    const original = `prefix \u201CAcme Corp\u201D suffix`;
+    const map = normalizeForMatching(original);
+    const start = map.text.indexOf(`"Acme Corp"`);
+    const end = start + `"Acme Corp"`.length;
+    expect(recoverOriginalSlice(original, map, start, end)).toBe(
+      `\u201CAcme Corp\u201D`,
+    );
+  });
+
+  it("preserves fullwidth digits from the original text", () => {
+    const original = "Call \uFF10\uFF11\uFF12\uFF13 now";
+    const map = normalizeForMatching(original);
+    const start = map.text.indexOf("0123");
+    const end = start + "0123".length;
+    expect(recoverOriginalSlice(original, map, start, end)).toBe(
+      "\uFF10\uFF11\uFF12\uFF13",
+    );
+  });
+
+  it("passes ASCII slices through unchanged", () => {
+    const original = "Acme Corp";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 0, map.text.length)).toBe(
+      "Acme Corp",
+    );
+  });
+
+  it("supports startNorm = 0", () => {
+    const original = "\uFF21BC";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 0, 1)).toBe("\uFF21");
+  });
+
+  it("supports endNorm = text.length", () => {
+    const original = `\u201CAcme\u201D`;
+    const map = normalizeForMatching(original);
+    expect(
+      recoverOriginalSlice(original, map, 0, map.text.length),
+    ).toBe(`\u201CAcme\u201D`);
+  });
+
+  it("returns an empty string for an empty slice", () => {
+    const original = "Acme";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 2, 2)).toBe("");
+  });
+});
diff --git a/src/detection/_framework/recover-bytes.ts b/src/detection/_framework/recover-bytes.ts
new file mode 100644
index 0000000..eae504e
--- /dev/null
+++ b/src/detection/_framework/recover-bytes.ts
@@ -0,0 +1,10 @@
+import type { PositionMap } from "../normalize.js";
+
+export function recoverOriginalSlice(
+  originalText: string,
+  map: PositionMap,
+  startNorm: number,
+  endNorm: number,
+): string {
+  return originalText.slice(map.origOffsets[startNorm], map.origOffsets[endNorm]);
+}
diff --git a/src/detection/_framework/runner.ts b/src/detection/_framework/runner.ts
index 9337ce1..1b302b3 100644
--- a/src/detection/_framework/runner.ts
+++ b/src/detection/_framework/runner.ts
@@ -320,7 +320,18 @@ export function runHeuristicPhase(
   if (text.length === 0) return [];
   const map = normalizeForMatching(text);
   if (map.text.length === 0) return [];
-  return runHeuristicPhaseOnMap(map, level, heuristics, context, opts);
+  const heuristicContext: HeuristicContext = {
+    ...context,
+    originalText: text,
+    map,
+  };
+  return runHeuristicPhaseOnMap(
+    map,
+    level,
+    heuristics,
+    heuristicContext,
+    opts,
+  );
 }
 
 /**
@@ -462,6 +473,8 @@ export function runAllPhases(text: string, opts: RunAllOptions): RunAllResult {
     structuralDefinitions,
     priorCandidates: regexCandidates,
     documentLanguage,
+    originalText: text,
+    map,
   };
   const heuristicCandidates = runHeuristicPhaseOnMap(
     map,
diff --git a/src/detection/_framework/types.ts b/src/detection/_framework/types.ts
index 06775b7..78ae59f 100644
--- a/src/detection/_framework/types.ts
+++ b/src/detection/_framework/types.ts
@@ -1,3 +1,5 @@
+import type { PositionMap } from "../normalize.js";
+
 /**
  * Rule framework types — Phase 0.
  *
@@ -112,11 +114,15 @@ export interface Candidate {
  *  - structuralDefinitions (from structural phase) to skip D9 defined labels
  *  - priorCandidates (from regex phase) to avoid double-counting
  *  - documentLanguage (from runner) to filter role blacklists
+ *  - originalText + map (from runner) to recover original bytes for emitted
+ *    candidates without re-normalizing
  */
 export interface HeuristicContext {
   readonly structuralDefinitions: readonly StructuralDefinition[];
   readonly priorCandidates: readonly Candidate[];
   readonly documentLanguage: "ko" | "en" | "mixed";
+  readonly originalText?: string;
+  readonly map?: PositionMap;
 }
 
 /**
diff --git a/src/detection/rules/heuristics/capitalization-cluster.test.ts b/src/detection/rules/heuristics/capitalization-cluster.test.ts
index 9ce0a0d..d2960c8 100644
--- a/src/detection/rules/heuristics/capitalization-cluster.test.ts
+++ b/src/detection/rules/heuristics/capitalization-cluster.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 
+import { normalizeForMatching } from "../../normalize.js";
 import type { HeuristicContext } from "../../_framework/types.js";
 
 import { CAPITALIZATION_CLUSTER } from "./capitalization-cluster.js";
@@ -16,7 +17,12 @@ function makeContext(
 }
 
 function detect(text: string, ctx: HeuristicContext = makeContext()) {
-  return CAPITALIZATION_CLUSTER.detect(text, ctx);
+  const map = normalizeForMatching(text);
+  return CAPITALIZATION_CLUSTER.detect(map.text, {
+    ...ctx,
+    originalText: text,
+    map,
+  });
 }
 
 function expectFast(input: string, budgetMs = 100): void {
@@ -100,6 +106,26 @@ describe("heuristics.capitalization-cluster", () => {
     ]);
   });
 
+  it("recovers original bytes from smart-quoted input", () => {
+    expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed.")).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+        ruleId: "heuristics.capitalization-cluster",
+        confidence: 0.7,
+      },
+    ]);
+  });
+
+  it("preserves fullwidth ASCII letters in candidate.text", () => {
+    expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53 approved.")).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53",
+        ruleId: "heuristics.capitalization-cluster",
+        confidence: 0.7,
+      },
+    ]);
+  });
+
   it("is ReDoS-safe on a 10KB pathological input", () => {
     expectFast(`${"A".repeat(5000)} ${"B".repeat(5000)}`);
   });
diff --git a/src/detection/rules/heuristics/capitalization-cluster.ts b/src/detection/rules/heuristics/capitalization-cluster.ts
index b50a654..6c14987 100644
--- a/src/detection/rules/heuristics/capitalization-cluster.ts
+++ b/src/detection/rules/heuristics/capitalization-cluster.ts
@@ -9,8 +9,7 @@
  *   2. Prior candidate skip — already-found strings excluded
  *   3. Role blacklist — generic legal roles excluded
  *   4. Confidence 0.7 (moderate — caps clusters are common in English prose)
- *   5. Returns normalized text as candidate.text (ASCII letters are
- *      normalized losslessly, so normalized = original for this heuristic)
+ *   5. Recovers original bytes for candidate.text via HeuristicContext.map
  *
  * See docs/phases/phase-1-rulebook.md § 14.4.1
  */
@@ -20,6 +19,7 @@ import type {
   Heuristic,
   HeuristicContext,
 } from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
 import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
 
 export const CAPITALIZATION_CLUSTER: Heuristic = {
@@ -46,8 +46,17 @@ export const CAPITALIZATION_CLUSTER: Heuristic = {
       if (ROLE_BLACKLIST_EN.has(candidate.toLowerCase())) continue;
       const words = candidate.split(/\s+/);
       if (words.some((w) => ROLE_BLACKLIST_EN.has(w.toLowerCase()))) continue;
+      const original =
+        ctx.originalText && ctx.map
+          ? recoverOriginalSlice(
+              ctx.originalText,
+              ctx.map,
+              m.index,
+              m.index + candidate.length,
+            )
+          : candidate;
       out.push({
-        text: candidate,
+        text: original,
         ruleId: "heuristics.capitalization-cluster",
         confidence: 0.7,
       });
diff --git a/src/detection/rules/heuristics/email-domain-inference.test.ts b/src/detection/rules/heuristics/email-domain-inference.test.ts
index ca6fe5d..8430040 100644
--- a/src/detection/rules/heuristics/email-domain-inference.test.ts
+++ b/src/detection/rules/heuristics/email-domain-inference.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 
+import { normalizeForMatching } from "../../normalize.js";
 import type { Candidate, HeuristicContext } from "../../_framework/types.js";
 
 import { EMAIL_DOMAIN_INFERENCE } from "./email-domain-inference.js";
@@ -16,13 +17,18 @@ function makeContext(
   };
 }
 
-function detect(ctx: HeuristicContext) {
-  return EMAIL_DOMAIN_INFERENCE.detect("", ctx);
+function detect(text: string, ctx: HeuristicContext) {
+  const map = normalizeForMatching(text);
+  return EMAIL_DOMAIN_INFERENCE.detect(map.text, {
+    ...ctx,
+    originalText: text,
+    map,
+  });
 }
 
 function expectFast(ctx: HeuristicContext, budgetMs = 100): void {
   const start = performance.now();
-  void detect(ctx);
+  void detect("", ctx);
   const elapsed = performance.now() - start;
   expect(elapsed).toBeLessThan(budgetMs);
 }
@@ -51,7 +57,7 @@ describe("heuristics.email-domain-inference", () => {
       [{ text: "Samsung", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }],
     ],
   ])("%s", (_name, ctx, expected) => {
-    expect(detect(ctx)).toEqual(expected);
+    expect(detect("", ctx)).toEqual(expected);
   });
 
   it.each([
@@ -78,7 +84,7 @@ describe("heuristics.email-domain-inference", () => {
       [{ text: "Northwind", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }],
     ],
   ])("%s", (_name, ctx, expected) => {
-    expect(detect(ctx)).toEqual(expected);
+    expect(detect("", ctx)).toEqual(expected);
   });
 
   it.each([
@@ -101,7 +107,7 @@ describe("heuristics.email-domain-inference", () => {
       ]),
     ],
   ])("%s", (_name, ctx) => {
-    expect(detect(ctx)).toEqual([]);
+    expect(detect("", ctx)).toEqual([]);
   });
 
   it("skips inferred names that match structural-definition labels (D9)", () => {
@@ -117,7 +123,7 @@ describe("heuristics.email-domain-inference", () => {
         ],
       },
     );
-    expect(detect(ctx)).toEqual([]);
+    expect(detect("", ctx)).toEqual([]);
   });
 
   it("skips inferred names already present in priorCandidates", () => {
@@ -125,25 +131,26 @@ describe("heuristics.email-domain-inference", () => {
       { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
       { text: "Acme Corp", ruleId: "entities.en-corp-suffix", confidence: 1.0 },
     ]);
-    expect(detect(ctx)).toEqual([]);
+    expect(detect("", ctx)).toEqual([]);
   });
 
   it("skips blacklisted inferred names like Party", () => {
     const ctx = makeContext([
       { text: "legal@party.com", ruleId: "identifiers.email", confidence: 1.0 },
     ]);
-    expect(detect(ctx)).toEqual([]);
+    expect(detect("", ctx)).toEqual([]);
   });
 
   it("skips blacklisted inferred names like Company", () => {
     const ctx = makeContext([
       { text: "legal@company.com", ruleId: "identifiers.email", confidence: 1.0 },
     ]);
-    expect(detect(ctx)).toEqual([]);
+    expect(detect("", ctx)).toEqual([]);
   });
 
   it("emits 0.8 for corporate prefixes and 0.6 for personal prefixes", () => {
     const result = detect(
+      "",
       makeContext([
         { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
         { text: "john@beta.com", ruleId: "identifiers.email", confidence: 1.0 },
@@ -163,6 +170,32 @@ describe("heuristics.email-domain-inference", () => {
     ]);
   });
 
+  it("recovers original bytes from smart-quoted document occurrences", () => {
+    const ctx = makeContext([
+      { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
+    ]);
+    expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D is the counterparty.", ctx)).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+        ruleId: "heuristics.email-domain-inference",
+        confidence: 0.8,
+      },
+    ]);
+  });
+
+  it("preserves fullwidth digits when recovering inferred names from text", () => {
+    const ctx = makeContext([
+      { text: "legal@acme-123.com", ruleId: "identifiers.email", confidence: 1.0 },
+    ]);
+    expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13 responded.", ctx)).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13",
+        ruleId: "heuristics.email-domain-inference",
+        confidence: 0.8,
+      },
+    ]);
+  });
+
   it("is ReDoS-safe on a 10KB pathological input", () => {
     expectFast(
       makeContext([
diff --git a/src/detection/rules/heuristics/email-domain-inference.ts b/src/detection/rules/heuristics/email-domain-inference.ts
index 8f5137a..f139e36 100644
--- a/src/detection/rules/heuristics/email-domain-inference.ts
+++ b/src/detection/rules/heuristics/email-domain-inference.ts
@@ -19,6 +19,7 @@ import type {
   Heuristic,
   HeuristicContext,
 } from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
 import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
 
 /** Common TLDs to strip. */
@@ -39,6 +40,22 @@ function titleCase(s: string): string {
   return s[0]!.toUpperCase() + s.slice(1).toLowerCase();
 }
 
+function recoverInferredText(
+  normalizedText: string,
+  inferred: string,
+  ctx: HeuristicContext,
+): string {
+  if (!ctx.originalText || !ctx.map) return inferred;
+  const startNorm = normalizedText.indexOf(inferred);
+  if (startNorm < 0) return inferred;
+  return recoverOriginalSlice(
+    ctx.originalText,
+    ctx.map,
+    startNorm,
+    startNorm + inferred.length,
+  );
+}
+
 export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
   id: "heuristics.email-domain-inference",
   category: "heuristics",
@@ -47,7 +64,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
   levels: ["paranoid"],
   description:
     "Infer company name from email domain (legal@acme-corp.com → 'Acme Corp')",
-  detect(_text: string, ctx: HeuristicContext): readonly Candidate[] {
+  detect(text: string, ctx: HeuristicContext): readonly Candidate[] {
     const definedLabels = new Set(
       ctx.structuralDefinitions.map((d) => d.label),
     );
@@ -93,7 +110,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
       const confidence = CORPORATE_PREFIXES.has(localPart) ? 0.8 : 0.6;
 
       out.push({
-        text: inferred,
+        text: recoverInferredText(text, inferred, ctx),
         ruleId: "heuristics.email-domain-inference",
         confidence,
       });
diff --git a/src/detection/rules/heuristics/quoted-term.test.ts b/src/detection/rules/heuristics/quoted-term.test.ts
index 561a091..495a411 100644
--- a/src/detection/rules/heuristics/quoted-term.test.ts
+++ b/src/detection/rules/heuristics/quoted-term.test.ts
@@ -17,7 +17,12 @@ function makeContext(
 }
 
 function detectRaw(text: string, ctx: HeuristicContext = makeContext()) {
-  return QUOTED_TERM.detect(normalizeForMatching(text).text, ctx);
+  const map = normalizeForMatching(text);
+  return QUOTED_TERM.detect(map.text, {
+    ...ctx,
+    originalText: text,
+    map,
+  });
 }
 
 function expectFast(input: string, budgetMs = 100): void {
@@ -88,6 +93,26 @@ describe("heuristics.quoted-term", () => {
     ]);
   });
 
+  it("recovers original inner bytes from smart-quoted input", () => {
+    expect(detectRaw("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13\u201D shall survive.")).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13",
+        ruleId: "heuristics.quoted-term",
+        confidence: 0.6,
+      },
+    ]);
+  });
+
+  it("preserves fullwidth digits in candidate.text", () => {
+    expect(detectRaw('"\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13" shall survive.')).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13",
+        ruleId: "heuristics.quoted-term",
+        confidence: 0.6,
+      },
+    ]);
+  });
+
   it("is ReDoS-safe on a 10KB pathological input", () => {
     expectFast(`"${"A".repeat(10000)}"`);
   });
diff --git a/src/detection/rules/heuristics/quoted-term.ts b/src/detection/rules/heuristics/quoted-term.ts
index 118d650..f817e5b 100644
--- a/src/detection/rules/heuristics/quoted-term.ts
+++ b/src/detection/rules/heuristics/quoted-term.ts
@@ -18,6 +18,7 @@ import type {
   Heuristic,
   HeuristicContext,
 } from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
 import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
 import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js";
 
@@ -43,8 +44,19 @@ export const QUOTED_TERM: Heuristic = {
       if (priorTexts.has(inner)) continue;
       if (ROLE_BLACKLIST_EN.has(inner.toLowerCase())) continue;
       if (ROLE_BLACKLIST_KO.has(inner)) continue;
+      const innerStartNorm = m.index + 1;
+      const innerEndNorm = innerStartNorm + inner.length;
+      const original =
+        ctx.originalText && ctx.map
+          ? recoverOriginalSlice(
+              ctx.originalText,
+              ctx.map,
+              innerStartNorm,
+              innerEndNorm,
+            )
+          : inner;
       out.push({
-        text: inner,
+        text: original,
         ruleId: "heuristics.quoted-term",
         confidence: 0.6,
       });
diff --git a/src/detection/rules/heuristics/repeatability.test.ts b/src/detection/rules/heuristics/repeatability.test.ts
index b36ea1b..c2a611c 100644
--- a/src/detection/rules/heuristics/repeatability.test.ts
+++ b/src/detection/rules/heuristics/repeatability.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 
+import { normalizeForMatching } from "../../normalize.js";
 import type { HeuristicContext } from "../../_framework/types.js";
 
 import { REPEATABILITY } from "./repeatability.js";
@@ -16,7 +17,12 @@ function makeContext(
 }
 
 function detect(text: string, ctx: HeuristicContext = makeContext()) {
-  return REPEATABILITY.detect(text, ctx);
+  const map = normalizeForMatching(text);
+  return REPEATABILITY.detect(map.text, {
+    ...ctx,
+    originalText: text,
+    map,
+  });
 }
 
 function expectFast(input: string, budgetMs = 100): void {
@@ -123,6 +129,34 @@ describe("heuristics.repeatability", () => {
     ]);
   });
 
+  it("recovers original bytes from repeated smart-quoted input", () => {
+    expect(
+      detect(
+        "\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D approved. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D closed.",
+      ),
+    ).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+        ruleId: "heuristics.repeatability",
+        confidence: 0.5,
+      },
+    ]);
+  });
+
+  it("preserves fullwidth ASCII letters in repeated candidate.text", () => {
+    expect(
+      detect(
+        "\uFF21\uFF43\uFF4D\uFF45 signed. \uFF21\uFF43\uFF4D\uFF45 approved. \uFF21\uFF43\uFF4D\uFF45 closed.",
+      ),
+    ).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45",
+        ruleId: "heuristics.repeatability",
+        confidence: 0.5,
+      },
+    ]);
+  });
+
   it("is ReDoS-safe on a 10KB pathological input", () => {
     expectFast(`${"Acme ".repeat(2000)}${"삼성전자 ".repeat(1000)}`);
   });
diff --git a/src/detection/rules/heuristics/repeatability.ts b/src/detection/rules/heuristics/repeatability.ts
index db01c45..6ff29ed 100644
--- a/src/detection/rules/heuristics/repeatability.ts
+++ b/src/detection/rules/heuristics/repeatability.ts
@@ -19,6 +19,7 @@ import type {
   Heuristic,
   HeuristicContext,
 } from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
 import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
 import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js";
 
@@ -40,6 +41,7 @@ export const REPEATABILITY: Heuristic = {
     const priorTexts = new Set(ctx.priorCandidates.map((c) => c.text));
 
     const counts = new Map<string, number>();
+    const firstSpans = new Map<string, readonly [number, number]>();
 
     const enPattern =
       /(?<![A-Za-z])[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,3}(?![A-Za-z])/g;
@@ -47,12 +49,18 @@ export const REPEATABILITY: Heuristic = {
     while ((m = enPattern.exec(text)) !== null) {
       const token = m[0]!;
       counts.set(token, (counts.get(token) ?? 0) + 1);
+      if (!firstSpans.has(token)) {
+        firstSpans.set(token, [m.index, m.index + token.length]);
+      }
     }
 
     const koPattern = /(?<![가-힣])[가-힣]{2,6}(?![가-힣])/g;
     while ((m = koPattern.exec(text)) !== null) {
       const token = m[0]!;
       counts.set(token, (counts.get(token) ?? 0) + 1);
+      if (!firstSpans.has(token)) {
+        firstSpans.set(token, [m.index, m.index + token.length]);
+      }
     }
 
     const out: Candidate[] = [];
@@ -62,8 +70,18 @@ export const REPEATABILITY: Heuristic = {
       if (priorTexts.has(token)) continue;
       if (ROLE_BLACKLIST_EN.has(token.toLowerCase())) continue;
       if (ROLE_BLACKLIST_KO.has(token)) continue;
+      const span = firstSpans.get(token);
+      const original =
+        span && ctx.originalText && ctx.map
+          ? recoverOriginalSlice(
+              ctx.originalText,
+              ctx.map,
+              span[0],
+              span[1],
+            )
+          : token;
       out.push({
-        text: token,
+        text: original,
         ruleId: "heuristics.repeatability",
         confidence: 0.5,
       });

From 06f6244c5d9c285cff110d0a733d10f82504a864 Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Fri, 17 Apr 2026 23:45:43 +0900
Subject: [PATCH 2/6] fix(security): add ZIP size guard via loadDocxZip wrapper

---
 src/docx/limits.ts                  |  2 ++
 src/docx/load.test.ts               | 55 +++++++++++++++++++++++++++++
 src/docx/load.ts                    | 20 +++++++++++
 src/finalize/guided-recovery.ts     |  5 ++-
 src/finalize/preflight-expansion.ts |  5 +--
 src/ui/DocumentPreview.svelte       | 10 ++----
 src/ui/engine.ts                    |  7 ++--
 7 files changed, 88 insertions(+), 16 deletions(-)
 create mode 100644 src/docx/limits.ts
 create mode 100644 src/docx/load.test.ts
 create mode 100644 src/docx/load.ts

diff --git a/src/docx/limits.ts b/src/docx/limits.ts
new file mode 100644
index 0000000..9a70e8a
--- /dev/null
+++ b/src/docx/limits.ts
@@ -0,0 +1,2 @@
+export const MAX_INPUT_BYTES = 50 * 1024 * 1024; // 50 MB
+export const MAX_ENTRY_BYTES = 20 * 1024 * 1024; // 20 MB
diff --git a/src/docx/load.test.ts b/src/docx/load.test.ts
new file mode 100644
index 0000000..69c7811
--- /dev/null
+++ b/src/docx/load.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it, vi, afterEach } from "vitest";
+import JSZip from "jszip";
+
+import { MAX_INPUT_BYTES } from "./limits.js";
+import { FileTooLargeError, loadDocxZip } from "./load.js";
+
+async function makeZipBytes(): Promise<Uint8Array> {
+  const zip = new JSZip();
+  zip.file(
+    "word/document.xml",
+    `<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body></w:document>`,
+  );
+  return zip.generateAsync({ type: "uint8array" });
+}
+
+afterEach(() => {
+  vi.restoreAllMocks();
+});
+
+describe("loadDocxZip", () => {
+  it("rejects empty bytes", async () => {
+    await expect(loadDocxZip(new Uint8Array(0))).rejects.toBeInstanceOf(
+      FileTooLargeError,
+    );
+  });
+
+  it("accepts a valid docx-sized zip", async () => {
+    const bytes = await makeZipBytes();
+    const zip = await loadDocxZip(bytes);
+    expect(zip.file("word/document.xml")).not.toBeNull();
+  });
+
+  it("accepts the exact MAX_INPUT_BYTES boundary and delegates to JSZip", async () => {
+    const zip = new JSZip();
+    const spy = vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip);
+    const bytes = new Uint8Array(MAX_INPUT_BYTES);
+
+    await expect(loadDocxZip(bytes)).resolves.toBe(zip);
+    expect(spy).toHaveBeenCalledTimes(1);
+  });
+
+  it("rejects bytes larger than MAX_INPUT_BYTES", async () => {
+    const spy = vi.spyOn(JSZip, "loadAsync");
+    const bytes = new Uint8Array(MAX_INPUT_BYTES + 1);
+
+    await expect(loadDocxZip(bytes)).rejects.toBeInstanceOf(FileTooLargeError);
+    expect(spy).not.toHaveBeenCalled();
+  });
+
+  it("propagates JSZip errors for corrupt ZIP data", async () => {
+    await expect(loadDocxZip(new Uint8Array([1, 2, 3, 4]))).rejects.not.toBeInstanceOf(
+      FileTooLargeError,
+    );
+  });
+});
diff --git a/src/docx/load.ts b/src/docx/load.ts
new file mode 100644
index 0000000..9488477
--- /dev/null
+++ b/src/docx/load.ts
@@ -0,0 +1,20 @@
+import JSZip from "jszip";
+
+import { MAX_INPUT_BYTES } from "./limits.js";
+
+export class FileTooLargeError extends Error {
+  constructor(size: number, limit: number) {
+    super(`File size ${size} bytes exceeds limit of ${limit} bytes`);
+    this.name = "FileTooLargeError";
+  }
+}
+
+export async function loadDocxZip(bytes: Uint8Array): Promise<JSZip> {
+  if (bytes.length === 0) {
+    throw new FileTooLargeError(0, MAX_INPUT_BYTES);
+  }
+  if (bytes.length > MAX_INPUT_BYTES) {
+    throw new FileTooLargeError(bytes.length, MAX_INPUT_BYTES);
+  }
+  return JSZip.loadAsync(bytes.slice());
+}
diff --git a/src/finalize/guided-recovery.ts b/src/finalize/guided-recovery.ts
index b16acc9..eee31d1 100644
--- a/src/finalize/guided-recovery.ts
+++ b/src/finalize/guided-recovery.ts
@@ -1,5 +1,4 @@
-import JSZip from "jszip";
-
+import { loadDocxZip } from "../docx/load.js";
 import {
   finalizeRedaction,
   type FinalizeOptions,
@@ -261,7 +260,7 @@ async function defaultRepairPass(
   repairPlan: RepairPlan,
   options: GuidedRecoveryOptions,
 ): Promise<FinalizedReport> {
-  const zip = await JSZip.loadAsync(bytes.slice());
+  const zip = await loadDocxZip(bytes);
   await applyRelsRepairsToZip(
     zip,
     repairPlan.relsRepairs,
diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts
index b088610..345c464 100644
--- a/src/finalize/preflight-expansion.ts
+++ b/src/finalize/preflight-expansion.ts
@@ -1,5 +1,6 @@
-import JSZip from "jszip";
+import type JSZip from "jszip";
 
+import { loadDocxZip } from "../docx/load.js";
 import { collectVerifySurfaces } from "../docx/verify-surfaces.js";
 import type { ResolvedRedactionTarget } from "../selection-targets.js";
 
@@ -29,7 +30,7 @@ export async function buildPreflightExpansionPlan(
     };
   }
 
-  const zip = await JSZip.loadAsync(bytes.slice());
+  const zip = await loadDocxZip(bytes);
   const surfaces = await collectVerifySurfaces(zip);
   const extraLiterals = new Map<string, Set<string>>();
   const relsRepairs = new Map<string, Set<string>>();
diff --git a/src/ui/DocumentPreview.svelte b/src/ui/DocumentPreview.svelte
index 68bf34a..31f3311 100644
--- a/src/ui/DocumentPreview.svelte
+++ b/src/ui/DocumentPreview.svelte
@@ -14,12 +14,8 @@
     have that can come later.
 -->
 <script lang="ts">
-  import JSZip from "jszip";
-
-  import {
-    renderDocumentBody,
-    type RenderedDocument,
-  } from "../docx/render-body.js";
+  import { loadDocxZip } from "../docx/load.js";
+  import { renderDocumentBody, type RenderedDocument } from "../docx/render-body.js";
   import RenderedBody from "./RenderedBody.svelte";
   import { appState, type AppPhase } from "./state.svelte.ts";
 
@@ -102,7 +98,7 @@
     if (cached !== undefined) return cached;
 
     const promise = (async () => {
-      const zip = await JSZip.loadAsync(bytes.slice());
+      const zip = await loadDocxZip(bytes);
       return await renderDocumentBody(zip);
     })();
     renderedDocCache.set(bytes, promise);
diff --git a/src/ui/engine.ts b/src/ui/engine.ts
index 088ddab..48b420d 100644
--- a/src/ui/engine.ts
+++ b/src/ui/engine.ts
@@ -29,8 +29,6 @@
  * which is what makes both halves independently testable.
  */
 
-import JSZip from "jszip";
-
 import {
   detectAllInZip,
   type ScopedCandidate,
@@ -40,6 +38,7 @@ import { extractTextFromZip } from "../detection/extract-text.js";
 import { normalizeForMatching } from "../detection/normalize.js";
 import { ROLE_BLACKLIST_EN } from "../detection/rules/role-blacklist-en.js";
 import { ROLE_BLACKLIST_KO } from "../detection/rules/role-blacklist-ko.js";
+import { loadDocxZip } from "../docx/load.js";
 import { listScopes } from "../docx/scopes.js";
 import type { Scope } from "../docx/types.js";
 import {
@@ -177,7 +176,7 @@ export async function analyzeZip(
   // whatever the caller handed us. Avoids subtle bugs where a second
   // call to analyzeZip sees mutations JSZip made to the underlying
   // buffer during its own processing.
-  const zip = await JSZip.loadAsync(bytes.slice());
+  const zip = await loadDocxZip(bytes);
 
   // File stats — size of the input + number of text-bearing scopes.
   // sizeBytes comes from the caller's view (the bytes they actually
@@ -301,7 +300,7 @@ export async function applyRedaction(
     resolveSelectedTargets(analysis.selectionTargets, selections);
   const preflightPlan = await buildPreflightExpansionPlan(bytes, selectedTargets);
   // Fresh reload every time — see docstring.
-  const zip = await JSZip.loadAsync(bytes.slice());
+  const zip = await loadDocxZip(bytes);
   await applyRelsRepairsToZip(
     zip,
     preflightPlan.relsRepairs,

From ce538749e28509c99c48ef857cd0507011af40d1 Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Fri, 17 Apr 2026 23:47:12 +0900
Subject: [PATCH 3/6] fix(security): add per-entry size guard via readZipEntry

---
 src/docx/load.test.ts               | 57 ++++++++++++++++++++++++++++-
 src/docx/load.ts                    | 24 +++++++++++-
 src/docx/scopes.ts                  |  7 +---
 src/docx/scrub-metadata.ts          |  3 +-
 src/docx/verify-surfaces.ts         |  3 +-
 src/finalize/preflight-expansion.ts |  4 +-
 6 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/src/docx/load.test.ts b/src/docx/load.test.ts
index 69c7811..a9143be 100644
--- a/src/docx/load.test.ts
+++ b/src/docx/load.test.ts
@@ -1,8 +1,13 @@
 import { describe, expect, it, vi, afterEach } from "vitest";
 import JSZip from "jszip";
 
-import { MAX_INPUT_BYTES } from "./limits.js";
-import { FileTooLargeError, loadDocxZip } from "./load.js";
+import { MAX_ENTRY_BYTES, MAX_INPUT_BYTES } from "./limits.js";
+import {
+  EntryTooLargeError,
+  FileTooLargeError,
+  loadDocxZip,
+  readZipEntry,
+} from "./load.js";
 
 async function makeZipBytes(): Promise<Uint8Array> {
   const zip = new JSZip();
@@ -53,3 +58,51 @@ describe("loadDocxZip", () => {
     );
   });
 });
+
+describe("readZipEntry", () => {
+  it("returns a string for a normal entry", async () => {
+    const zip = new JSZip();
+    zip.file("word/document.xml", "<w:document/>");
+
+    await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe(
+      "<w:document/>",
+    );
+  });
+
+  it("throws when the entry does not exist", async () => {
+    const zip = new JSZip();
+    await expect(readZipEntry(zip, "missing.xml")).rejects.toThrow(
+      "ZIP entry not found: missing.xml",
+    );
+  });
+
+  it("throws EntryTooLargeError for oversized decompressed content", async () => {
+    const zip = {
+      file(path: string) {
+        if (path !== "word/document.xml") return null;
+        return {
+          async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES + 1)),
+        };
+      },
+    } as unknown as JSZip;
+
+    await expect(readZipEntry(zip, "word/document.xml")).rejects.toBeInstanceOf(
+      EntryTooLargeError,
+    );
+  });
+
+  it("accepts content at the exact MAX_ENTRY_BYTES boundary", async () => {
+    const zip = {
+      file(path: string) {
+        if (path !== "word/document.xml") return null;
+        return {
+          async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES)),
+        };
+      },
+    } as unknown as JSZip;
+
+    await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe(
+      "x".repeat(MAX_ENTRY_BYTES),
+    );
+  });
+});
diff --git a/src/docx/load.ts b/src/docx/load.ts
index 9488477..b6afade 100644
--- a/src/docx/load.ts
+++ b/src/docx/load.ts
@@ -1,6 +1,6 @@
 import JSZip from "jszip";
 
-import { MAX_INPUT_BYTES } from "./limits.js";
+import { MAX_ENTRY_BYTES, MAX_INPUT_BYTES } from "./limits.js";
 
 export class FileTooLargeError extends Error {
   constructor(size: number, limit: number) {
@@ -9,6 +9,13 @@ export class FileTooLargeError extends Error {
   }
 }
 
+export class EntryTooLargeError extends Error {
+  constructor(path: string, size: number, limit: number) {
+    super(`ZIP entry "${path}" decompressed to ${size} bytes, exceeds limit of ${limit} bytes`);
+    this.name = "EntryTooLargeError";
+  }
+}
+
 export async function loadDocxZip(bytes: Uint8Array): Promise<JSZip> {
   if (bytes.length === 0) {
     throw new FileTooLargeError(0, MAX_INPUT_BYTES);
@@ -18,3 +25,43 @@ export async function loadDocxZip(bytes: Uint8Array): Promise<JSZip> {
   }
   return JSZip.loadAsync(bytes.slice());
 }
+
+/**
+ * Read the ZIP entry at `path` as a string, rejecting entries whose
+ * decompressed content exceeds `MAX_ENTRY_BYTES`.
+ *
+ * The limit is enforced on the UTF-8 byte length of the content rather than
+ * on `string.length`, which counts UTF-16 code units: one CJK character is a
+ * single code unit but three UTF-8 bytes, so a length-based check would let
+ * such content through at up to three times the limit.
+ *
+ * NOTE(review): the check runs after `file.async("string")` has already
+ * produced the full string, so it bounds what callers receive, not the
+ * memory used by the read itself.
+ *
+ * @param zip - Archive to read from.
+ * @param path - Path of the entry inside the archive.
+ * @returns The entry content as a string.
+ * @throws Error when no entry exists at `path`.
+ * @throws EntryTooLargeError when the content exceeds `MAX_ENTRY_BYTES`.
+ */
+export async function readZipEntry(
+  zip: JSZip,
+  path: string,
+): Promise<string> {
+  const file = zip.file(path);
+  if (file === null) {
+    throw new Error(`ZIP entry not found: ${path}`);
+  }
+  const content = await file.async("string");
+  // length <= UTF-8 bytes <= 3 * length, so the exact (allocating) count is
+  // only needed when those two bounds fall on opposite sides of the limit.
+  let size = content.length;
+  if (size <= MAX_ENTRY_BYTES && size * 3 > MAX_ENTRY_BYTES) {
+    size = new TextEncoder().encode(content).length;
+  }
+  if (size > MAX_ENTRY_BYTES) {
+    throw new EntryTooLargeError(path, size, MAX_ENTRY_BYTES);
+  }
+  return content;
+}
diff --git a/src/docx/scopes.ts b/src/docx/scopes.ts
index 8f1f6c3..b94e014 100644
--- a/src/docx/scopes.ts
+++ b/src/docx/scopes.ts
@@ -12,6 +12,7 @@
 
 import type JSZip from "jszip";
 
+import { readZipEntry } from "./load.js";
 import { SCOPE_PATTERNS, type Scope, type ScopeKind } from "./types.js";
 
 /**
@@ -73,11 +74,7 @@ export async function readScopeXml(
   zip: JSZip,
   scope: Scope,
 ): Promise<string> {
-  const file = zip.file(scope.path);
-  if (file === null) {
-    throw new Error(`scope ${scope.path} not found in zip`);
-  }
-  return file.async("string");
+  return readZipEntry(zip, scope.path);
 }
 
 /**
diff --git a/src/docx/scrub-metadata.ts b/src/docx/scrub-metadata.ts
index 4768bcd..44ec306 100644
--- a/src/docx/scrub-metadata.ts
+++ b/src/docx/scrub-metadata.ts
@@ -14,6 +14,7 @@
 
 import type JSZip from "jszip";
 
+import { readZipEntry } from "./load.js";
 import { METADATA_SENSITIVE_FIELDS } from "./types.js";
 
 /**
@@ -49,7 +50,7 @@ export async function scrubDocxMetadata(zip: JSZip): Promise<void> {
   for (const path of targets) {
     const file = zip.file(path);
     if (file === null) continue;
-    const xml = await file.async("string");
+    const xml = await readZipEntry(zip, path);
     const cleaned = scrubMetadataXml(xml, METADATA_SENSITIVE_FIELDS);
     zip.file(path, cleaned);
   }
diff --git a/src/docx/verify-surfaces.ts b/src/docx/verify-surfaces.ts
index 94aac53..82d751c 100644
--- a/src/docx/verify-surfaces.ts
+++ b/src/docx/verify-surfaces.ts
@@ -1,6 +1,7 @@
 import type JSZip from "jszip";
 
 import { extractScopeText } from "../detection/extract-text.js";
+import { readZipEntry } from "./load.js";
 import { listScopes, readScopeXml } from "./scopes.js";
 import type { Scope } from "./types.js";
 
@@ -50,7 +51,7 @@ export async function collectVerifySurfaces(zip: JSZip): Promise<VerifySurfaces>
 
   const relsTargetSurfaces: RelsTargetSurface[] = [];
   for (const path of listRelsPaths(zip)) {
-    const xml = await zip.file(path)!.async("string");
+    const xml = await readZipEntry(zip, path);
     for (const text of extractRelationshipTargets(xml)) {
       relsTargetSurfaces.push({ kind: "rels-target", path, text });
     }
diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts
index 345c464..0f94c29 100644
--- a/src/finalize/preflight-expansion.ts
+++ b/src/finalize/preflight-expansion.ts
@@ -1,6 +1,6 @@
 import type JSZip from "jszip";
 
-import { loadDocxZip } from "../docx/load.js";
+import { loadDocxZip, readZipEntry } from "../docx/load.js";
 import { collectVerifySurfaces } from "../docx/verify-surfaces.js";
 import type { ResolvedRedactionTarget } from "../selection-targets.js";
 
@@ -124,7 +124,7 @@ export async function applyRelsRepairsToZip(
   for (const [path, literals] of relsRepairs) {
     const file = zip.file(path);
     if (file === null) continue;
-    const xml = await file.async("string");
+    const xml = await readZipEntry(zip, path);
     const repaired = repairRelationshipTargets(xml, literals, placeholder);
     zip.file(path, repaired);
   }

From 3a4f5f1b08417991730193a189f4821967150fff Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Fri, 17 Apr 2026 23:49:57 +0900
Subject: [PATCH 4/6] fix(security): strip external URLs from .rels in redacted
 output

---
 src/docx/verify-surfaces.test.ts         |  9 +++
 src/docx/verify-surfaces.ts              |  2 +-
 src/docx/verify.test.ts                  | 30 +++++++++
 src/docx/verify.ts                       | 21 ++++++
 src/finalize/preflight-expansion.test.ts | 84 +++++++++++++++++++++++-
 src/finalize/preflight-expansion.ts      | 32 +++++++--
 6 files changed, 172 insertions(+), 6 deletions(-)

diff --git a/src/docx/verify-surfaces.test.ts b/src/docx/verify-surfaces.test.ts
index 1c10655..a20bc56 100644
--- a/src/docx/verify-surfaces.test.ts
+++ b/src/docx/verify-surfaces.test.ts
@@ -32,4 +32,13 @@ describe("verify-surfaces", () => {
       "https://example.com/second",
     ]);
   });
+
+  it("extracts relationship Target values from single-quoted attributes too", () => {
+    const rels = `<?xml version="1.0"?><Relationships xmlns="x"><Relationship Id="rId1" Target='https://example.com/first'/><Relationship Id="rId2" Target='mailto:second@example.com'/></Relationships>`;
+
+    expect(extractRelationshipTargets(rels)).toEqual([
+      "https://example.com/first",
+      "mailto:second@example.com",
+    ]);
+  });
 });
diff --git a/src/docx/verify-surfaces.ts b/src/docx/verify-surfaces.ts
index 82d751c..32b7d38 100644
--- a/src/docx/verify-surfaces.ts
+++ b/src/docx/verify-surfaces.ts
@@ -87,7 +87,7 @@ export function extractFldSimpleInstrValues(xml: string): readonly string[] {
 
 export function extractRelationshipTargets(relsXml: string): readonly string[] {
   const out: string[] = [];
-  const re = /<Relationship\b[^>]*\bTarget="([^"]*)"/g;
+  const re = /<Relationship\b[^>]*\bTarget=["']([^"']*)["']/g;
   let match: RegExpExecArray | null;
   while ((match = re.exec(relsXml)) !== null) {
     out.push(decodeXml(match[1] ?? ""));
diff --git a/src/docx/verify.test.ts b/src/docx/verify.test.ts
index c9f4cc3..e31cd48 100644
--- a/src/docx/verify.test.ts
+++ b/src/docx/verify.test.ts
@@ -179,6 +179,36 @@ describe("verifyRedaction", () => {
     expect(result.survived).toEqual([]);
   });
 
+  it("fails verification when an external http URL survives in rels", async () => {
+    const zip = await syntheticDocx({
+      "word/document.xml": bodyWith("[REDACTED]"),
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships xmlns="x"><Relationship Target="http://evil.example/track"/></Relationships>`,
+    });
+    const result = await verifyRedaction(zip, resolved("unrelated@example.com"));
+    expect(result.isClean).toBe(false);
+    expect(result.survived).toEqual([
+      expect.objectContaining({
+        text: "http://evil.example/track",
+        surface: "rels",
+      }),
+    ]);
+  });
+
+  it("fails verification when a single-quoted external https URL survives in rels", async () => {
+    const zip = await syntheticDocx({
+      "word/document.xml": bodyWith("[REDACTED]"),
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships xmlns="x"><Relationship Target='https://evil.example/track'/></Relationships>`,
+    });
+    const result = await verifyRedaction(zip, resolved("unrelated@example.com"));
+    expect(result.isClean).toBe(false);
+    expect(result.survived).toEqual([
+      expect.objectContaining({
+        text: "https://evil.example/track",
+        surface: "rels",
+      }),
+    ]);
+  });
+
   it("enumerates multiple rels files in sorted path order", async () => {
     const zip = await syntheticDocx({
       "word/document.xml": bodyWith("[REDACTED]"),
diff --git a/src/docx/verify.ts b/src/docx/verify.ts
index 3c973ac..bcfbe48 100644
--- a/src/docx/verify.ts
+++ b/src/docx/verify.ts
@@ -36,6 +36,7 @@ import { collectVerifySurfaces } from "./verify-surfaces.js";
 import type { Scope } from "./types.js";
 
 export type VerifySurfaceKind = "text" | "field" | "rels";
+const EXTERNAL_URL_TARGET_ID = "security:external-url";
 
 /** One sensitive string that survived in one scope. */
 export interface SurvivedString {
@@ -121,6 +122,22 @@ export async function verifyRedaction(
 
   for (const surface of surfaces.relsTargetSurfaces) {
     const scope = { kind: "rels", path: surface.path } as unknown as Scope;
+    if (isExternalHttpUrl(surface.text)) {
+      mergeSurvival(
+        survivedByKey,
+        {
+          id: `${EXTERNAL_URL_TARGET_ID}:${surface.text}`,
+          displayText: surface.text,
+          redactionLiterals: [surface.text],
+          verificationLiterals: [surface.text],
+          scopes: [scope],
+        },
+        scope,
+        "rels",
+        1,
+        surface.text,
+      );
+    }
     for (const target of activeTargets) {
       for (const literal of target.verificationLiterals) {
         const count = countOccurrences(surface.text, literal);
@@ -207,3 +224,16 @@ function mergeSurvival(
     matchedLiteral: existing.matchedLiteral ?? matchedLiteral,
   });
 }
+
+/**
+ * Whether a relationship target is an absolute `http://` or `https://` URL.
+ *
+ * The scheme is compared case-insensitively because URI schemes are
+ * case-insensitive (RFC 3986 §3.1): `HTTPS://…` is as external as `https://…`.
+ *
+ * @param text - Relationship target text to classify.
+ * @returns `true` when `text` begins with an http(s) scheme in any letter case.
+ */
+function isExternalHttpUrl(text: string): boolean {
+  return /^https?:\/\//i.test(text);
+}
diff --git a/src/finalize/preflight-expansion.test.ts b/src/finalize/preflight-expansion.test.ts
index 4b778a9..7a63ccd 100644
--- a/src/finalize/preflight-expansion.test.ts
+++ b/src/finalize/preflight-expansion.test.ts
@@ -2,7 +2,10 @@ import { describe, expect, it } from "vitest";
 import JSZip from "jszip";
 
 import { buildResolvedTargetsFromStrings } from "../selection-targets.js";
-import { buildPreflightExpansionPlan } from "./preflight-expansion.js";
+import {
+  applyRelsRepairsToZip,
+  buildPreflightExpansionPlan,
+} from "./preflight-expansion.js";
 import type { ResolvedRedactionTarget } from "../selection-targets.js";
 
 const W_NS = `xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"`;
@@ -15,6 +18,14 @@ async function syntheticDocx(parts: Record<string, string>): Promise<Uint8Array>
   return zip.generateAsync({ type: "uint8array" });
 }
 
+function syntheticZip(parts: Record<string, string>): JSZip {
+  const zip = new JSZip();
+  for (const [path, content] of Object.entries(parts)) {
+    zip.file(path, content);
+  }
+  return zip;
+}
+
 function bodyWith(text: string): string {
   return `<w:document ${W_NS}><w:body><w:p><w:r><w:t>${text}</w:t></w:r></w:p></w:body></w:document>`;
 }
@@ -144,4 +155,75 @@ describe("preflight-expansion", () => {
       expandedLiteralCount: 0,
     });
   });
+
+  it("strips double-quoted http URLs from rels", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target="http://evil.example/pixel"/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`Target=""`);
+    expect(rels).not.toContain("http://evil.example/pixel");
+  });
+
+  it("strips double-quoted https URLs from rels", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target="https://track.example/pixel"/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`Target=""`);
+    expect(rels).not.toContain("https://track.example/pixel");
+  });
+
+  it("strips single-quoted https URLs from rels", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target='https://track.example/pixel'/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`Target=''`);
+    expect(rels).not.toContain("https://track.example/pixel");
+  });
+
+  it("preserves mailto targets in rels", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target="mailto:legal@example.com"/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`mailto:legal@example.com`);
+  });
+
+  it("preserves relative targets in rels", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target="media/image1.png"/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`Target="media/image1.png"`);
+  });
+
+  it("strips only external URLs in mixed rels content", async () => {
+    const zip = syntheticZip({
+      "word/_rels/document.xml.rels": `<?xml version="1.0"?><Relationships><Relationship Target="https://track.example/pixel"/><Relationship Target="media/image1.png"/><Relationship Target="mailto:legal@example.com"/></Relationships>`,
+    });
+
+    await applyRelsRepairsToZip(zip, new Map());
+
+    const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+    expect(rels).toContain(`Target=""`);
+    expect(rels).toContain(`Target="media/image1.png"`);
+    expect(rels).toContain(`Target="mailto:legal@example.com"`);
+  });
 });
diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts
index 0f94c29..2d7e1eb 100644
--- a/src/finalize/preflight-expansion.ts
+++ b/src/finalize/preflight-expansion.ts
@@ -121,11 +121,12 @@ export async function applyRelsRepairsToZip(
   relsRepairs: ReadonlyMap<string, readonly string[]>,
   placeholder = "[REDACTED]",
 ): Promise<void> {
-  for (const [path, literals] of relsRepairs) {
-    const file = zip.file(path);
-    if (file === null) continue;
+  for (const path of listRelsPaths(zip)) {
+    const literals = relsRepairs.get(path) ?? [];
     const xml = await readZipEntry(zip, path);
-    const repaired = repairRelationshipTargets(xml, literals, placeholder);
+    const repaired = stripExternalUrls(
+      repairRelationshipTargets(xml, literals, placeholder),
+    );
     zip.file(path, repaired);
   }
 }
@@ -154,6 +155,26 @@ function repairRelationshipTargets(
   );
 }
 
+/**
+ * Blank the `Target` attribute of every `<Relationship>` element whose value
+ * is an absolute `http://` or `https://` URL, keeping the attribute's original
+ * quote style. Any other target value is left as it is.
+ *
+ * The scheme letters are matched in either case because URI schemes are
+ * case-insensitive (RFC 3986 §3.1); element and attribute names stay
+ * case-sensitive, as XML names are.
+ *
+ * @param relsXml - XML text of one `.rels` part.
+ * @returns The XML with each external http(s) target replaced by an empty value.
+ */
+function stripExternalUrls(relsXml: string): string {
+  const doubleQuoted =
+    /(<Relationship\b[^>]*\bTarget=)"[Hh][Tt][Tt][Pp][Ss]?:\/\/[^"]*"/g;
+  const singleQuoted =
+    /(<Relationship\b[^>]*\bTarget=)'[Hh][Tt][Tt][Pp][Ss]?:\/\/[^']*'/g;
+  return relsXml.replace(doubleQuoted, '$1""').replace(singleQuoted, "$1''");
+}
+
 function idleSummary(): PreflightExpansionSummary {
   return {
     touchedScopePaths: [],
@@ -192,3 +205,20 @@ function decodeXml(text: string): string {
       String.fromCodePoint(Number.parseInt(dec, 10)),
     );
 }
+
+/**
+ * Collect the paths of the `.rels` entries in the archive.
+ *
+ * @param zip - Archive to scan.
+ * @returns Paths of the non-directory entries ending in `.rels`, sorted with
+ *   the default `Array.prototype.sort` ordering.
+ */
+function listRelsPaths(zip: JSZip): string[] {
+  const relsPaths: string[] = [];
+  zip.forEach((entryPath, entry) => {
+    const isRelsFile = !entry.dir && entryPath.endsWith(".rels");
+    if (isRelsFile) relsPaths.push(entryPath);
+  });
+  relsPaths.sort();
+  return relsPaths;
+}

From 247376f0d5561035ee84e6a792fc2f79ca3370ee Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Fri, 17 Apr 2026 23:52:11 +0900
Subject: [PATCH 5/6] fix(security): scrub docProps/custom.xml from redacted
 output

---
 src/docx/scrub-metadata.test.ts | 49 +++++++++++++++++++++++++++++++++
 src/docx/scrub-metadata.ts      | 19 ++++++++++++-
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/src/docx/scrub-metadata.test.ts b/src/docx/scrub-metadata.test.ts
index fd090d1..c281b36 100644
--- a/src/docx/scrub-metadata.test.ts
+++ b/src/docx/scrub-metadata.test.ts
@@ -24,6 +24,18 @@ const APP_XML = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
   <Application>Microsoft Word</Application>
 </Properties>`;
 
+const CONTENT_TYPES_XML = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+  <Override PartName="/docProps/custom.xml" ContentType="application/vnd.openxmlformats-officedocument.custom-properties+xml"/>
+</Types>`;
+
+const CUSTOM_XML = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/custom-properties">
+  <property name="AuthorEmail"><vt:lpwstr xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">kim@example.com</vt:lpwstr></property>
+</Properties>`;
+
 describe("scrubMetadataXml", () => {
   it("zeroes out dc:creator", () => {
     const out = scrubMetadataXml(CORE_XML, ["creator"]);
@@ -117,4 +129,41 @@ describe("scrubDocxMetadata", () => {
     const newCore = await zip.file("docProps/core.xml")!.async("string");
     expect(newCore).not.toContain("Kim Chul-Soo");
   });
+
+  it("removes docProps/custom.xml entirely when present", async () => {
+    const zip = new JSZip();
+    zip.file("docProps/custom.xml", CUSTOM_XML);
+    zip.file("[Content_Types].xml", CONTENT_TYPES_XML);
+
+    await scrubDocxMetadata(zip);
+
+    expect(zip.file("docProps/custom.xml")).toBeNull();
+  });
+
+  it("removes the custom.xml override from [Content_Types].xml", async () => {
+    const zip = new JSZip();
+    zip.file("docProps/custom.xml", CUSTOM_XML);
+    zip.file("[Content_Types].xml", CONTENT_TYPES_XML);
+
+    await scrubDocxMetadata(zip);
+
+    const contentTypes = await zip.file("[Content_Types].xml")!.async("string");
+    expect(contentTypes).not.toContain(`/docProps/custom.xml`);
+    expect(contentTypes).toContain(`/word/document.xml`);
+  });
+
+  it("leaves [Content_Types].xml alone when no custom override exists", async () => {
+    const zip = new JSZip();
+    const contentTypes = CONTENT_TYPES_XML.replace(
+      /\s*<Override\b[^>]*PartName="\/docProps\/custom\.xml"[^>]*\/>/,
+      "",
+    );
+    zip.file("[Content_Types].xml", contentTypes);
+
+    await scrubDocxMetadata(zip);
+
+    expect(await zip.file("[Content_Types].xml")!.async("string")).toBe(
+      contentTypes,
+    );
+  });
 });
diff --git a/src/docx/scrub-metadata.ts b/src/docx/scrub-metadata.ts
index 44ec306..2d563c3 100644
--- a/src/docx/scrub-metadata.ts
+++ b/src/docx/scrub-metadata.ts
@@ -43,7 +43,8 @@ export function scrubMetadataXml(xml: string, fields: ReadonlyArray<string>): st
 /**
  * Apply the standard scrub policy to a DOCX zip in place. Reads
  * `docProps/core.xml` and `docProps/app.xml`, scrubs each, and writes them
- * back. Idempotent.
+ * back. Removes `docProps/custom.xml` entirely because its schema is
+ * free-form and can hide arbitrary metadata payloads. Idempotent.
  */
 export async function scrubDocxMetadata(zip: JSZip): Promise<void> {
   const targets = ["docProps/core.xml", "docProps/app.xml"];
@@ -54,8 +55,24 @@ export async function scrubDocxMetadata(zip: JSZip): Promise<void> {
     const cleaned = scrubMetadataXml(xml, METADATA_SENSITIVE_FIELDS);
     zip.file(path, cleaned);
   }
+
+  if (zip.file("docProps/custom.xml") !== null) {
+    zip.remove("docProps/custom.xml");
+  }
+
+  if (zip.file("[Content_Types].xml") !== null) {
+    const xml = await readZipEntry(zip, "[Content_Types].xml");
+    zip.file("[Content_Types].xml", removeCustomPropsOverride(xml));
+  }
 }
 
 function escapeRegex(s: string): string {
   return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
 }
+
+function removeCustomPropsOverride(xml: string): string {
+  return xml.replace(
+    /\s*<Override\b[^>]*PartName=["']\/docProps\/custom\.xml["'][^>]*\/>/g,
+    "",
+  );
+}

From 52c5d0441ca25740454018a2849870a012234ab3 Mon Sep 17 00:00:00 2001
From: kipeum86 <kipeum86@gmail.com>
Date: Sat, 18 Apr 2026 00:43:22 +0900
Subject: [PATCH 6/6] docs(release): update v1.1.1 release metadata

---
 README.ko.md                      | 12 +++---
 README.md                         | 12 +++---
 release-notes/2026-04-18-draft.md | 68 +++++++++++++++++++++++++++++++
 src/app-version.ts                |  2 +-
 4 files changed, 81 insertions(+), 13 deletions(-)
 create mode 100644 release-notes/2026-04-18-draft.md

diff --git a/README.ko.md b/README.ko.md
index 537e29f..76fe949 100644
--- a/README.ko.md
+++ b/README.ko.md
@@ -13,7 +13,7 @@
         <img alt="document-redactor.html 다운로드" src="https://img.shields.io/badge/document--redactor.html-%EC%B5%9C%EC%8B%A0%20%EB%B0%9B%EA%B8%B0-0f766e?style=for-the-badge&logo=html5&logoColor=white" />
       </a>
       <br />
-      <sub>HTML 한 파일 · ~277 KB · 더블클릭으로 실행</sub>
+      <sub>HTML 한 파일 · ~276 KB · 더블클릭으로 실행</sub>
     </td>
     <td align="center" valign="middle">
       <a href="https://github.com/kipeum86/document-redactor/releases/latest/download/document-redactor.html.sha256">
@@ -53,7 +53,7 @@
   <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/kipeum86/document-redactor/ci.yml?branch=main&label=CI&style=flat-square" />
   <img alt="Apache 2.0 license" src="https://img.shields.io/badge/license-Apache%202.0-0f172a?style=flat-square" />
   <img alt="single HTML distribution" src="https://img.shields.io/badge/distribution-single%20HTML-0f172a?style=flat-square" />
-  <img alt="277 KB artifact" src="https://img.shields.io/badge/current%20build-277%20KB-166534?style=flat-square" />
+  <img alt="276 KB artifact" src="https://img.shields.io/badge/current%20build-276%20KB-166534?style=flat-square" />
   <img alt="zero network requests" src="https://img.shields.io/badge/network-0%20requests-166534?style=flat-square" />
   <img alt="rule-based engine" src="https://img.shields.io/badge/detection-rule--based-1d4ed8?style=flat-square" />
   <img alt="AI none" src="https://img.shields.io/badge/AI-none-7f1d1d?style=flat-square" />
@@ -148,7 +148,7 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>현재 확인된 크기</strong><br />
-      277 KB
+      276 KB
     </td>
     <td width="20%" valign="top">
       <strong>무결성 sidecar</strong><br />
@@ -160,14 +160,14 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>자동화 테스트</strong><br />
-      1,739 tests
+      1,774 tests
     </td>
   </tr>
 </table>
 
-2026년 4월 14일 기준으로 확인한 현재 빌드:
+2026년 4월 18일 기준으로 확인한 현재 빌드:
 
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
 - `shasum -a 256 -c document-redactor.html.sha256` 로 로컬 검증 완료
 
 ## 현재 릴리즈가 실제로 하는 일
diff --git a/README.md b/README.md
index 0dadb8b..161abdc 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
         <img alt="Download document-redactor.html" src="https://img.shields.io/badge/document--redactor.html-Download%20(latest)-0f766e?style=for-the-badge&logo=html5&logoColor=white" />
       </a>
       <br />
-      <sub>Single HTML · ~277 KB · open locally</sub>
+      <sub>Single HTML · ~276 KB · open locally</sub>
     </td>
     <td align="center" valign="middle">
       <a href="https://github.com/kipeum86/document-redactor/releases/latest/download/document-redactor.html.sha256">
@@ -53,7 +53,7 @@
   <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/kipeum86/document-redactor/ci.yml?branch=main&label=CI&style=flat-square" />
   <img alt="Apache 2.0 license" src="https://img.shields.io/badge/license-Apache%202.0-0f172a?style=flat-square" />
   <img alt="single HTML distribution" src="https://img.shields.io/badge/distribution-single%20HTML-0f172a?style=flat-square" />
-  <img alt="277 KB artifact" src="https://img.shields.io/badge/current%20build-277%20KB-166534?style=flat-square" />
+  <img alt="276 KB artifact" src="https://img.shields.io/badge/current%20build-276%20KB-166534?style=flat-square" />
   <img alt="zero network requests" src="https://img.shields.io/badge/network-0%20requests-166534?style=flat-square" />
   <img alt="rule-based engine" src="https://img.shields.io/badge/detection-rule--based-1d4ed8?style=flat-square" />
   <img alt="AI none" src="https://img.shields.io/badge/AI-none-7f1d1d?style=flat-square" />
@@ -149,7 +149,7 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>Current checked size</strong><br />
-      277 KB
+      276 KB
     </td>
     <td width="20%" valign="top">
       <strong>Integrity sidecar</strong><br />
@@ -161,14 +161,14 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>Automated coverage</strong><br />
-      1,739 tests
+      1,774 tests
     </td>
   </tr>
 </table>
 
-Current checked release artifact on April 14, 2026:
+Current checked release artifact on April 18, 2026:
 
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
 - Verified locally with `shasum -a 256 -c document-redactor.html.sha256`
 
 ## What The Current Release Does
diff --git a/release-notes/2026-04-18-draft.md b/release-notes/2026-04-18-draft.md
new file mode 100644
index 0000000..633427a
--- /dev/null
+++ b/release-notes/2026-04-18-draft.md
@@ -0,0 +1,68 @@
+# document-redactor v1.1.1
+
+_Drafted 2026-04-18_
+
+## Summary
+
+This update focuses on two things:
+
+- making DOCX intake and XML parsing safer against oversized files and ZIP-bomb-style payloads,
+- closing a few quiet leak paths that could survive redaction even when the visible document looked clean.
+
+It also fixes a detection correctness bug where heuristic candidates could return normalized text instead of the original DOCX bytes, which matters for smart quotes, fullwidth ASCII, and ideographic spaces.
+
+## What's new
+
+### Safer DOCX loading
+
+- Added a `loadDocxZip()` guard for all main UI and finalize entry points.
+- Files larger than **50 MB** are now rejected before JSZip fully loads them.
+- This reduces the chance that a very large or malicious input file freezes or crashes the browser tab.
+
+### Safer ZIP entry reads
+
+- Added a `readZipEntry()` guard for XML entry reads.
+- Individual ZIP entries whose decompressed content exceeds **20 MB** are now rejected once read, before any downstream XML handling runs.
+- This hardens scope walking, metadata scrubbing, verifier surface collection, and preflight repair against oversized decompressed entries.
+
+### External URL stripping in `.rels`
+
+- Redacted output now strips surviving external `http://` and `https://` targets from `.rels` files.
+- Both `Target="..."` and `Target='...'` forms are covered.
+- `mailto:` links and relative package paths are preserved.
+- Verification now explicitly fails if any external `http/https` relationship target survives in the output.
+
+### `docProps/custom.xml` removal
+
+- Metadata scrubbing now removes `docProps/custom.xml` entirely instead of leaving custom properties behind.
+- `[Content_Types].xml` is updated at the same time so the removed custom-properties part does not leave a stale override entry.
+- This closes a metadata channel where author email, project names, tracking IDs, or other arbitrary custom fields could survive redaction.
+
+### Heuristic original-byte recovery
+
+- Heuristic candidates now recover the original source slice from the normalization offset map before emitting `Candidate.text`.
+- This improves real-world redaction reliability when DOCX content uses:
+  - smart quotes,
+  - fullwidth ASCII or digits,
+  - ideographic spaces.
+- Result: fewer cases where detection appears correct but the later literal redaction step fails to find the exact original bytes.
+
+## User-visible impact
+
+- Oversized or suspicious DOCX inputs fail earlier and more clearly.
+- Redacted files are less likely to retain hidden metadata or tracking URLs.
+- Documents using smart quotes / fullwidth characters should redact more reliably when the match came from a heuristic.
+- No workflow changes were introduced in the UI; this is mainly a safety and correctness release.
+
+## Validation
+
+- Automated tests: **1,774 passing**
+- Full suite: `bun run test` passed
+- Local ReDoS gate: `SKIP_REDOS_FUZZ=0 bun run test` passed
+- Production build: `bun run build` passed
+- Built artifact size: **276 KB** (`dist/document-redactor.html`)
+
+## Notes
+
+- This draft summarizes the work completed on **2026-04-18**.
+- Download links and release asset hashes can be filled in once the release is cut.
diff --git a/src/app-version.ts b/src/app-version.ts
index 9ebef24..0c6aad4 100644
--- a/src/app-version.ts
+++ b/src/app-version.ts
@@ -1 +1 @@
-export const APP_VERSION = "v.1.05";
+export const APP_VERSION = "v1.1.1";