kipeum86 · kipeum86 · Apr 17, 2026 · Apr 16, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/README.ko.md b/README.ko.md
@@ -13,7 +13,7 @@
         <img alt="document-redactor.html 다운로드" src="https://img.shields.io/badge/document--redactor.html-%EC%B5%9C%EC%8B%A0%20%EB%B0%9B%EA%B8%B0-0f766e?style=for-the-badge&logo=html5&logoColor=white" />
       </a>
       <br />
-      <sub>HTML 한 파일 · ~277 KB · 더블클릭으로 실행</sub>
+      <sub>HTML 한 파일 · ~276 KB · 더블클릭으로 실행</sub>
     </td>
     <td align="center" valign="middle">
       <a href="https://github.com/kipeum86/document-redactor/releases/latest/download/document-redactor.html.sha256">
@@ -53,7 +53,7 @@
   <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/kipeum86/document-redactor/ci.yml?branch=main&label=CI&style=flat-square" />
   <img alt="Apache 2.0 license" src="https://img.shields.io/badge/license-Apache%202.0-0f172a?style=flat-square" />
   <img alt="single HTML distribution" src="https://img.shields.io/badge/distribution-single%20HTML-0f172a?style=flat-square" />
-  <img alt="277 KB artifact" src="https://img.shields.io/badge/current%20build-277%20KB-166534?style=flat-square" />
+  <img alt="276 KB artifact" src="https://img.shields.io/badge/current%20build-276%20KB-166534?style=flat-square" />
   <img alt="zero network requests" src="https://img.shields.io/badge/network-0%20requests-166534?style=flat-square" />
   <img alt="rule-based engine" src="https://img.shields.io/badge/detection-rule--based-1d4ed8?style=flat-square" />
   <img alt="AI none" src="https://img.shields.io/badge/AI-none-7f1d1d?style=flat-square" />
@@ -148,7 +148,7 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>현재 확인된 크기</strong><br />
-      277 KB
+      276 KB
     </td>
     <td width="20%" valign="top">
       <strong>무결성 sidecar</strong><br />
@@ -160,14 +160,14 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>자동화 테스트</strong><br />
-      1,739 tests
+      1,774 tests
     </td>
   </tr>
 </table>
 
-2026년 4월 14일 기준으로 확인한 현재 빌드:
+2026년 4월 18일 기준으로 확인한 현재 빌드:
 
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
 - `shasum -a 256 -c document-redactor.html.sha256` 로 로컬 검증 완료
 
 ## 현재 릴리즈가 실제로 하는 일

diff --git a/README.md b/README.md
@@ -13,7 +13,7 @@
         <img alt="Download document-redactor.html" src="https://img.shields.io/badge/document--redactor.html-Download%20(latest)-0f766e?style=for-the-badge&logo=html5&logoColor=white" />
       </a>
       <br />
-      <sub>Single HTML · ~277 KB · open locally</sub>
+      <sub>Single HTML · ~276 KB · open locally</sub>
     </td>
     <td align="center" valign="middle">
       <a href="https://github.com/kipeum86/document-redactor/releases/latest/download/document-redactor.html.sha256">
@@ -53,7 +53,7 @@
   <img alt="CI" src="https://img.shields.io/github/actions/workflow/status/kipeum86/document-redactor/ci.yml?branch=main&label=CI&style=flat-square" />
   <img alt="Apache 2.0 license" src="https://img.shields.io/badge/license-Apache%202.0-0f172a?style=flat-square" />
   <img alt="single HTML distribution" src="https://img.shields.io/badge/distribution-single%20HTML-0f172a?style=flat-square" />
-  <img alt="277 KB artifact" src="https://img.shields.io/badge/current%20build-277%20KB-166534?style=flat-square" />
+  <img alt="276 KB artifact" src="https://img.shields.io/badge/current%20build-276%20KB-166534?style=flat-square" />
   <img alt="zero network requests" src="https://img.shields.io/badge/network-0%20requests-166534?style=flat-square" />
   <img alt="rule-based engine" src="https://img.shields.io/badge/detection-rule--based-1d4ed8?style=flat-square" />
   <img alt="AI none" src="https://img.shields.io/badge/AI-none-7f1d1d?style=flat-square" />
@@ -149,7 +149,7 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>Current checked size</strong><br />
-      277 KB
+      276 KB
     </td>
     <td width="20%" valign="top">
       <strong>Integrity sidecar</strong><br />
@@ -161,14 +161,14 @@ flowchart TD
     </td>
     <td width="20%" valign="top">
       <strong>Automated coverage</strong><br />
-      1,739 tests
+      1,774 tests
     </td>
   </tr>
 </table>
 
-Current checked release artifact on April 14, 2026:
+Current checked release artifact on April 18, 2026:
 
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
 - Verified locally with `shasum -a 256 -c document-redactor.html.sha256`
 
 ## What The Current Release Does

diff --git a/release-notes/2026-04-18-draft.md b/release-notes/2026-04-18-draft.md
@@ -0,0 +1,68 @@
+# document-redactor v1.1.1
+
+_Drafted 2026-04-18_
+
+## Summary
+
+This update focuses on two things:
+
+- making DOCX intake and XML parsing safer against oversized files and ZIP-bomb-style payloads,
+- closing a few quiet leak paths through which sensitive data could survive redaction even when the visible document looked clean.
+
+It also fixes a detection-correctness bug in which heuristic candidates could return normalized text instead of the original DOCX bytes. This matters for smart quotes, fullwidth ASCII, and ideographic spaces, because the later literal redaction step has to find the exact original bytes.
+
+## What's new
+
+### Safer DOCX loading
+
+- Added a `loadDocxZip()` guard for all main UI and finalize entry points.
+- Files larger than **50 MB** are now rejected before JSZip fully loads them.
+- This reduces the chance that a very large or malicious input file freezes or crashes the browser tab.
+
+### Safer ZIP entry reads
+
+- Added a `readZipEntry()` guard for XML entry reads.
+- Individual ZIP entries larger than **20 MB** are now rejected before downstream XML handling continues.
+- This hardens scope walking, metadata scrubbing, verifier surface collection, and preflight repair against oversized decompressed entries.
+
+### External URL stripping in `.rels`
+
+- Redacted output now strips surviving external `http://` and `https://` targets from `.rels` files.
+- Both `Target="..."` and `Target='...'` forms are covered.
+- `mailto:` links and relative package paths are preserved.
+- Verification now explicitly fails if any external `http/https` relationship target survives in the output.
+
+### `docProps/custom.xml` removal
+
+- Metadata scrubbing now removes `docProps/custom.xml` entirely instead of leaving custom properties behind.
+- `[Content_Types].xml` is updated at the same time so the removed custom-properties part does not leave a stale override entry.
+- This closes a metadata channel where author email, project names, tracking IDs, or other arbitrary custom fields could survive redaction.
+
+### Heuristic original-byte recovery
+
+- Heuristic candidates now recover the original source slice from the normalization offset map before emitting `Candidate.text`.
+- This improves real-world redaction reliability when DOCX content uses:
+  - smart quotes,
+  - fullwidth ASCII or digits,
+  - ideographic spaces.
+- Result: fewer cases where detection appears correct but the later literal redaction step fails to find the exact original bytes.
+
+## User-visible impact
+
+- Oversized or suspicious DOCX inputs fail earlier and more clearly.
+- Redacted files are less likely to retain hidden metadata or tracking URLs.
+- Documents using smart quotes or fullwidth characters should redact more reliably when the match comes from a heuristic.
+- No workflow changes were introduced in the UI; this is mainly a safety and correctness release.
+
+## Validation
+
+- Automated tests: **1,774 passing**
+- Full suite: `bun run test` passed
+- Local ReDoS gate: `SKIP_REDOS_FUZZ=0 bun run test` passed
+- Production build: `bun run build` passed
+- Built artifact size: **276 KB** (`dist/document-redactor.html`)
+
+## Notes
+
+- This draft summarizes the work completed on **2026-04-18**.
+- Download links and release asset hashes can be filled in once the release is cut.
diff --git a/src/app-version.ts b/src/app-version.ts
@@ -1 +1 @@
-export const APP_VERSION = "v.1.05";
+export const APP_VERSION = "v1.1.1";
diff --git a/src/detection/_framework/recover-bytes.test.ts b/src/detection/_framework/recover-bytes.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from "vitest";
+
+import { normalizeForMatching } from "../normalize.js";
+
+import { recoverOriginalSlice } from "./recover-bytes.js";
+
+describe("recoverOriginalSlice", () => {
+  it("preserves smart quotes from the original text", () => {
+    const original = `prefix \u201CAcme Corp\u201D suffix`;
+    const map = normalizeForMatching(original);
+    const start = map.text.indexOf(`"Acme Corp"`);
+    const end = start + `"Acme Corp"`.length;
+    expect(recoverOriginalSlice(original, map, start, end)).toBe(
+      `\u201CAcme Corp\u201D`,
+    );
+  });
+
+  it("preserves fullwidth digits from the original text", () => {
+    const original = "Call \uFF10\uFF11\uFF12\uFF13 now";
+    const map = normalizeForMatching(original);
+    const start = map.text.indexOf("0123");
+    const end = start + "0123".length;
+    expect(recoverOriginalSlice(original, map, start, end)).toBe(
+      "\uFF10\uFF11\uFF12\uFF13",
+    );
+  });
+
+  it("passes ASCII slices through unchanged", () => {
+    const original = "Acme Corp";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 0, map.text.length)).toBe(
+      "Acme Corp",
+    );
+  });
+
+  it("supports startNorm = 0", () => {
+    const original = "\uFF21BC";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 0, 1)).toBe("\uFF21");
+  });
+
+  it("supports endNorm = text.length", () => {
+    const original = `\u201CAcme\u201D`;
+    const map = normalizeForMatching(original);
+    expect(
+      recoverOriginalSlice(original, map, 0, map.text.length),
+    ).toBe(`\u201CAcme\u201D`);
+  });
+
+  it("returns an empty string for an empty slice", () => {
+    const original = "Acme";
+    const map = normalizeForMatching(original);
+    expect(recoverOriginalSlice(original, map, 2, 2)).toBe("");
+  });
+});
diff --git a/src/detection/_framework/recover-bytes.ts b/src/detection/_framework/recover-bytes.ts
@@ -0,0 +1,10 @@
+import type { PositionMap } from "../normalize.js";
+
+export function recoverOriginalSlice(
+  originalText: string,
+  map: PositionMap,
+  startNorm: number,
+  endNorm: number,
+): string {
+  return originalText.slice(map.origOffsets[startNorm], map.origOffsets[endNorm]);
+}
diff --git a/src/detection/_framework/runner.ts b/src/detection/_framework/runner.ts
@@ -320,7 +320,18 @@ export function runHeuristicPhase(
   if (text.length === 0) return [];
   const map = normalizeForMatching(text);
   if (map.text.length === 0) return [];
-  return runHeuristicPhaseOnMap(map, level, heuristics, context, opts);
+  const heuristicContext: HeuristicContext = {
+    ...context,
+    originalText: text,
+    map,
+  };
+  return runHeuristicPhaseOnMap(
+    map,
+    level,
+    heuristics,
+    heuristicContext,
+    opts,
+  );
 }
 
 /**
@@ -462,6 +473,8 @@ export function runAllPhases(text: string, opts: RunAllOptions): RunAllResult {
     structuralDefinitions,
     priorCandidates: regexCandidates,
     documentLanguage,
+    originalText: text,
+    map,
   };
   const heuristicCandidates = runHeuristicPhaseOnMap(
     map,

diff --git a/src/detection/_framework/types.ts b/src/detection/_framework/types.ts
@@ -1,3 +1,5 @@
+import type { PositionMap } from "../normalize.js";
+
 /**
  * Rule framework types — Phase 0.
  *
@@ -112,11 +114,15 @@ export interface Candidate {
  *  - structuralDefinitions (from structural phase) to skip D9 defined labels
  *  - priorCandidates (from regex phase) to avoid double-counting
  *  - documentLanguage (from runner) to filter role blacklists
+ *  - originalText + map (from runner, both optional) to recover original
+ *    bytes for emitted candidates without re-normalizing
  */
 export interface HeuristicContext {
   readonly structuralDefinitions: readonly StructuralDefinition[];
   readonly priorCandidates: readonly Candidate[];
   readonly documentLanguage: "ko" | "en" | "mixed";
+  readonly originalText?: string;
+  readonly map?: PositionMap;
 }
 
 /**

diff --git a/src/detection/rules/heuristics/capitalization-cluster.test.ts b/src/detection/rules/heuristics/capitalization-cluster.test.ts
@@ -1,5 +1,6 @@
 import { describe, expect, it } from "vitest";
 
+import { normalizeForMatching } from "../../normalize.js";
 import type { HeuristicContext } from "../../_framework/types.js";
 
 import { CAPITALIZATION_CLUSTER } from "./capitalization-cluster.js";
@@ -16,7 +17,12 @@ function makeContext(
 }
 
 function detect(text: string, ctx: HeuristicContext = makeContext()) {
-  return CAPITALIZATION_CLUSTER.detect(text, ctx);
+  const map = normalizeForMatching(text);
+  return CAPITALIZATION_CLUSTER.detect(map.text, {
+    ...ctx,
+    originalText: text,
+    map,
+  });
 }
 
 function expectFast(input: string, budgetMs = 100): void {
@@ -100,6 +106,26 @@ describe("heuristics.capitalization-cluster", () => {
     ]);
   });
 
+  it("recovers original bytes from smart-quoted input", () => {
+    expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed.")).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+        ruleId: "heuristics.capitalization-cluster",
+        confidence: 0.7,
+      },
+    ]);
+  });
+
+  it("preserves fullwidth ASCII letters in candidate.text", () => {
+    expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53 approved.")).toEqual([
+      {
+        text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53",
+        ruleId: "heuristics.capitalization-cluster",
+        confidence: 0.7,
+      },
+    ]);
+  });
+
   it("is ReDoS-safe on a 10KB pathological input", () => {
     expectFast(`${"A".repeat(5000)} ${"B".repeat(5000)}`);
   });

diff --git a/src/detection/rules/heuristics/capitalization-cluster.ts b/src/detection/rules/heuristics/capitalization-cluster.ts
@@ -9,8 +9,7 @@
  *   2. Prior candidate skip — already-found strings excluded
  *   3. Role blacklist — generic legal roles excluded
  *   4. Confidence 0.7 (moderate — caps clusters are common in English prose)
- *   5. Returns normalized text as candidate.text (ASCII letters are
- *      normalized losslessly, so normalized = original for this heuristic)
+ *   5. Recovers original bytes for candidate.text via
+ *      HeuristicContext.originalText + map; falls back to the normalized
+ *      match when either is absent
  *
  * See docs/phases/phase-1-rulebook.md § 14.4.1
  */
@@ -20,6 +19,7 @@ import type {
   Heuristic,
   HeuristicContext,
 } from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
 import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
 
 export const CAPITALIZATION_CLUSTER: Heuristic = {
@@ -46,8 +46,17 @@ export const CAPITALIZATION_CLUSTER: Heuristic = {
       if (ROLE_BLACKLIST_EN.has(candidate.toLowerCase())) continue;
       const words = candidate.split(/\s+/);
       if (words.some((w) => ROLE_BLACKLIST_EN.has(w.toLowerCase()))) continue;
+      const original =
+        ctx.originalText && ctx.map
+          ? recoverOriginalSlice(
+              ctx.originalText,
+              ctx.map,
+              m.index,
+              m.index + candidate.length,
+            )
+          : candidate;
       out.push({
-        text: candidate,
+        text: original,
         ruleId: "heuristics.capitalization-cluster",
         confidence: 0.7,
       });
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		export const APP_VERSION = "v.1.05";
		export const APP_VERSION = "v1.1.1";