diff --git a/README.ko.md b/README.ko.md
index 537e29f..76fe949 100644
--- a/README.ko.md
+++ b/README.ko.md
@@ -13,7 +13,7 @@
- HTML 한 파일 · ~277 KB · 더블클릭으로 실행
+ HTML 한 파일 · ~276 KB · 더블클릭으로 실행
@@ -53,7 +53,7 @@
-
+
@@ -148,7 +148,7 @@ flowchart TD
|
현재 확인된 크기
- 277 KB
+ 276 KB
|
무결성 sidecar
@@ -160,14 +160,14 @@ flowchart TD
|
자동화 테스트
- 1,739 tests
+ 1,774 tests
|
-2026년 4월 14일 기준으로 확인한 현재 빌드:
+2026년 4월 18일 기준으로 확인한 현재 빌드:
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
- `shasum -a 256 -c document-redactor.html.sha256` 로 로컬 검증 완료
## 현재 릴리즈가 실제로 하는 일
diff --git a/README.md b/README.md
index 0dadb8b..161abdc 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@
- Single HTML · ~277 KB · open locally
+ Single HTML · ~276 KB · open locally
@@ -53,7 +53,7 @@
-
+
@@ -149,7 +149,7 @@ flowchart TD
|
Current checked size
- 277 KB
+ 276 KB
|
Integrity sidecar
@@ -161,14 +161,14 @@ flowchart TD
|
Automated coverage
- 1,739 tests
+ 1,774 tests
|
-Current checked release artifact on April 14, 2026:
+Current checked release artifact on April 18, 2026:
-- `document-redactor.html` SHA-256: `323221def9ce105afbd8ea805a5ed7e0751152ec2d531d6dba84111332cd32f9`
+- `document-redactor.html` SHA-256: `4c65364179c80b3993bfee2d99715e5d3c40938d5780ff092f0e021aadc1e77e`
- Verified locally with `shasum -a 256 -c document-redactor.html.sha256`
## What The Current Release Does
diff --git a/release-notes/2026-04-18-draft.md b/release-notes/2026-04-18-draft.md
new file mode 100644
index 0000000..633427a
--- /dev/null
+++ b/release-notes/2026-04-18-draft.md
@@ -0,0 +1,68 @@
+# document-redactor v1.1.1
+
+_Drafted 2026-04-18_
+
+## Summary
+
+This update focuses on two things:
+
+- making DOCX intake and XML parsing safer against oversized files and ZIP-bomb-style payloads,
+- closing a few quiet leak paths that could survive redaction even when the visible document looked clean.
+
+It also fixes a detection correctness bug where heuristic candidates could return normalized text instead of the original DOCX bytes, which matters for smart quotes, fullwidth ASCII, and ideographic spaces.
+
+## What's new
+
+### Safer DOCX loading
+
+- Added a `loadDocxZip()` guard for all main UI and finalize entry points.
+- Files larger than **50 MB** are now rejected up front, before their bytes are handed to JSZip for parsing.
+- This reduces the chance that a very large or malicious input file freezes or crashes the browser tab.
+
+### Safer ZIP entry reads
+
+- Added a `readZipEntry()` guard for XML entry reads.
+- Individual ZIP entries larger than **20 MB** are now rejected before downstream XML handling continues.
+- This hardens scope walking, metadata scrubbing, verifier surface collection, and preflight repair against oversized decompressed entries.
+
+### External URL stripping in `.rels`
+
+- Redacted output now strips surviving external `http://` and `https://` targets from `.rels` files.
+- Both `Target="..."` and `Target='...'` forms are covered.
+- `mailto:` links and relative package paths are preserved.
+- Verification now explicitly fails if any external `http/https` relationship target survives in the output.
+
+### `docProps/custom.xml` removal
+
+- Metadata scrubbing now removes `docProps/custom.xml` entirely instead of leaving custom properties behind.
+- `[Content_Types].xml` is updated at the same time so the removed custom-properties part does not leave a stale override entry.
+- This closes a metadata channel where author email, project names, tracking IDs, or other arbitrary custom fields could survive redaction.
+
+### Heuristic original-byte recovery
+
+- Heuristic candidates now recover the original source slice from the normalization offset map before emitting `Candidate.text`.
+- This improves real-world redaction reliability when DOCX content uses:
+ - smart quotes,
+ - fullwidth ASCII or digits,
+ - ideographic spaces.
+- Result: fewer cases where detection appears correct but the later literal redaction step fails to find the exact original bytes.
+
+## User-visible impact
+
+- Oversized or suspicious DOCX inputs fail earlier and more clearly.
+- Redacted files are less likely to retain hidden metadata or tracking URLs.
+- Documents using smart quotes / fullwidth characters should redact more reliably when the match came from a heuristic.
+- No workflow changes were introduced in the UI; this is mainly a safety and correctness release.
+
+## Validation
+
+- Automated tests: **1,774 passing**
+- Full suite: `bun run test` passed
+- Local ReDoS gate: `SKIP_REDOS_FUZZ=0 bun run test` passed
+- Production build: `bun run build` passed
+- Built artifact size: **276 KB** (`dist/document-redactor.html`)
+
+## Notes
+
+- This draft summarizes the work completed on **2026-04-18**.
+- Download links and release asset hashes can be filled in once the release is cut.
diff --git a/src/app-version.ts b/src/app-version.ts
index 9ebef24..0c6aad4 100644
--- a/src/app-version.ts
+++ b/src/app-version.ts
@@ -1 +1 @@
-export const APP_VERSION = "v.1.05";
+export const APP_VERSION = "v1.1.1";
diff --git a/src/detection/_framework/recover-bytes.test.ts b/src/detection/_framework/recover-bytes.test.ts
new file mode 100644
index 0000000..c61640c
--- /dev/null
+++ b/src/detection/_framework/recover-bytes.test.ts
@@ -0,0 +1,55 @@
+import { describe, expect, it } from "vitest";
+
+import { normalizeForMatching } from "../normalize.js";
+
+import { recoverOriginalSlice } from "./recover-bytes.js";
+
+// Exercises recoverOriginalSlice with position maps built by
+// normalizeForMatching: smart quotes, fullwidth digits/letters, plain ASCII,
+// and the boundary cases startNorm = 0, endNorm = text.length, empty span.
+describe("recoverOriginalSlice", () => {
+ it("preserves smart quotes from the original text", () => {
+ const original = `prefix \u201CAcme Corp\u201D suffix`;
+ const map = normalizeForMatching(original);
+ const start = map.text.indexOf(`"Acme Corp"`);
+ const end = start + `"Acme Corp"`.length;
+ expect(recoverOriginalSlice(original, map, start, end)).toBe(
+ `\u201CAcme Corp\u201D`,
+ );
+ });
+
+ it("preserves fullwidth digits from the original text", () => {
+ const original = "Call \uFF10\uFF11\uFF12\uFF13 now";
+ const map = normalizeForMatching(original);
+ const start = map.text.indexOf("0123");
+ const end = start + "0123".length;
+ expect(recoverOriginalSlice(original, map, start, end)).toBe(
+ "\uFF10\uFF11\uFF12\uFF13",
+ );
+ });
+
+ it("passes ASCII slices through unchanged", () => {
+ const original = "Acme Corp";
+ const map = normalizeForMatching(original);
+ expect(recoverOriginalSlice(original, map, 0, map.text.length)).toBe(
+ "Acme Corp",
+ );
+ });
+
+ it("supports startNorm = 0", () => {
+ const original = "\uFF21BC";
+ const map = normalizeForMatching(original);
+ expect(recoverOriginalSlice(original, map, 0, 1)).toBe("\uFF21");
+ });
+
+ // NOTE(review): the expected value here is the whole input, so this case
+ // would also pass if the end offset lookup came back undefined — consider a
+ // variant where the slice ends before the end of the original text.
+ it("supports endNorm = text.length", () => {
+ const original = `\u201CAcme\u201D`;
+ const map = normalizeForMatching(original);
+ expect(
+ recoverOriginalSlice(original, map, 0, map.text.length),
+ ).toBe(`\u201CAcme\u201D`);
+ });
+
+ it("returns an empty string for an empty slice", () => {
+ const original = "Acme";
+ const map = normalizeForMatching(original);
+ expect(recoverOriginalSlice(original, map, 2, 2)).toBe("");
+ });
+});
diff --git a/src/detection/_framework/recover-bytes.ts b/src/detection/_framework/recover-bytes.ts
new file mode 100644
index 0000000..eae504e
--- /dev/null
+++ b/src/detection/_framework/recover-bytes.ts
@@ -0,0 +1,10 @@
+import type { PositionMap } from "../normalize.js";
+
+/**
+ * Map a span of the normalized text back onto the original text.
+ *
+ * Looks up `startNorm` and `endNorm` in `map.origOffsets` and returns the
+ * slice of `originalText` between the two resulting offsets.
+ *
+ * @param originalText - The text to slice.
+ * @param map - Position map whose `origOffsets` is indexed by normalized offset.
+ * @param startNorm - Index into `map.origOffsets` for the start of the span.
+ * @param endNorm - Index into `map.origOffsets` for the end of the span.
+ * @returns The substring of `originalText` between the looked-up offsets.
+ */
+export function recoverOriginalSlice(
+ originalText: string,
+ map: PositionMap,
+ startNorm: number,
+ endNorm: number,
+): string {
+ // NOTE(review): if either lookup has no entry (presumably `undefined` for an
+ // out-of-range index — confirm against PositionMap), `slice` treats it as
+ // "from 0" / "to the end", silently widening the result instead of failing.
+ return originalText.slice(map.origOffsets[startNorm], map.origOffsets[endNorm]);
+}
diff --git a/src/detection/_framework/runner.ts b/src/detection/_framework/runner.ts
index 9337ce1..1b302b3 100644
--- a/src/detection/_framework/runner.ts
+++ b/src/detection/_framework/runner.ts
@@ -320,7 +320,18 @@ export function runHeuristicPhase(
if (text.length === 0) return [];
const map = normalizeForMatching(text);
if (map.text.length === 0) return [];
- return runHeuristicPhaseOnMap(map, level, heuristics, context, opts);
+ const heuristicContext: HeuristicContext = {
+ ...context,
+ originalText: text,
+ map,
+ };
+ return runHeuristicPhaseOnMap(
+ map,
+ level,
+ heuristics,
+ heuristicContext,
+ opts,
+ );
}
/**
@@ -462,6 +473,8 @@ export function runAllPhases(text: string, opts: RunAllOptions): RunAllResult {
structuralDefinitions,
priorCandidates: regexCandidates,
documentLanguage,
+ originalText: text,
+ map,
};
const heuristicCandidates = runHeuristicPhaseOnMap(
map,
diff --git a/src/detection/_framework/types.ts b/src/detection/_framework/types.ts
index 06775b7..78ae59f 100644
--- a/src/detection/_framework/types.ts
+++ b/src/detection/_framework/types.ts
@@ -1,3 +1,5 @@
+import type { PositionMap } from "../normalize.js";
+
/**
* Rule framework types — Phase 0.
*
@@ -112,11 +114,15 @@ export interface Candidate {
* - structuralDefinitions (from structural phase) to skip D9 defined labels
* - priorCandidates (from regex phase) to avoid double-counting
* - documentLanguage (from runner) to filter role blacklists
+ * - originalText + map (from runner) to recover original bytes for emitted
+ * candidates without re-normalizing
*/
export interface HeuristicContext {
readonly structuralDefinitions: readonly StructuralDefinition[];
readonly priorCandidates: readonly Candidate[];
readonly documentLanguage: "ko" | "en" | "mixed";
+ readonly originalText?: string;
+ readonly map?: PositionMap;
}
/**
diff --git a/src/detection/rules/heuristics/capitalization-cluster.test.ts b/src/detection/rules/heuristics/capitalization-cluster.test.ts
index 9ce0a0d..d2960c8 100644
--- a/src/detection/rules/heuristics/capitalization-cluster.test.ts
+++ b/src/detection/rules/heuristics/capitalization-cluster.test.ts
@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
+import { normalizeForMatching } from "../../normalize.js";
import type { HeuristicContext } from "../../_framework/types.js";
import { CAPITALIZATION_CLUSTER } from "./capitalization-cluster.js";
@@ -16,7 +17,12 @@ function makeContext(
}
function detect(text: string, ctx: HeuristicContext = makeContext()) {
- return CAPITALIZATION_CLUSTER.detect(text, ctx);
+ const map = normalizeForMatching(text);
+ return CAPITALIZATION_CLUSTER.detect(map.text, {
+ ...ctx,
+ originalText: text,
+ map,
+ });
}
function expectFast(input: string, budgetMs = 100): void {
@@ -100,6 +106,26 @@ describe("heuristics.capitalization-cluster", () => {
]);
});
+ it("recovers original bytes from smart-quoted input", () => {
+ expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed.")).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+ ruleId: "heuristics.capitalization-cluster",
+ confidence: 0.7,
+ },
+ ]);
+ });
+
+ it("preserves fullwidth ASCII letters in candidate.text", () => {
+ expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53 approved.")).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF28\uFF4F\uFF4C\uFF44\uFF49\uFF4E\uFF47\uFF53",
+ ruleId: "heuristics.capitalization-cluster",
+ confidence: 0.7,
+ },
+ ]);
+ });
+
it("is ReDoS-safe on a 10KB pathological input", () => {
expectFast(`${"A".repeat(5000)} ${"B".repeat(5000)}`);
});
diff --git a/src/detection/rules/heuristics/capitalization-cluster.ts b/src/detection/rules/heuristics/capitalization-cluster.ts
index b50a654..6c14987 100644
--- a/src/detection/rules/heuristics/capitalization-cluster.ts
+++ b/src/detection/rules/heuristics/capitalization-cluster.ts
@@ -9,8 +9,7 @@
* 2. Prior candidate skip — already-found strings excluded
* 3. Role blacklist — generic legal roles excluded
* 4. Confidence 0.7 (moderate — caps clusters are common in English prose)
- * 5. Returns normalized text as candidate.text (ASCII letters are
- * normalized losslessly, so normalized = original for this heuristic)
+ * 5. Recovers original bytes for candidate.text via HeuristicContext.map
*
* See docs/phases/phase-1-rulebook.md § 14.4.1
*/
@@ -20,6 +19,7 @@ import type {
Heuristic,
HeuristicContext,
} from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
export const CAPITALIZATION_CLUSTER: Heuristic = {
@@ -46,8 +46,17 @@ export const CAPITALIZATION_CLUSTER: Heuristic = {
if (ROLE_BLACKLIST_EN.has(candidate.toLowerCase())) continue;
const words = candidate.split(/\s+/);
if (words.some((w) => ROLE_BLACKLIST_EN.has(w.toLowerCase()))) continue;
+ const original =
+ ctx.originalText && ctx.map
+ ? recoverOriginalSlice(
+ ctx.originalText,
+ ctx.map,
+ m.index,
+ m.index + candidate.length,
+ )
+ : candidate;
out.push({
- text: candidate,
+ text: original,
ruleId: "heuristics.capitalization-cluster",
confidence: 0.7,
});
diff --git a/src/detection/rules/heuristics/email-domain-inference.test.ts b/src/detection/rules/heuristics/email-domain-inference.test.ts
index ca6fe5d..8430040 100644
--- a/src/detection/rules/heuristics/email-domain-inference.test.ts
+++ b/src/detection/rules/heuristics/email-domain-inference.test.ts
@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
+import { normalizeForMatching } from "../../normalize.js";
import type { Candidate, HeuristicContext } from "../../_framework/types.js";
import { EMAIL_DOMAIN_INFERENCE } from "./email-domain-inference.js";
@@ -16,13 +17,18 @@ function makeContext(
};
}
-function detect(ctx: HeuristicContext) {
- return EMAIL_DOMAIN_INFERENCE.detect("", ctx);
+function detect(text: string, ctx: HeuristicContext) {
+ const map = normalizeForMatching(text);
+ return EMAIL_DOMAIN_INFERENCE.detect(map.text, {
+ ...ctx,
+ originalText: text,
+ map,
+ });
}
function expectFast(ctx: HeuristicContext, budgetMs = 100): void {
const start = performance.now();
- void detect(ctx);
+ void detect("", ctx);
const elapsed = performance.now() - start;
expect(elapsed).toBeLessThan(budgetMs);
}
@@ -51,7 +57,7 @@ describe("heuristics.email-domain-inference", () => {
[{ text: "Samsung", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }],
],
])("%s", (_name, ctx, expected) => {
- expect(detect(ctx)).toEqual(expected);
+ expect(detect("", ctx)).toEqual(expected);
});
it.each([
@@ -78,7 +84,7 @@ describe("heuristics.email-domain-inference", () => {
[{ text: "Northwind", ruleId: "heuristics.email-domain-inference", confidence: 0.8 }],
],
])("%s", (_name, ctx, expected) => {
- expect(detect(ctx)).toEqual(expected);
+ expect(detect("", ctx)).toEqual(expected);
});
it.each([
@@ -101,7 +107,7 @@ describe("heuristics.email-domain-inference", () => {
]),
],
])("%s", (_name, ctx) => {
- expect(detect(ctx)).toEqual([]);
+ expect(detect("", ctx)).toEqual([]);
});
it("skips inferred names that match structural-definition labels (D9)", () => {
@@ -117,7 +123,7 @@ describe("heuristics.email-domain-inference", () => {
],
},
);
- expect(detect(ctx)).toEqual([]);
+ expect(detect("", ctx)).toEqual([]);
});
it("skips inferred names already present in priorCandidates", () => {
@@ -125,25 +131,26 @@ describe("heuristics.email-domain-inference", () => {
{ text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
{ text: "Acme Corp", ruleId: "entities.en-corp-suffix", confidence: 1.0 },
]);
- expect(detect(ctx)).toEqual([]);
+ expect(detect("", ctx)).toEqual([]);
});
it("skips blacklisted inferred names like Party", () => {
const ctx = makeContext([
{ text: "legal@party.com", ruleId: "identifiers.email", confidence: 1.0 },
]);
- expect(detect(ctx)).toEqual([]);
+ expect(detect("", ctx)).toEqual([]);
});
it("skips blacklisted inferred names like Company", () => {
const ctx = makeContext([
{ text: "legal@company.com", ruleId: "identifiers.email", confidence: 1.0 },
]);
- expect(detect(ctx)).toEqual([]);
+ expect(detect("", ctx)).toEqual([]);
});
it("emits 0.8 for corporate prefixes and 0.6 for personal prefixes", () => {
const result = detect(
+ "",
makeContext([
{ text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
{ text: "john@beta.com", ruleId: "identifiers.email", confidence: 1.0 },
@@ -163,6 +170,32 @@ describe("heuristics.email-domain-inference", () => {
]);
});
+ it("recovers original bytes from smart-quoted document occurrences", () => {
+ const ctx = makeContext([
+ { text: "legal@acme-corp.com", ruleId: "identifiers.email", confidence: 1.0 },
+ ]);
+ expect(detect("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D is the counterparty.", ctx)).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+ ruleId: "heuristics.email-domain-inference",
+ confidence: 0.8,
+ },
+ ]);
+ });
+
+ it("preserves fullwidth digits when recovering inferred names from text", () => {
+ const ctx = makeContext([
+ { text: "legal@acme-123.com", ruleId: "identifiers.email", confidence: 1.0 },
+ ]);
+ expect(detect("\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13 responded.", ctx)).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13",
+ ruleId: "heuristics.email-domain-inference",
+ confidence: 0.8,
+ },
+ ]);
+ });
+
it("is ReDoS-safe on a 10KB pathological input", () => {
expectFast(
makeContext([
diff --git a/src/detection/rules/heuristics/email-domain-inference.ts b/src/detection/rules/heuristics/email-domain-inference.ts
index 8f5137a..f139e36 100644
--- a/src/detection/rules/heuristics/email-domain-inference.ts
+++ b/src/detection/rules/heuristics/email-domain-inference.ts
@@ -19,6 +19,7 @@ import type {
Heuristic,
HeuristicContext,
} from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
/** Common TLDs to strip. */
@@ -39,6 +40,22 @@ function titleCase(s: string): string {
return s[0]!.toUpperCase() + s.slice(1).toLowerCase();
}
+/**
+ * Return the original-text form of `inferred` when it occurs in the document.
+ *
+ * Searches `normalizedText` for the first occurrence of `inferred` and maps
+ * that span back to `ctx.originalText` via `recoverOriginalSlice`. Returns
+ * `inferred` unchanged when `ctx.originalText` or `ctx.map` is missing (or
+ * `originalText` is the empty string) or when no occurrence is found.
+ *
+ * @param normalizedText - Text to search for `inferred`.
+ * @param inferred - Name inferred from an email domain.
+ * @param ctx - Heuristic context; only `originalText` and `map` are read.
+ * @returns The recovered original slice, or `inferred` as a fallback.
+ */
+function recoverInferredText(
+ normalizedText: string,
+ inferred: string,
+ ctx: HeuristicContext,
+): string {
+ if (!ctx.originalText || !ctx.map) return inferred;
+ // NOTE(review): plain substring search — only the first hit is used and it
+ // may land inside a longer word; confirm that is acceptable for redaction.
+ const startNorm = normalizedText.indexOf(inferred);
+ if (startNorm < 0) return inferred;
+ return recoverOriginalSlice(
+ ctx.originalText,
+ ctx.map,
+ startNorm,
+ startNorm + inferred.length,
+ );
+}
+
export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
id: "heuristics.email-domain-inference",
category: "heuristics",
@@ -47,7 +64,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
levels: ["paranoid"],
description:
"Infer company name from email domain (legal@acme-corp.com → 'Acme Corp')",
- detect(_text: string, ctx: HeuristicContext): readonly Candidate[] {
+ detect(text: string, ctx: HeuristicContext): readonly Candidate[] {
const definedLabels = new Set(
ctx.structuralDefinitions.map((d) => d.label),
);
@@ -93,7 +110,7 @@ export const EMAIL_DOMAIN_INFERENCE: Heuristic = {
const confidence = CORPORATE_PREFIXES.has(localPart) ? 0.8 : 0.6;
out.push({
- text: inferred,
+ text: recoverInferredText(text, inferred, ctx),
ruleId: "heuristics.email-domain-inference",
confidence,
});
diff --git a/src/detection/rules/heuristics/quoted-term.test.ts b/src/detection/rules/heuristics/quoted-term.test.ts
index 561a091..495a411 100644
--- a/src/detection/rules/heuristics/quoted-term.test.ts
+++ b/src/detection/rules/heuristics/quoted-term.test.ts
@@ -17,7 +17,12 @@ function makeContext(
}
function detectRaw(text: string, ctx: HeuristicContext = makeContext()) {
- return QUOTED_TERM.detect(normalizeForMatching(text).text, ctx);
+ const map = normalizeForMatching(text);
+ return QUOTED_TERM.detect(map.text, {
+ ...ctx,
+ originalText: text,
+ map,
+ });
}
function expectFast(input: string, budgetMs = 100): void {
@@ -88,6 +93,26 @@ describe("heuristics.quoted-term", () => {
]);
});
+ it("recovers original inner bytes from smart-quoted input", () => {
+ expect(detectRaw("\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13\u201D shall survive.")).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF11\uFF12\uFF13",
+ ruleId: "heuristics.quoted-term",
+ confidence: 0.6,
+ },
+ ]);
+ });
+
+ it("preserves fullwidth digits in candidate.text", () => {
+ expect(detectRaw('"\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13" shall survive.')).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\uFF11\uFF12\uFF13",
+ ruleId: "heuristics.quoted-term",
+ confidence: 0.6,
+ },
+ ]);
+ });
+
it("is ReDoS-safe on a 10KB pathological input", () => {
expectFast(`"${"A".repeat(10000)}"`);
});
diff --git a/src/detection/rules/heuristics/quoted-term.ts b/src/detection/rules/heuristics/quoted-term.ts
index 118d650..f817e5b 100644
--- a/src/detection/rules/heuristics/quoted-term.ts
+++ b/src/detection/rules/heuristics/quoted-term.ts
@@ -18,6 +18,7 @@ import type {
Heuristic,
HeuristicContext,
} from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js";
@@ -43,8 +44,19 @@ export const QUOTED_TERM: Heuristic = {
if (priorTexts.has(inner)) continue;
if (ROLE_BLACKLIST_EN.has(inner.toLowerCase())) continue;
if (ROLE_BLACKLIST_KO.has(inner)) continue;
+ const innerStartNorm = m.index + 1;
+ const innerEndNorm = innerStartNorm + inner.length;
+ const original =
+ ctx.originalText && ctx.map
+ ? recoverOriginalSlice(
+ ctx.originalText,
+ ctx.map,
+ innerStartNorm,
+ innerEndNorm,
+ )
+ : inner;
out.push({
- text: inner,
+ text: original,
ruleId: "heuristics.quoted-term",
confidence: 0.6,
});
diff --git a/src/detection/rules/heuristics/repeatability.test.ts b/src/detection/rules/heuristics/repeatability.test.ts
index b36ea1b..c2a611c 100644
--- a/src/detection/rules/heuristics/repeatability.test.ts
+++ b/src/detection/rules/heuristics/repeatability.test.ts
@@ -1,5 +1,6 @@
import { describe, expect, it } from "vitest";
+import { normalizeForMatching } from "../../normalize.js";
import type { HeuristicContext } from "../../_framework/types.js";
import { REPEATABILITY } from "./repeatability.js";
@@ -16,7 +17,12 @@ function makeContext(
}
function detect(text: string, ctx: HeuristicContext = makeContext()) {
- return REPEATABILITY.detect(text, ctx);
+ const map = normalizeForMatching(text);
+ return REPEATABILITY.detect(map.text, {
+ ...ctx,
+ originalText: text,
+ map,
+ });
}
function expectFast(input: string, budgetMs = 100): void {
@@ -123,6 +129,34 @@ describe("heuristics.repeatability", () => {
]);
});
+ it("recovers original bytes from repeated smart-quoted input", () => {
+ expect(
+ detect(
+ "\u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D signed. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D approved. \u201C\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50\u201D closed.",
+ ),
+ ).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45\u3000\uFF23\uFF4F\uFF52\uFF50",
+ ruleId: "heuristics.repeatability",
+ confidence: 0.5,
+ },
+ ]);
+ });
+
+ it("preserves fullwidth ASCII letters in repeated candidate.text", () => {
+ expect(
+ detect(
+ "\uFF21\uFF43\uFF4D\uFF45 signed. \uFF21\uFF43\uFF4D\uFF45 approved. \uFF21\uFF43\uFF4D\uFF45 closed.",
+ ),
+ ).toEqual([
+ {
+ text: "\uFF21\uFF43\uFF4D\uFF45",
+ ruleId: "heuristics.repeatability",
+ confidence: 0.5,
+ },
+ ]);
+ });
+
it("is ReDoS-safe on a 10KB pathological input", () => {
expectFast(`${"Acme ".repeat(2000)}${"삼성전자 ".repeat(1000)}`);
});
diff --git a/src/detection/rules/heuristics/repeatability.ts b/src/detection/rules/heuristics/repeatability.ts
index db01c45..6ff29ed 100644
--- a/src/detection/rules/heuristics/repeatability.ts
+++ b/src/detection/rules/heuristics/repeatability.ts
@@ -19,6 +19,7 @@ import type {
Heuristic,
HeuristicContext,
} from "../../_framework/types.js";
+import { recoverOriginalSlice } from "../../_framework/recover-bytes.js";
import { ROLE_BLACKLIST_EN } from "../role-blacklist-en.js";
import { ROLE_BLACKLIST_KO } from "../role-blacklist-ko.js";
@@ -40,6 +41,7 @@ export const REPEATABILITY: Heuristic = {
const priorTexts = new Set(ctx.priorCandidates.map((c) => c.text));
const counts = new Map();
+ const firstSpans = new Map();
const enPattern =
/(? {
+ const zip = new JSZip();
+ zip.file(
+ "word/document.xml",
+ `Hello`,
+ );
+ return zip.generateAsync({ type: "uint8array" });
+}
+
+afterEach(() => {
+ vi.restoreAllMocks();
+});
+
+// Size-guard behavior of loadDocxZip: empty input and input larger than
+// MAX_INPUT_BYTES reject with FileTooLargeError (the latter without calling
+// JSZip.loadAsync), input of exactly MAX_INPUT_BYTES is delegated to
+// JSZip.loadAsync, and corrupt ZIP data rejects with some other error.
+describe("loadDocxZip", () => {
+ it("rejects empty bytes", async () => {
+ await expect(loadDocxZip(new Uint8Array(0))).rejects.toBeInstanceOf(
+ FileTooLargeError,
+ );
+ });
+
+ it("accepts a valid docx-sized zip", async () => {
+ const bytes = await makeZipBytes();
+ const zip = await loadDocxZip(bytes);
+ expect(zip.file("word/document.xml")).not.toBeNull();
+ });
+
+ it("accepts the exact MAX_INPUT_BYTES boundary and delegates to JSZip", async () => {
+ const zip = new JSZip();
+ // JSZip.loadAsync is stubbed so the zero-filled buffer is never parsed.
+ const spy = vi.spyOn(JSZip, "loadAsync").mockResolvedValue(zip);
+ const bytes = new Uint8Array(MAX_INPUT_BYTES);
+
+ await expect(loadDocxZip(bytes)).resolves.toBe(zip);
+ expect(spy).toHaveBeenCalledTimes(1);
+ });
+
+ it("rejects bytes larger than MAX_INPUT_BYTES", async () => {
+ const spy = vi.spyOn(JSZip, "loadAsync");
+ const bytes = new Uint8Array(MAX_INPUT_BYTES + 1);
+
+ await expect(loadDocxZip(bytes)).rejects.toBeInstanceOf(FileTooLargeError);
+ expect(spy).not.toHaveBeenCalled();
+ });
+
+ it("propagates JSZip errors for corrupt ZIP data", async () => {
+ await expect(loadDocxZip(new Uint8Array([1, 2, 3, 4]))).rejects.not.toBeInstanceOf(
+ FileTooLargeError,
+ );
+ });
+});
+
+// Behavior of readZipEntry: returns entry content as a string, throws for a
+// missing path, throws EntryTooLargeError when the content length exceeds
+// MAX_ENTRY_BYTES, and accepts content of exactly MAX_ENTRY_BYTES. The size
+// cases use a hand-rolled stand-in for JSZip whose `async` resolves to a
+// prepared string.
+describe("readZipEntry", () => {
+ it("returns a string for a normal entry", async () => {
+ const zip = new JSZip();
+ zip.file("word/document.xml", "");
+
+ await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe(
+ "",
+ );
+ });
+
+ it("throws when the entry does not exist", async () => {
+ const zip = new JSZip();
+ await expect(readZipEntry(zip, "missing.xml")).rejects.toThrow(
+ "ZIP entry not found: missing.xml",
+ );
+ });
+
+ it("throws EntryTooLargeError for oversized decompressed content", async () => {
+ const zip = {
+ file(path: string) {
+ if (path !== "word/document.xml") return null;
+ return {
+ async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES + 1)),
+ };
+ },
+ } as unknown as JSZip;
+
+ await expect(readZipEntry(zip, "word/document.xml")).rejects.toBeInstanceOf(
+ EntryTooLargeError,
+ );
+ });
+
+ it("accepts content at the exact MAX_ENTRY_BYTES boundary", async () => {
+ const zip = {
+ file(path: string) {
+ if (path !== "word/document.xml") return null;
+ return {
+ async: vi.fn().mockResolvedValue("x".repeat(MAX_ENTRY_BYTES)),
+ };
+ },
+ } as unknown as JSZip;
+
+ await expect(readZipEntry(zip, "word/document.xml")).resolves.toBe(
+ "x".repeat(MAX_ENTRY_BYTES),
+ );
+ });
+});
diff --git a/src/docx/load.ts b/src/docx/load.ts
new file mode 100644
index 0000000..b6afade
--- /dev/null
+++ b/src/docx/load.ts
@@ -0,0 +1,42 @@
+import JSZip from "jszip";
+
+import { MAX_ENTRY_BYTES, MAX_INPUT_BYTES } from "./limits.js";
+
+/**
+ * Error describing an input whose byte size is outside the accepted limit.
+ * The message embeds the given `size` and `limit`; `name` is set to
+ * "FileTooLargeError".
+ *
+ * NOTE(review): `loadDocxZip` also throws this for zero-length input, where
+ * the "exceeds limit" wording is misleading — consider a dedicated message.
+ */
+export class FileTooLargeError extends Error {
+ constructor(size: number, limit: number) {
+ super(`File size ${size} bytes exceeds limit of ${limit} bytes`);
+ this.name = "FileTooLargeError";
+ }
+}
+
+/**
+ * Error describing a ZIP entry whose decompressed content is over the limit.
+ * The message embeds the entry `path`, the measured `size`, and the `limit`;
+ * `name` is set to "EntryTooLargeError".
+ */
+export class EntryTooLargeError extends Error {
+ constructor(path: string, size: number, limit: number) {
+ super(`ZIP entry "${path}" decompressed to ${size} bytes, exceeds limit of ${limit} bytes`);
+ this.name = "EntryTooLargeError";
+ }
+}
+
+/**
+ * Size-check raw DOCX bytes, then load them with JSZip.
+ *
+ * @param bytes - Raw file contents.
+ * @returns The result of `JSZip.loadAsync` on a copy of `bytes`.
+ * @throws FileTooLargeError when `bytes` is empty or longer than
+ * `MAX_INPUT_BYTES`; in both cases JSZip is not invoked.
+ */
+export async function loadDocxZip(bytes: Uint8Array): Promise {
+ if (bytes.length === 0) {
+ throw new FileTooLargeError(0, MAX_INPUT_BYTES);
+ }
+ if (bytes.length > MAX_INPUT_BYTES) {
+ throw new FileTooLargeError(bytes.length, MAX_INPUT_BYTES);
+ }
+ // `slice()` hands JSZip its own copy rather than the caller's buffer.
+ return JSZip.loadAsync(bytes.slice());
+}
+
+/**
+ * Read one ZIP entry as a string, enforcing the per-entry size limit.
+ *
+ * @param zip - Archive to read from.
+ * @param path - Entry path inside the archive.
+ * @returns The entry content as a string.
+ * @throws Error with message `ZIP entry not found: <path>` when the entry
+ * does not exist.
+ * @throws EntryTooLargeError when the content length exceeds
+ * `MAX_ENTRY_BYTES`.
+ */
+export async function readZipEntry(
+ zip: JSZip,
+ path: string,
+): Promise {
+ const file = zip.file(path);
+ if (file === null) {
+ throw new Error(`ZIP entry not found: ${path}`);
+ }
+ const content = await file.async("string");
+ // NOTE(review): the check runs only after the whole entry has been read
+ // into a string, and `content.length` counts UTF-16 code units, not bytes,
+ // so the figure compared (and reported as "bytes") can undercount
+ // multi-byte text — confirm this is the intended bound.
+ if (content.length > MAX_ENTRY_BYTES) {
+ throw new EntryTooLargeError(path, content.length, MAX_ENTRY_BYTES);
+ }
+ return content;
+}
diff --git a/src/docx/scopes.ts b/src/docx/scopes.ts
index 8f1f6c3..b94e014 100644
--- a/src/docx/scopes.ts
+++ b/src/docx/scopes.ts
@@ -12,6 +12,7 @@
import type JSZip from "jszip";
+import { readZipEntry } from "./load.js";
import { SCOPE_PATTERNS, type Scope, type ScopeKind } from "./types.js";
/**
@@ -73,11 +74,7 @@ export async function readScopeXml(
zip: JSZip,
scope: Scope,
): Promise {
- const file = zip.file(scope.path);
- if (file === null) {
- throw new Error(`scope ${scope.path} not found in zip`);
- }
- return file.async("string");
+ return readZipEntry(zip, scope.path);
}
/**
diff --git a/src/docx/scrub-metadata.test.ts b/src/docx/scrub-metadata.test.ts
index fd090d1..c281b36 100644
--- a/src/docx/scrub-metadata.test.ts
+++ b/src/docx/scrub-metadata.test.ts
@@ -24,6 +24,18 @@ const APP_XML = `
Microsoft Word
`;
+const CONTENT_TYPES_XML = `
+
+
+
+
+`;
+
+const CUSTOM_XML = `
+
+ kim@example.com
+`;
+
describe("scrubMetadataXml", () => {
it("zeroes out dc:creator", () => {
const out = scrubMetadataXml(CORE_XML, ["creator"]);
@@ -117,4 +129,41 @@ describe("scrubDocxMetadata", () => {
const newCore = await zip.file("docProps/core.xml")!.async("string");
expect(newCore).not.toContain("Kim Chul-Soo");
});
+
+ it("removes docProps/custom.xml entirely when present", async () => {
+ const zip = new JSZip();
+ zip.file("docProps/custom.xml", CUSTOM_XML);
+ zip.file("[Content_Types].xml", CONTENT_TYPES_XML);
+
+ await scrubDocxMetadata(zip);
+
+ expect(zip.file("docProps/custom.xml")).toBeNull();
+ });
+
+ it("removes the custom.xml override from [Content_Types].xml", async () => {
+ const zip = new JSZip();
+ zip.file("docProps/custom.xml", CUSTOM_XML);
+ zip.file("[Content_Types].xml", CONTENT_TYPES_XML);
+
+ await scrubDocxMetadata(zip);
+
+ const contentTypes = await zip.file("[Content_Types].xml")!.async("string");
+ expect(contentTypes).not.toContain(`/docProps/custom.xml`);
+ expect(contentTypes).toContain(`/word/document.xml`);
+ });
+
+ it("leaves [Content_Types].xml alone when no custom override exists", async () => {
+ const zip = new JSZip();
+ const contentTypes = CONTENT_TYPES_XML.replace(
+ /\s*]*PartName="\/docProps\/custom\.xml"[^>]*\/>/,
+ "",
+ );
+ zip.file("[Content_Types].xml", contentTypes);
+
+ await scrubDocxMetadata(zip);
+
+ expect(await zip.file("[Content_Types].xml")!.async("string")).toBe(
+ contentTypes,
+ );
+ });
});
diff --git a/src/docx/scrub-metadata.ts b/src/docx/scrub-metadata.ts
index 4768bcd..2d563c3 100644
--- a/src/docx/scrub-metadata.ts
+++ b/src/docx/scrub-metadata.ts
@@ -14,6 +14,7 @@
import type JSZip from "jszip";
+import { readZipEntry } from "./load.js";
import { METADATA_SENSITIVE_FIELDS } from "./types.js";
/**
@@ -42,19 +43,36 @@ export function scrubMetadataXml(xml: string, fields: ReadonlyArray): st
/**
* Apply the standard scrub policy to a DOCX zip in place. Reads
* `docProps/core.xml` and `docProps/app.xml`, scrubs each, and writes them
- * back. Idempotent.
+ * back. Removes `docProps/custom.xml` entirely because its schema is
+ * free-form and can hide arbitrary metadata payloads. Idempotent.
*/
export async function scrubDocxMetadata(zip: JSZip): Promise {
const targets = ["docProps/core.xml", "docProps/app.xml"];
for (const path of targets) {
const file = zip.file(path);
if (file === null) continue;
- const xml = await file.async("string");
+ const xml = await readZipEntry(zip, path);
const cleaned = scrubMetadataXml(xml, METADATA_SENSITIVE_FIELDS);
zip.file(path, cleaned);
}
+
+ if (zip.file("docProps/custom.xml") !== null) {
+ zip.remove("docProps/custom.xml");
+ }
+
+ if (zip.file("[Content_Types].xml") !== null) {
+ const xml = await readZipEntry(zip, "[Content_Types].xml");
+ zip.file("[Content_Types].xml", removeCustomPropsOverride(xml));
+ }
}
/**
 * Escape every regular-expression metacharacter in `s` so the result can be
 * embedded in a pattern and match the input literally.
 *
 * @param s - Raw text that may contain regex syntax characters.
 * @returns `s` with a backslash inserted in front of each metacharacter.
 */
function escapeRegex(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const escaped = s.replace(metaChars, "\\$&");
  return escaped;
}
+
+/**
+ * Drop the `<Override>` entry for `/docProps/custom.xml` from
+ * `[Content_Types].xml` markup.
+ *
+ * The pattern is anchored on the element's opening `<Override` so the whole
+ * self-closing tag (plus any whitespace in front of it) is removed; matching
+ * from `PartName=` alone would leave a dangling `<Override` fragment behind.
+ * Both quote styles are accepted for the attribute value.
+ *
+ * @param xml - Content-types markup to filter.
+ * @returns `xml` without the custom-properties override; unchanged when absent.
+ */
+function removeCustomPropsOverride(xml: string): string {
+  return xml.replace(
+    /\s*<Override\b[^>]*\bPartName=["']\/docProps\/custom\.xml["'][^>]*\/>/g,
+    "",
+  );
+}
diff --git a/src/docx/verify-surfaces.test.ts b/src/docx/verify-surfaces.test.ts
index 1c10655..a20bc56 100644
--- a/src/docx/verify-surfaces.test.ts
+++ b/src/docx/verify-surfaces.test.ts
@@ -32,4 +32,13 @@ describe("verify-surfaces", () => {
"https://example.com/second",
]);
});
+
+ it("extracts relationship Target values from single-quoted attributes too", () => {
+ const rels = ``;
+
+ expect(extractRelationshipTargets(rels)).toEqual([
+ "https://example.com/first",
+ "mailto:second@example.com",
+ ]);
+ });
});
diff --git a/src/docx/verify-surfaces.ts b/src/docx/verify-surfaces.ts
index 94aac53..32b7d38 100644
--- a/src/docx/verify-surfaces.ts
+++ b/src/docx/verify-surfaces.ts
@@ -1,6 +1,7 @@
import type JSZip from "jszip";
import { extractScopeText } from "../detection/extract-text.js";
+import { readZipEntry } from "./load.js";
import { listScopes, readScopeXml } from "./scopes.js";
import type { Scope } from "./types.js";
@@ -50,7 +51,7 @@ export async function collectVerifySurfaces(zip: JSZip): Promise
const relsTargetSurfaces: RelsTargetSurface[] = [];
for (const path of listRelsPaths(zip)) {
- const xml = await zip.file(path)!.async("string");
+ const xml = await readZipEntry(zip, path);
for (const text of extractRelationshipTargets(xml)) {
relsTargetSurfaces.push({ kind: "rels-target", path, text });
}
@@ -86,7 +87,7 @@ export function extractFldSimpleInstrValues(xml: string): readonly string[] {
export function extractRelationshipTargets(relsXml: string): readonly string[] {
const out: string[] = [];
- const re = /]*\bTarget="([^"]*)"/g;
+ const re = /]*\bTarget=["']([^"']*)["']/g;
let match: RegExpExecArray | null;
while ((match = re.exec(relsXml)) !== null) {
out.push(decodeXml(match[1] ?? ""));
diff --git a/src/docx/verify.test.ts b/src/docx/verify.test.ts
index c9f4cc3..e31cd48 100644
--- a/src/docx/verify.test.ts
+++ b/src/docx/verify.test.ts
@@ -179,6 +179,36 @@ describe("verifyRedaction", () => {
expect(result.survived).toEqual([]);
});
+ it("fails verification when an external http URL survives in rels", async () => {
+ const zip = await syntheticDocx({
+ "word/document.xml": bodyWith("[REDACTED]"),
+ "word/_rels/document.xml.rels": ``,
+ });
+ const result = await verifyRedaction(zip, resolved("unrelated@example.com"));
+ expect(result.isClean).toBe(false);
+ expect(result.survived).toEqual([
+ expect.objectContaining({
+ text: "http://evil.example/track",
+ surface: "rels",
+ }),
+ ]);
+ });
+
+ it("fails verification when a single-quoted external https URL survives in rels", async () => {
+ const zip = await syntheticDocx({
+ "word/document.xml": bodyWith("[REDACTED]"),
+ "word/_rels/document.xml.rels": ``,
+ });
+ const result = await verifyRedaction(zip, resolved("unrelated@example.com"));
+ expect(result.isClean).toBe(false);
+ expect(result.survived).toEqual([
+ expect.objectContaining({
+ text: "https://evil.example/track",
+ surface: "rels",
+ }),
+ ]);
+ });
+
it("enumerates multiple rels files in sorted path order", async () => {
const zip = await syntheticDocx({
"word/document.xml": bodyWith("[REDACTED]"),
diff --git a/src/docx/verify.ts b/src/docx/verify.ts
index 3c973ac..bcfbe48 100644
--- a/src/docx/verify.ts
+++ b/src/docx/verify.ts
@@ -36,6 +36,7 @@ import { collectVerifySurfaces } from "./verify-surfaces.js";
import type { Scope } from "./types.js";
export type VerifySurfaceKind = "text" | "field" | "rels";
+const EXTERNAL_URL_TARGET_ID = "security:external-url";
/** One sensitive string that survived in one scope. */
export interface SurvivedString {
@@ -121,6 +122,22 @@ export async function verifyRedaction(
for (const surface of surfaces.relsTargetSurfaces) {
const scope = { kind: "rels", path: surface.path } as unknown as Scope;
+ if (isExternalHttpUrl(surface.text)) {
+ mergeSurvival(
+ survivedByKey,
+ {
+ id: `${EXTERNAL_URL_TARGET_ID}:${surface.text}`,
+ displayText: surface.text,
+ redactionLiterals: [surface.text],
+ verificationLiterals: [surface.text],
+ scopes: [scope],
+ },
+ scope,
+ "rels",
+ 1,
+ surface.text,
+ );
+ }
for (const target of activeTargets) {
for (const literal of target.verificationLiterals) {
const count = countOccurrences(surface.text, literal);
@@ -207,3 +224,7 @@ function mergeSurvival(
matchedLiteral: existing.matchedLiteral ?? matchedLiteral,
});
}
+
+/**
+ * Report whether a relationship target is an absolute `http`/`https` URL.
+ *
+ * URI schemes are case-insensitive and URL parsers commonly ignore leading
+ * whitespace, so `HTTPS://host` and ` http://host` are treated the same as
+ * their canonical spellings. A case-sensitive prefix check would let those
+ * variants slip past verification.
+ *
+ * @param text - Target value to classify.
+ * @returns `true` when `text` starts with an http(s) scheme, otherwise `false`.
+ */
+function isExternalHttpUrl(text: string): boolean {
+  return /^\s*https?:\/\//i.test(text);
+}
diff --git a/src/finalize/guided-recovery.ts b/src/finalize/guided-recovery.ts
index b16acc9..eee31d1 100644
--- a/src/finalize/guided-recovery.ts
+++ b/src/finalize/guided-recovery.ts
@@ -1,5 +1,4 @@
-import JSZip from "jszip";
-
+import { loadDocxZip } from "../docx/load.js";
import {
finalizeRedaction,
type FinalizeOptions,
@@ -261,7 +260,7 @@ async function defaultRepairPass(
repairPlan: RepairPlan,
options: GuidedRecoveryOptions,
): Promise {
- const zip = await JSZip.loadAsync(bytes.slice());
+ const zip = await loadDocxZip(bytes);
await applyRelsRepairsToZip(
zip,
repairPlan.relsRepairs,
diff --git a/src/finalize/preflight-expansion.test.ts b/src/finalize/preflight-expansion.test.ts
index 4b778a9..7a63ccd 100644
--- a/src/finalize/preflight-expansion.test.ts
+++ b/src/finalize/preflight-expansion.test.ts
@@ -2,7 +2,10 @@ import { describe, expect, it } from "vitest";
import JSZip from "jszip";
import { buildResolvedTargetsFromStrings } from "../selection-targets.js";
-import { buildPreflightExpansionPlan } from "./preflight-expansion.js";
+import {
+ applyRelsRepairsToZip,
+ buildPreflightExpansionPlan,
+} from "./preflight-expansion.js";
import type { ResolvedRedactionTarget } from "../selection-targets.js";
const W_NS = `xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"`;
@@ -15,6 +18,14 @@ async function syntheticDocx(parts: Record): Promise
return zip.generateAsync({ type: "uint8array" });
}
+/**
+ * Test helper: build an in-memory JSZip with one entry per key of `parts`
+ * (entry path mapped to its content) and return the archive object itself.
+ */
+function syntheticZip(parts: Record): JSZip {
+  const archive = new JSZip();
+  Object.entries(parts).forEach(([entryPath, body]) => {
+    archive.file(entryPath, body);
+  });
+  return archive;
+}
+
// Test helper: returns the template-literal string built from `text`.
// NOTE(review): as shown here the template contains only `${text}`; any
// surrounding document markup appears to be missing from this view — confirm
// against the real test file before relying on the exact output.
function bodyWith(text: string): string {
return `${text}`;
}
@@ -144,4 +155,75 @@ describe("preflight-expansion", () => {
expandedLiteralCount: 0,
});
});
+
+ it("strips double-quoted http URLs from rels", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`Target=""`);
+ expect(rels).not.toContain("http://evil.example/pixel");
+ });
+
+ it("strips double-quoted https URLs from rels", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`Target=""`);
+ expect(rels).not.toContain("https://track.example/pixel");
+ });
+
+ it("strips single-quoted https URLs from rels", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`Target=''`);
+ expect(rels).not.toContain("https://track.example/pixel");
+ });
+
+ it("preserves mailto targets in rels", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`mailto:legal@example.com`);
+ });
+
+ it("preserves relative targets in rels", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`Target="media/image1.png"`);
+ });
+
+ it("strips only external URLs in mixed rels content", async () => {
+ const zip = syntheticZip({
+ "word/_rels/document.xml.rels": ``,
+ });
+
+ await applyRelsRepairsToZip(zip, new Map());
+
+ const rels = await zip.file("word/_rels/document.xml.rels")!.async("string");
+ expect(rels).toContain(`Target=""`);
+ expect(rels).toContain(`Target="media/image1.png"`);
+ expect(rels).toContain(`Target="mailto:legal@example.com"`);
+ });
});
diff --git a/src/finalize/preflight-expansion.ts b/src/finalize/preflight-expansion.ts
index b088610..2d7e1eb 100644
--- a/src/finalize/preflight-expansion.ts
+++ b/src/finalize/preflight-expansion.ts
@@ -1,5 +1,6 @@
-import JSZip from "jszip";
+import type JSZip from "jszip";
+import { loadDocxZip, readZipEntry } from "../docx/load.js";
import { collectVerifySurfaces } from "../docx/verify-surfaces.js";
import type { ResolvedRedactionTarget } from "../selection-targets.js";
@@ -29,7 +30,7 @@ export async function buildPreflightExpansionPlan(
};
}
- const zip = await JSZip.loadAsync(bytes.slice());
+ const zip = await loadDocxZip(bytes);
const surfaces = await collectVerifySurfaces(zip);
const extraLiterals = new Map>();
const relsRepairs = new Map>();
@@ -120,11 +121,12 @@ export async function applyRelsRepairsToZip(
relsRepairs: ReadonlyMap,
placeholder = "[REDACTED]",
): Promise {
- for (const [path, literals] of relsRepairs) {
- const file = zip.file(path);
- if (file === null) continue;
- const xml = await file.async("string");
- const repaired = repairRelationshipTargets(xml, literals, placeholder);
+ for (const path of listRelsPaths(zip)) {
+ const literals = relsRepairs.get(path) ?? [];
+ const xml = await readZipEntry(zip, path);
+ const repaired = stripExternalUrls(
+ repairRelationshipTargets(xml, literals, placeholder),
+ );
zip.file(path, repaired);
}
}
@@ -153,6 +155,18 @@ function repairRelationshipTargets(
);
}
+/**
+ * Blank out the `Target` attribute of every `<Relationship>` element whose
+ * value is an absolute http(s) URL, keeping the attribute and its quote style.
+ *
+ * The scheme is matched case-insensitively and leading whitespace inside the
+ * quotes is tolerated, because URI schemes are case-insensitive and a
+ * case-sensitive match would leave `HTTPS://…` targets in place. Relative
+ * targets and other schemes (for example `mailto:`) are left untouched.
+ *
+ * @param relsXml - Contents of a `.rels` part.
+ * @returns The markup with external http(s) targets replaced by empty strings.
+ */
+function stripExternalUrls(relsXml: string): string {
+  return relsXml
+    // Double-quoted targets: keep `Target=` and emit an empty "" value.
+    .replace(/(<Relationship\b[^>]*\bTarget=)"\s*https?:\/\/[^"]*"/gi, '$1""')
+    // Single-quoted targets: same treatment, preserving the '' quoting.
+    .replace(/(<Relationship\b[^>]*\bTarget=)'\s*https?:\/\/[^']*'/gi, "$1''");
+}
+
function idleSummary(): PreflightExpansionSummary {
return {
touchedScopePaths: [],
@@ -191,3 +205,14 @@ function decodeXml(text: string): string {
String.fromCodePoint(Number.parseInt(dec, 10)),
);
}
+
+/**
+ * Collect the path of every non-directory entry in `zip` whose name ends in
+ * `.rels`, returned in sorted order.
+ *
+ * @param zip - Archive to enumerate.
+ * @returns Sorted list of matching entry paths (empty when none match).
+ */
+function listRelsPaths(zip: JSZip): string[] {
+  const found: string[] = [];
+  zip.forEach((entryPath, entry) => {
+    const isRelsFile = !entry.dir && entryPath.endsWith(".rels");
+    if (isRelsFile) found.push(entryPath);
+  });
+  found.sort();
+  return found;
+}
diff --git a/src/ui/DocumentPreview.svelte b/src/ui/DocumentPreview.svelte
index 68bf34a..31f3311 100644
--- a/src/ui/DocumentPreview.svelte
+++ b/src/ui/DocumentPreview.svelte
@@ -14,12 +14,8 @@
have that can come later.
-->