Skip to content

Commit dcf480e

Browse files
committed
feat: implement redaction functionality for sensitive data in ingest pipeline and add tests
1 parent 1411ee2 commit dcf480e

6 files changed

Lines changed: 206 additions & 42 deletions

File tree

AGENT.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -982,3 +982,13 @@ Do not rewrite history; append a new snapshot each run.
982982
- unstaged: 0
983983
- untracked: 0
984984
- files: src/index.ts
985+
986+
## Status Snapshot — 2026-02-26T10:27:01.176Z
987+
- source: pre-commit
988+
- repo: /Users/anilp/Code/codaph
989+
- branch: agents/claude
990+
- head: 1411ee2
991+
- staged: 5
992+
- unstaged: 0
993+
- untracked: 0
994+
- files: src/lib/ingest-pipeline.ts, src/lib/redactor.ts, src/lib/security.ts, test/lib-ingest-pipeline.test.ts, test/lib-redactor.test.ts

src/lib/ingest-pipeline.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import {
66
type MirrorAppender,
77
type ReasoningAvailability,
88
} from "./core-types";
9-
import { redactUnknown } from "./security";
9+
import { redactRawLine, redactUnknown } from "./redactor";
1010

1111
export interface IngestContext {
1212
source: AgentSource;
@@ -251,7 +251,7 @@ export class IngestPipeline {
251251
}
252252

253253
/**
 * Writes one raw transcript line to the mirror for `sessionId`.
 * The line is passed through `redactRawLine` first, so the mirror only ever
 * receives the redacted form.
 */
async ingestRawLine(sessionId: string, line: string): Promise<void> {
  const sanitizedLine = redactRawLine(line);
  await this.mirror.appendRawLine(sessionId, sanitizedLine);
}
256256

257257
async flush(): Promise<void> {

src/lib/redactor.ts

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import { isObject } from "./core-types";
2+
3+
// Placeholders written in place of redacted material.
const REDACTED = "[REDACTED]";
const REDACTED_MUBIT_KEY = "[REDACTED_MUBIT_KEY]";
const REDACTED_API_KEY = "[REDACTED_API_KEY]";
const REDACTED_BEARER = "[REDACTED_BEARER_TOKEN]";
const REDACTED_JWT = "[REDACTED_JWT]";
const REDACTED_URL_CREDENTIAL = "[REDACTED_URL_CREDENTIAL]";
/** Common prefix of every placeholder emitted by this module. */
const REDACTED_PLACEHOLDER_PREFIX = "[REDACTED";

// Provider-specific token shapes.
const MUBIT_KEY = /\bmbt_[A-Za-z0-9._-]{20,}\b/g;
const GENERIC_SK_KEY = /\bsk-[A-Za-z0-9._-]{20,}\b/g;
const ANTHROPIC_KEY = /\bsk-ant-[A-Za-z0-9._-]{16,}\b/g;
const GITHUB_PAT = /\bgithub_pat_[A-Za-z0-9_]{20,}\b/g;
const GITHUB_TOKEN = /\bgh(?:p|o|u|s|r)_[A-Za-z0-9]{20,}\b/g;
const GOOGLE_API_KEY = /\bAIza[0-9A-Za-z\-_]{20,}\b/g;
const SLACK_TOKEN = /\bxox[baprs]-[A-Za-z0-9-]{10,}\b/gi;
const AWS_ACCESS_KEY_ID = /\b(?:AKIA|ASIA|AGPA|AIDA|AROA|ANPA)[A-Z0-9]{16}\b/g;
const JWT_TOKEN = /\beyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b/g;
const PEM_PRIVATE_KEY_BLOCK = /-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z0-9 ]*PRIVATE KEY-----/g;
// Despite the name this only matches PRIVATE KEY headers, i.e. a subset of
// PEM_PRIVATE_KEY_BLOCK; it is kept as a second pass.
const PEM_CERT_BLOCK = /-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----[\s\S]*?-----END (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----/g;

// Contextual patterns: group 1 is the prefix that is kept, the rest is replaced.
const AUTHORIZATION_BEARER =
  /(authorization\s*[:=]\s*(?:bearer|token)\s+)([^\s"',;]{8,})/gi;
const BASIC_AUTH_HEADER =
  /(authorization\s*[:=]\s*basic\s+)([A-Za-z0-9+/=]{8,})/gi;
const SECRET_QUERY_PARAM =
  /([?&](?:api[_-]?key|token|access[_-]?token|refresh[_-]?token|secret|password)=)([^&#\s]+)/gi;
// `scheme://user:password@` for any URI scheme (postgres, redis, amqp, ...),
// not only http(s). The scheme length is bounded to keep matching cheap on
// long dotted/hyphenated runs; the user part may be empty (`redis://:pw@host`).
const URL_USERINFO = /(\b[a-z][a-z0-9+.-]{0,31}:\/\/)([^\/\s:@]*):([^@\s\/]+)@/gi;
const SECRET_ASSIGNMENT =
  /(^|[\s,{([])((?:api[_-]?key|access[_-]?token|refresh[_-]?token|session[_-]?token|client[_-]?secret|private[_-]?key|password|passwd|pwd|token|secret))(\s*[:=]\s*["']?)([^\s"',}\])]{6,})/gim;
const ENV_SECRET_ASSIGNMENT =
  /\b([A-Z0-9_]*(?:API_KEY|APIKEY|TOKEN|SECRET|PASSWORD|PASSWD|PRIVATE_KEY|ACCESS_KEY|REFRESH_TOKEN|SESSION_TOKEN)[A-Z0-9_]*)\s*=\s*("[^"]{4,}"|'[^']{4,}'|[^\s"']{4,})/g;
const AWS_SECRET_ACCESS_KEY_ASSIGNMENT =
  /(\baws(?:_|-)?secret(?:_|-)?access(?:_|-)?key\b\s*[:=]\s*["']?)([^\s"']{16,})/gi;
const PRIVATE_KEY_JSON_FIELD =
  /("private_key"\s*:\s*")([\s\S]*?)(")/gi;

// Object key names whose string values are redacted wholesale.
const SENSITIVE_KEY_NAME =
  /(?:^|[_-])(api(?:[_-]?key)?|apikey|token|secret|password|passwd|pwd|authorization|auth|private(?:[_-]?key)?|client(?:[_-]?secret)?|access(?:[_-]?token)?|refresh(?:[_-]?token)?|session(?:[_-]?token)?)(?:$|[_-])/i;

/**
 * Applies every string-level redaction pattern to `input`, in a fixed order:
 * key blocks first, then provider token shapes, then contextual patterns
 * (headers, query params, URL credentials, assignments).
 *
 * @param input Arbitrary text that may contain credentials.
 * @returns The text with each recognised secret replaced by a `[REDACTED…]`
 *   placeholder. Re-running the function on its own output leaves it unchanged
 *   for assignment-style matches.
 */
function redactStringPatterns(input: string): string {
  return input
    .replace(PEM_PRIVATE_KEY_BLOCK, "[REDACTED_PRIVATE_KEY_BLOCK]")
    .replace(PEM_CERT_BLOCK, "[REDACTED_PRIVATE_KEY_BLOCK]")
    .replace(PRIVATE_KEY_JSON_FIELD, `$1[REDACTED_PRIVATE_KEY]$3`)
    .replace(MUBIT_KEY, REDACTED_MUBIT_KEY)
    // The Anthropic pattern runs before the generic `sk-` one, which would
    // otherwise claim the same text.
    .replace(ANTHROPIC_KEY, REDACTED_API_KEY)
    .replace(GENERIC_SK_KEY, REDACTED_API_KEY)
    .replace(GITHUB_PAT, REDACTED_API_KEY)
    .replace(GITHUB_TOKEN, REDACTED_API_KEY)
    .replace(GOOGLE_API_KEY, REDACTED_API_KEY)
    .replace(SLACK_TOKEN, REDACTED_API_KEY)
    .replace(AWS_ACCESS_KEY_ID, REDACTED_API_KEY)
    .replace(JWT_TOKEN, REDACTED_JWT)
    .replace(AUTHORIZATION_BEARER, (_m: string, prefix: string) => `${prefix}${REDACTED_BEARER}`)
    .replace(BASIC_AUTH_HEADER, (_m: string, prefix: string) => `${prefix}${REDACTED}`)
    .replace(SECRET_QUERY_PARAM, (_m: string, prefix: string) => `${prefix}${REDACTED}`)
    .replace(
      URL_USERINFO,
      (_m: string, scheme: string) => `${scheme}${REDACTED_URL_CREDENTIAL}:${REDACTED_URL_CREDENTIAL}@`,
    )
    .replace(AWS_SECRET_ACCESS_KEY_ASSIGNMENT, (_m: string, prefix: string) => `${prefix}${REDACTED}`)
    .replace(ENV_SECRET_ASSIGNMENT, (_m: string, key: string) => `${key}=${REDACTED}`)
    .replace(
      SECRET_ASSIGNMENT,
      (match: string, leading: string, key: string, sep: string, value: string) =>
        // An earlier pass may already have replaced the value. The value class
        // stops at `]`, so rewriting a placeholder here would leave a stray
        // bracket behind (`token=[REDACTED]]`); keep the existing placeholder.
        value.startsWith(REDACTED_PLACEHOLDER_PREFIX)
          ? match
          : `${leading}${key}${sep}${REDACTED}`,
    );
}
62+
63+
/**
 * camelCase keys whose final word is a credential term (`authToken`,
 * `dbPassword`, `openaiApiKey`). SENSITIVE_KEY_NAME only recognises such a
 * term at the start of the key or after `_`/`-`, so these would otherwise be
 * missed even though their snake_case spellings are caught. Anchoring at the
 * end keeps keys such as `tokenEstimate` or `maxTokens` untouched.
 */
const CAMEL_CASE_SENSITIVE_SUFFIX =
  /[a-z0-9](?:ApiKey|APIKey|Token|Secret|Password|Passwd|Pwd|PrivateKey|Authorization)$/;

/**
 * Decides whether a string stored under `keyName` should be redacted wholesale.
 *
 * @param keyName Object key owning the value; `undefined` for top-level values.
 * @returns `true` when the key matches SENSITIVE_KEY_NAME or ends in a
 *   camelCase credential word; `false` for an empty or missing key.
 */
function isSensitiveKeyName(keyName: string | undefined): boolean {
  if (!keyName) {
    return false;
  }
  return SENSITIVE_KEY_NAME.test(keyName) || CAMEL_CASE_SENSITIVE_SUFFIX.test(keyName);
}
69+
70+
/**
 * Recursively builds a redacted copy of `value`.
 *
 * - Strings under a sensitive key become the generic placeholder; all other
 *   strings go through the string-level patterns.
 * - Arrays are mapped element by element, each element keeping the key of the
 *   array itself.
 * - Objects are copied key by key, each value redacted under its own key.
 * - Anything else (numbers, booleans, null, undefined, ...) is returned as is.
 *
 * @param value Value to sanitise; the input is not mutated.
 * @param keyName Key under which `value` was found, if any.
 * @returns A value of the same shape with secrets replaced.
 */
function redactUnknownInternal<T>(value: T, keyName?: string): T {
  if (typeof value === "string") {
    if (isSensitiveKeyName(keyName)) {
      return REDACTED as T;
    }
    return redactStringPatterns(value) as T;
  }

  if (Array.isArray(value)) {
    return value.map((item) => redactUnknownInternal(item, keyName)) as T;
  }

  if (isObject(value)) {
    const entries = Object.entries(value).map(
      ([k, v]): [string, unknown] => [k, redactUnknownInternal(v, k)],
    );
    // Object.fromEntries defines own data properties. Assigning `out[k] = v`
    // would run the `__proto__` setter for a key named "__proto__" (which
    // JSON.parse can produce), dropping the key and swapping the prototype.
    const out: Record<string, unknown> = Object.fromEntries(entries);
    return out as T;
  }

  return value;
}
92+
93+
/**
 * Redacts secrets found in free-form text.
 *
 * @param input Text to sanitise.
 * @returns The result of running `redactStringPatterns` over `input`.
 */
export function redactSensitiveString(input: string): string {
  const sanitized = redactStringPatterns(input);
  return sanitized;
}
96+
97+
/**
 * Redacts secrets in an arbitrary value.
 *
 * @param value Value to sanitise; treated as a top-level value, so no owning
 *   key name is supplied to the recursive walk.
 * @returns Whatever `redactUnknownInternal` produces for `value`.
 */
export function redactUnknown<T>(value: T): T {
  const sanitized: T = redactUnknownInternal(value);
  return sanitized;
}
100+
101+
/**
 * Redacts one raw transcript line.
 *
 * A line whose trimmed form starts with `{` or `[` (and is longer than one
 * character) is parsed as JSON, redacted structurally, and re-serialised with
 * `JSON.stringify`, so its original whitespace is not preserved. If parsing
 * or structural redaction throws, or the line does not look like JSON, the
 * line is redacted as plain text instead.
 *
 * @param input The raw line.
 * @returns The redacted line.
 */
export function redactRawLine(input: string): string {
  const asPlainText = (): string => redactSensitiveString(input);

  const candidate = input.trim();
  const opensJson = candidate.startsWith("{") || candidate.startsWith("[");
  if (!(opensJson && candidate.length > 1)) {
    return asPlainText();
  }

  try {
    const document = JSON.parse(input) as unknown;
    const sanitized = redactUnknown(document);
    return JSON.stringify(sanitized);
  } catch {
    // Not valid JSON after all: fall back to string-level redaction.
    return asPlainText();
  }
}

src/lib/security.ts

Lines changed: 1 addition & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1 @@
1-
import { isObject } from "./core-types";
2-
3-
const MUBIT_KEY = /mbt_[A-Za-z0-9_-]{20,}/g;
4-
const GENERIC_API_KEY = /sk-[A-Za-z0-9_-]{20,}/g;
5-
const KV_SECRET = /(api[_-]?key|token|secret)(\s*[:=]\s*["']?)([^\s"']{8,})/gi;
6-
const SECRET_KEY_NAME = /(api[_-]?key|token|secret)/i;
7-
8-
export function redactSensitiveString(input: string): string {
9-
return input
10-
.replace(MUBIT_KEY, "[REDACTED_MUBIT_KEY]")
11-
.replace(GENERIC_API_KEY, "[REDACTED_API_KEY]")
12-
.replace(KV_SECRET, (_m, k, sep) => `${k}${sep}[REDACTED]`);
13-
}
14-
15-
function redactUnknownInternal<T>(value: T, keyName?: string): T {
16-
if (typeof value === "string") {
17-
if (keyName && SECRET_KEY_NAME.test(keyName)) {
18-
return "[REDACTED]" as T;
19-
}
20-
return redactSensitiveString(value) as T;
21-
}
22-
23-
if (Array.isArray(value)) {
24-
return value.map((item) => redactUnknownInternal(item, keyName)) as T;
25-
}
26-
27-
if (isObject(value)) {
28-
const out: Record<string, unknown> = {};
29-
for (const [k, v] of Object.entries(value)) {
30-
out[k] = redactUnknownInternal(v, k);
31-
}
32-
return out as T;
33-
}
34-
35-
return value;
36-
}
37-
38-
export function redactUnknown<T>(value: T): T {
39-
return redactUnknownInternal(value);
40-
}
1+
export { redactSensitiveString, redactUnknown, redactRawLine } from "./redactor";

test/lib-ingest-pipeline.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,22 @@ describe("ingest pipeline", () => {
151151
const batchArg = (firstCallArgs[0] ?? []) as Array<{ eventId: string }>;
152152
expect(batchArg.length).toBe(2);
153153
});
154+
155+
// The mirror must only ever receive the redacted form of a raw line.
it("redacts raw transcript lines before writing to the local mirror", async () => {
  const appendRawLine = vi.fn(async () => {});
  const appendEvent = vi.fn(async () => ({ segment: "x", offset: 1, checksum: "abc" }));
  const pipeline = new IngestPipeline({ appendEvent, appendRawLine });

  const rawLine =
    '{"type":"user","apiKey":"sk-123456789012345678901234567890","tokenEstimate":"24k"}';
  await pipeline.ingestRawLine("s1", rawLine);

  expect(appendRawLine).toHaveBeenCalledTimes(1);

  // Second argument of the first call is the line handed to the mirror.
  const recordedArgs = (appendRawLine.mock.calls[0] ?? []) as unknown[];
  const written = String(recordedArgs[1] ?? "");

  expect(written).not.toContain("sk-1234567890");
  expect(written).toContain("[REDACTED]");
  expect(written).toContain('"tokenEstimate":"24k"');
});
154172
});

test/lib-redactor.test.ts

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import { describe, expect, it } from "vitest";
2+
import { redactRawLine, redactSensitiveString, redactUnknown } from "../src/lib/redactor";
3+
4+
describe("redactor", () => {
  // Provider token shapes must not survive string-level redaction.
  it("redacts common provider/api tokens in strings", () => {
    const samples = [
      "mbt_mubit-dev-1_ovr38uk1bb4johkn_qHLXEni8HpwL4JgVDunPZ7ZfPFWeuLK3AjfZO4oIafUx3ZL0fyegjVefSwsKPBVi",
      "sk-123456789012345678901234567890",
      "sk-ant-api03-123456789012345678901234567890",
      "github_pat_11ABCDEF123456789012345678901234567890",
      "ghp_123456789012345678901234567890123456",
      "AIzaSyD12345678901234567890123456789012345",
    ];
    const sanitized = redactSensitiveString(samples.join(" "));

    const leakMarkers = [/mbt_mubit-dev/i, /\bsk-/, /github_pat_/, /\bghp_/, /\bAIza/];
    for (const marker of leakMarkers) {
      expect(sanitized).not.toMatch(marker);
    }
    expect(sanitized).toContain("[REDACTED");
  });

  // Contextual patterns: header value, userinfo in a URL, secret query param.
  it("redacts auth headers, query params, and URL credentials", () => {
    const header = "Authorization: Bearer super_secret_token_123456";
    const url = "https://user:pass@example.com?a=1&api_key=xyz_secret_12345";
    const sanitized = redactSensitiveString([header, url].join(" "));

    expect(sanitized).toContain("[REDACTED_BEARER_TOKEN]");
    expect(sanitized).not.toContain("super_secret_token_123456");
    expect(sanitized).toContain("[REDACTED_URL_CREDENTIAL]");
    expect(sanitized).not.toContain("user:pass@");
    expect(sanitized).not.toContain("xyz_secret_12345");
  });

  // Key-name based redaction must not catch look-alike keys.
  it("redacts nested secret fields but keeps non-secret keys like tokenEstimate", () => {
    const pemBlock = ["-----BEGIN PRIVATE KEY-----", "abc", "-----END PRIVATE KEY-----"].join("\n");
    const sanitized = redactUnknown({
      tokenEstimate: "24k",
      apiKey: "should_hide",
      nested: {
        authorization: "Bearer abcdefghijklmnopqrstuvwxyz",
        private_key: pemBlock,
      },
    });

    expect(sanitized.tokenEstimate).toBe("24k");
    expect(sanitized.apiKey).toBe("[REDACTED]");
    expect(String(sanitized.nested.authorization)).toContain("[REDACTED]");
    expect(sanitized.nested.private_key).toBe("[REDACTED]");
  });

  // JSON lines are redacted structurally and re-serialised.
  it("redacts raw json lines by parsing and sanitizing values", () => {
    const record = {
      type: "assistant",
      tokenEstimate: "24k",
      apiKey: "sk-123456789012345678901234567890",
      nested: { token: "abc123456789secret" },
    };
    const sanitized = redactRawLine(JSON.stringify(record));

    expect(sanitized).not.toContain("sk-1234567890");
    expect(sanitized).not.toContain("abc123456789secret");
    expect(sanitized).toContain('"tokenEstimate":"24k"');
  });
});

0 commit comments

Comments
 (0)