diff --git a/dataclaw/parsers/claude.py b/dataclaw/parsers/claude.py index 507a7f5..0274eb0 100644 --- a/dataclaw/parsers/claude.py +++ b/dataclaw/parsers/claude.py @@ -8,7 +8,6 @@ from ..export_tasks import ExportSessionTask from ..secrets import should_skip_large_binary_string from .common import ( - anonymize_value, collect_project_sessions, count_existing_paths_and_sizes, iter_jsonl, @@ -151,7 +150,7 @@ def parse_export_session_task( return None -def build_tool_result_map(entries: Iterable[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, dict]: +def build_tool_result_map(entries: Iterable[dict[str, Any]]) -> dict[str, dict]: """Pre-pass: build a map of tool_use_id -> {output, status} from tool_result blocks.""" result: dict[str, dict] = {} for entry in entries: @@ -166,7 +165,7 @@ def build_tool_result_map(entries: Iterable[dict[str, Any]], anonymizer: Anonymi tid = block.get("tool_use_id") if not tid: continue - output = build_tool_result_output(block, entry, anonymizer) + output = build_tool_result_output(block, entry) result[tid] = { "output": output, "status": "error" if block.get("is_error") else "success", @@ -177,13 +176,12 @@ def build_tool_result_map(entries: Iterable[dict[str, Any]], anonymizer: Anonymi def build_tool_result_output( block: dict[str, Any], entry: dict[str, Any], - anonymizer: Anonymizer, ) -> dict[str, Any]: - text, raw_content = parse_tool_result_content(block.get("content"), anonymizer) + text, raw_content = parse_tool_result_content(block.get("content")) if text is None: - text = extract_tool_result_text(entry.get("toolUseResult"), anonymizer) + text = extract_tool_result_text(entry.get("toolUseResult")) - raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text, anonymizer) + raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text) source_tool_uuid = entry.get("sourceToolAssistantUUID") if isinstance(source_tool_uuid, str) and source_tool_uuid: if raw_result is None: @@ -201,9 +199,9 @@ def build_tool_result_output( return output -def parse_tool_result_content(content: Any, anonymizer: Anonymizer) -> tuple[str | None, Any]: +def parse_tool_result_content(content: Any) -> tuple[str | None, Any]: if isinstance(content, str): - text = normalize_tool_result_text(content, anonymizer) + text = normalize_tool_result_text(content) if text is not None: return text, None if should_skip_large_binary_string(content): @@ -215,36 +213,34 @@ def parse_tool_result_content(content: Any, anonymizer: Anonymizer) -> tuple[str raw_parts: list[Any] = [] for part in content: if isinstance(part, dict): - anonymized_part = anonymize_value("content", part, anonymizer) if part.get("type") == "text": - part_text = extract_tool_result_text(anonymized_part, anonymizer=None) + part_text = extract_tool_result_text(part) if part_text: text_parts.append(part_text) - raw_part = prune_empty_values(drop_duplicate_text_fields(anonymized_part, part_text)) + raw_part = prune_empty_values(drop_duplicate_text_fields(part, part_text)) if raw_part is not None and raw_part != {"type": "text"}: raw_parts.append(raw_part) continue - raw_parts.append(anonymized_part) + raw_parts.append(part) continue - raw_parts.append(anonymize_value("content", part, anonymizer)) + raw_parts.append(part) text = "\n\n".join(text_parts).strip() if text_parts else None return text or None, prune_empty_values(raw_parts) if isinstance(content, dict): - anonymized_content = anonymize_value("content", content, anonymizer) - text = extract_tool_result_text(anonymized_content, anonymizer=None) - raw = prune_empty_values(drop_duplicate_text_fields(anonymized_content, text)) + text = extract_tool_result_text(content) + raw = prune_empty_values(drop_duplicate_text_fields(content, text)) if raw == {"type": "text"}: raw = None return text, raw - return None, prune_empty_values(anonymize_value("content", content, anonymizer)) + return None, prune_empty_values(content) -def extract_tool_result_text(value: Any, anonymizer: Anonymizer | None) -> str | None: +def extract_tool_result_text(value: Any) -> str | None: if isinstance(value, str): - return normalize_tool_result_text(value, anonymizer) + return normalize_tool_result_text(value) if isinstance(value, list): text_parts = [] @@ -253,7 +249,7 @@ def extract_tool_result_text(value: Any, anonymizer: Anonymizer | None) -> str | continue if part.get("type") != "text": continue - text = normalize_tool_result_text(part.get("text"), anonymizer) + text = normalize_tool_result_text(part.get("text")) if text: text_parts.append(text) if text_parts: @@ -264,32 +260,29 @@ def extract_tool_result_text(value: Any, anonymizer: Anonymizer | None) -> str | return None for candidate in (value.get("stdout"), value.get("content"), value.get("text")): - text = normalize_tool_result_text(candidate, anonymizer) + text = normalize_tool_result_text(candidate) if text: return text file_info = value.get("file") if isinstance(file_info, dict): - return normalize_tool_result_text(file_info.get("content"), anonymizer) + return normalize_tool_result_text(file_info.get("content")) return None -def normalize_tool_result_text(value: Any, anonymizer: Anonymizer | None) -> str | None: +def normalize_tool_result_text(value: Any) -> str | None: if not isinstance(value, str): return None text = value.strip() if not text or should_skip_large_binary_string(text): return None - if anonymizer is None: - return text - return anonymizer.text(text) + return text def sanitize_tool_use_result( tool_use_result: Any, text: str | None, - anonymizer: Anonymizer, ) -> dict[str, Any] | None: if tool_use_result is None: return None @@ -297,12 +290,12 @@ def sanitize_tool_use_result( if isinstance(tool_use_result, str): if should_skip_large_binary_string(tool_use_result): return {"text": tool_use_result} - sanitized_text = normalize_tool_result_text(tool_use_result, anonymizer) + sanitized_text = normalize_tool_result_text(tool_use_result) if not sanitized_text or text_matches_tool_result(sanitized_text, text): return None return {"text": sanitized_text} - sanitized = anonymize_value("toolUseResult", tool_use_result, anonymizer) + sanitized = tool_use_result sanitized = drop_redundant_result_fields(sanitized) sanitized = drop_duplicate_text_fields(sanitized, text) pruned = prune_empty_values(sanitized) @@ -405,7 +398,6 @@ def parse_session_file( messages: list[dict[str, Any]] = [] metadata = { "session_id": filepath.stem, - "cwd": None, "git_branch": None, "claude_version": None, "model": None, @@ -423,7 +415,6 @@ def parse_session_file( messages, metadata, stats, - anonymizer, include_thinking, pending_tool_results=pending_tool_results, pending_tool_uses=pending_tool_uses, @@ -431,7 +422,7 @@ def parse_session_file( except OSError: return None - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) def find_subagent_sessions(project_dir: Path) -> list[Path]: @@ -469,7 +460,6 @@ def parse_subagent_session( messages: list[dict[str, Any]] = [] metadata = { "session_id": session_dir.name, - "cwd": None, "git_branch": None, "claude_version": None, "model": None, @@ -489,7 +479,6 @@ def parse_subagent_session( messages, metadata, stats, - anonymizer, include_thinking, pending_tool_results=pending_tool_results, pending_tool_uses=pending_tool_uses, @@ -501,7 +490,7 @@ def parse_subagent_session( return None metadata["session_id"] = resolve_subagent_session_id(session_dir, metadata["session_id"]) - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) def resolve_subagent_session_id(session_dir: Path, session_id: str) -> str: @@ -539,7 +528,6 @@ def process_entry( messages: list[dict[str, Any]], metadata: dict[str, Any], stats: dict[str, int], - anonymizer: Anonymizer, include_thinking: bool, tool_result_map: dict[str, dict] | None = None, pending_tool_results: dict[str, dict[str, Any]] | None = None, @@ -547,8 +535,7 @@ def process_entry( ) -> None: entry_type = entry.get("type") - if metadata["cwd"] is None and entry.get("cwd"): - metadata["cwd"] = anonymizer.path(entry["cwd"]) + if entry.get("cwd"): metadata["git_branch"] = entry.get("gitBranch") metadata["claude_version"] = entry.get("version") metadata["session_id"] = entry.get("sessionId", metadata["session_id"]) @@ -556,8 +543,8 @@ def process_entry( timestamp = normalize_timestamp(entry.get("timestamp")) if entry_type == "user": - _attach_claude_tool_results(entry, anonymizer, pending_tool_results, pending_tool_uses) - content = extract_user_content(entry, anonymizer) + _attach_claude_tool_results(entry, pending_tool_results, pending_tool_uses) + content = extract_user_content(entry) if content is not None: messages.append({"role": "user", "content": content, "timestamp": timestamp}) stats["user_messages"] += 1 @@ -566,7 +553,6 @@ def process_entry( elif entry_type == "assistant": msg = extract_assistant_content( entry, - anonymizer, include_thinking, tool_result_map, pending_tool_results, @@ -591,7 +577,7 @@ def process_entry( update_time_bounds(metadata, timestamp) -def extract_user_content(entry: dict[str, Any], anonymizer: Anonymizer) -> str | None: +def extract_user_content(entry: dict[str, Any]) -> str | None: msg_data = entry.get("message", {}) content = msg_data.get("content", "") if isinstance(content, list): @@ -599,12 +585,11 @@ def extract_user_content(entry: dict[str, Any], anonymizer: Anonymizer) -> str | content = "\n".join(text_parts) if not content or not content.strip(): return None - return anonymizer.text(content) + return content def extract_assistant_content( entry: dict[str, Any], - anonymizer: Anonymizer, include_thinking: bool, tool_result_map: dict[str, dict] | None = None, pending_tool_results: dict[str, dict[str, Any]] | None = None, @@ -626,15 +611,15 @@ def extract_assistant_content( if block_type == "text": text = block.get("text", "").strip() if text: - text_parts.append(anonymizer.text(text)) + text_parts.append(text) elif block_type == "thinking" and include_thinking: thinking = block.get("thinking", "").strip() if thinking: - thinking_parts.append(anonymizer.text(thinking)) + thinking_parts.append(thinking) elif block_type == "tool_use": tu: dict[str, Any] = { "tool": block.get("name"), - "input": parse_tool_input(block.get("name"), block.get("input", {}), anonymizer), + "input": parse_tool_input(block.get("input", {})), } tool_use_id = block.get("id") if tool_result_map is not None: @@ -671,7 +656,6 @@ def _apply_claude_tool_result(tool_use: dict[str, Any], result: dict[str, Any]) def _attach_claude_tool_results( entry: dict[str, Any], - anonymizer: Anonymizer, pending_tool_results: dict[str, dict[str, Any]] | None, pending_tool_uses: dict[str, list[dict[str, Any]]] | None, ) -> None: @@ -690,7 +674,7 @@ def _attach_claude_tool_results( continue result = { - "output": build_tool_result_output(block, entry, anonymizer), + "output": build_tool_result_output(block, entry), "status": "error" if block.get("is_error") else "success", } matched_tool_uses = [] if pending_tool_uses is None else pending_tool_uses.pop(tool_use_id, []) diff --git a/dataclaw/parsers/codex.py b/dataclaw/parsers/codex.py index 64a708f..55dbfa8 100644 --- a/dataclaw/parsers/codex.py +++ b/dataclaw/parsers/codex.py @@ -152,7 +152,7 @@ class CodexParseState: pending_user_timestamp: str | None = None -def build_tool_result_map(entries: Iterable[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, dict]: +def build_tool_result_map(entries: Iterable[dict[str, Any]]) -> dict[str, dict]: """Pre-pass: build call_id -> {output, status} from tool outputs.""" result: dict[str, dict] = {} for entry in entries: @@ -162,14 +162,14 @@ def build_tool_result_map(entries: Iterable[dict[str, Any]], anonymizer: Anonymi call_id = payload.get("call_id") if not call_id: continue - built = _build_codex_tool_result(payload, anonymizer) + built = _build_codex_tool_result(payload) if built is not None: result[call_id] = built return result -def _build_codex_tool_result(payload: dict[str, Any], anonymizer: Anonymizer) -> dict[str, Any] | None: +def _build_codex_tool_result(payload: dict[str, Any]) -> dict[str, Any] | None: payload_type = payload.get("type") if payload_type == "function_call_output": @@ -191,7 +191,7 @@ def _build_codex_tool_result(payload: dict[str, Any], anonymizer: Anonymizer) -> elif in_output: output_lines.append(line) if output_lines: - out["output"] = anonymizer.text("\n".join(output_lines).strip()) + out["output"] = "\n".join(output_lines).strip() return {"output": out, "status": "success"} if payload_type == "custom_tool_call_output": @@ -201,7 +201,7 @@ def _build_codex_tool_result(payload: dict[str, Any], anonymizer: Anonymizer) -> parsed = json.loads(raw) text = parsed.get("output", "") if text: - out["output"] = anonymizer.text(str(text)) + out["output"] = str(text) meta = parsed.get("metadata", {}) if "exit_code" in meta: out["exit_code"] = meta["exit_code"] @@ -209,7 +209,7 @@ def _build_codex_tool_result(payload: dict[str, Any], anonymizer: Anonymizer) -> out["duration_seconds"] = meta["duration_seconds"] except (json.JSONDecodeError, AttributeError): if raw: - out["output"] = anonymizer.text(raw) + out["output"] = raw return {"output": out, "status": "success"} return None @@ -224,7 +224,6 @@ def parse_session_file( state = CodexParseState( metadata={ "session_id": filepath.stem, - "cwd": None, "git_branch": None, "model": None, "start_time": None, @@ -243,11 +242,11 @@ def parse_session_file( entry_type = entry.get("type") if entry_type == "session_meta": - handle_session_meta(state, entry, filepath, anonymizer) + handle_session_meta(state, entry, filepath) elif entry_type == "turn_context": - handle_turn_context(state, entry, anonymizer) + handle_turn_context(state, entry) elif entry_type == "response_item": - handle_response_item(state, entry, anonymizer, include_thinking) + handle_response_item(state, entry, include_thinking) elif entry_type == "event_msg": payload = entry.get("payload", {}) event_type = payload.get("type") @@ -256,14 +255,14 @@ def parse_session_file( elif event_type == "agent_reasoning" and include_thinking: thinking = payload.get("text") if isinstance(thinking, str) and thinking.strip(): - cleaned = anonymizer.text(thinking.strip()) + cleaned = thinking.strip() if cleaned not in state._pending_thinking_seen: state._pending_thinking_seen.add(cleaned) state.pending_thinking.append(cleaned) elif event_type == "user_message": - handle_user_message(state, payload, timestamp, anonymizer) + handle_user_message(state, payload, timestamp) elif event_type == "agent_message": - handle_agent_message(state, payload, timestamp, anonymizer, include_thinking) + handle_agent_message(state, payload, timestamp, include_thinking) except OSError as e: logger.warning("Failed to read Codex session file %s: %s", filepath, e) return None @@ -284,21 +283,18 @@ def parse_session_file( else: state.metadata["model"] = "codex-unknown" - return make_session_result(state.metadata, state.messages, state.stats) + return make_session_result(state.metadata, state.messages, state.stats, anonymizer=anonymizer) def handle_session_meta( state: CodexParseState, entry: dict[str, Any], filepath: Path, - anonymizer: Anonymizer, ) -> None: payload = entry.get("payload", {}) session_cwd = payload.get("cwd") if isinstance(session_cwd, str) and session_cwd.strip(): state.raw_cwd = session_cwd - if state.metadata["cwd"] is None: - state.metadata["cwd"] = anonymizer.path(session_cwd) if state.metadata["session_id"] == filepath.stem: state.metadata["session_id"] = payload.get("id", state.metadata["session_id"]) if state.metadata["model_provider"] is None: @@ -311,21 +307,18 @@ def handle_session_meta( def handle_turn_context( state: CodexParseState, entry: dict[str, Any], - anonymizer: Anonymizer, ) -> None: payload = entry.get("payload", {}) session_cwd = payload.get("cwd") if isinstance(session_cwd, str) and session_cwd.strip(): state.raw_cwd = session_cwd - if state.metadata["cwd"] is None: - state.metadata["cwd"] = anonymizer.path(session_cwd) if state.metadata["model"] is None: model_name = payload.get("model") if isinstance(model_name, str) and model_name.strip(): state.metadata["model"] = model_name -def _build_codex_image_part(image_url: str, anonymizer: Anonymizer) -> dict[str, Any] | None: +def _build_codex_image_part(image_url: str) -> dict[str, Any] | None: if not image_url: return None @@ -346,7 +339,7 @@ def _build_codex_image_part(image_url: str, anonymizer: Anonymizer) -> dict[str, "type": "image", "source": { "type": "url", - "url": f"file://{anonymizer.path(image_url[7:])}", + "url": image_url, }, } @@ -354,12 +347,12 @@ def _build_codex_image_part(image_url: str, anonymizer: Anonymizer) -> dict[str, "type": "image", "source": { "type": "url", - "url": anonymizer.text(image_url), + "url": image_url, }, } -def _build_codex_local_image_part(image_path: str, state: CodexParseState, anonymizer: Anonymizer) -> dict[str, Any]: +def _build_codex_local_image_part(image_path: str, state: CodexParseState) -> dict[str, Any]: path = Path(image_path) if not path.is_absolute() and state.raw_cwd != UNKNOWN_CODEX_CWD: path = Path(state.raw_cwd) / path @@ -367,12 +360,12 @@ def _build_codex_local_image_part(image_path: str, state: CodexParseState, anony "type": "image", "source": { "type": "url", - "url": f"file://{anonymizer.path(str(path))}", + "url": f"file://{path}", }, } -def _extract_response_user_content_parts(payload: dict[str, Any], anonymizer: Anonymizer) -> list[dict[str, Any]]: +def _extract_response_user_content_parts(payload: dict[str, Any]) -> list[dict[str, Any]]: content_parts: list[dict[str, Any]] = [] for part in payload.get("content", []): if not isinstance(part, dict): @@ -381,7 +374,7 @@ def _extract_response_user_content_parts(payload: dict[str, Any], anonymizer: An continue image_url = part.get("image_url") if isinstance(image_url, str) and image_url: - image_part = _build_codex_image_part(image_url, anonymizer) + image_part = _build_codex_image_part(image_url) if image_part is not None: content_parts.append(image_part) return content_parts @@ -390,17 +383,16 @@ def _extract_response_user_content_parts(payload: dict[str, Any], anonymizer: An def _extract_event_user_content_parts( payload: dict[str, Any], state: CodexParseState, - anonymizer: Anonymizer, ) -> list[dict[str, Any]]: content_parts: list[dict[str, Any]] = [] for image_url in payload.get("images", []): if isinstance(image_url, str) and image_url: - image_part = _build_codex_image_part(image_url, anonymizer) + image_part = _build_codex_image_part(image_url) if image_part is not None: content_parts.append(image_part) for image_path in payload.get("local_images", []): if isinstance(image_path, str) and image_path: - content_parts.append(_build_codex_local_image_part(image_path, state, anonymizer)) + content_parts.append(_build_codex_local_image_part(image_path, state)) return content_parts @@ -464,13 +456,12 @@ def _is_user_content_entry(entry: dict[str, Any]) -> bool: def handle_response_item( state: CodexParseState, entry: dict[str, Any], - anonymizer: Anonymizer, include_thinking: bool, ) -> None: payload = entry.get("payload", {}) item_type = payload.get("type") if item_type == "message" and payload.get("role") == "user": - content_parts = _extract_response_user_content_parts(payload, anonymizer) + content_parts = _extract_response_user_content_parts(payload) if content_parts: state.pending_user_content_parts.extend(content_parts) if state.pending_user_timestamp is None: @@ -481,7 +472,7 @@ def handle_response_item( args_data = parse_tool_arguments(payload.get("arguments")) tool_use = { "tool": tool_name, - "input": parse_tool_input(tool_name, args_data, anonymizer), + "input": parse_tool_input(args_data), "_call_id": payload.get("call_id"), } _register_codex_tool_use(state, tool_use, payload.get("call_id")) @@ -490,9 +481,9 @@ def handle_response_item( tool_name = payload.get("name") raw_input = payload.get("input", "") if isinstance(raw_input, str): - inp = {"patch": anonymizer.text(raw_input)} + inp = {"patch": raw_input} else: - inp = parse_tool_input(tool_name, raw_input, anonymizer) + inp = parse_tool_input(raw_input) tool_use = { "tool": tool_name, "input": inp, @@ -503,7 +494,7 @@ def handle_response_item( elif item_type in {"function_call_output", "custom_tool_call_output"}: call_id = payload.get("call_id") if isinstance(call_id, str) and call_id: - result = _build_codex_tool_result(payload, anonymizer) + result = _build_codex_tool_result(payload) if result is not None: _attach_codex_tool_result(state, call_id, result) elif item_type == "reasoning" and include_thinking: @@ -512,7 +503,7 @@ def handle_response_item( continue text = summary.get("text") if isinstance(text, str) and text.strip(): - cleaned = anonymizer.text(text.strip()) + cleaned = text.strip() if cleaned not in state._pending_thinking_seen: state._pending_thinking_seen.add(cleaned) state.pending_thinking.append(cleaned) @@ -533,17 +524,16 @@ def handle_user_message( state: CodexParseState, payload: dict[str, Any], timestamp: str | None, - anonymizer: Anonymizer, ) -> None: flush_pending(state, timestamp) pending_parts = list(state.pending_user_content_parts) content = payload.get("message") if not pending_parts: - pending_parts.extend(_extract_event_user_content_parts(payload, state, anonymizer)) + pending_parts.extend(_extract_event_user_content_parts(payload, state)) msg: dict[str, Any] = {"role": "user", "timestamp": timestamp} if isinstance(content, str) and content.strip(): - msg["content"] = anonymizer.text(content.strip()) + msg["content"] = content.strip() if pending_parts: msg["content_parts"] = pending_parts @@ -568,13 +558,12 @@ def handle_agent_message( state: CodexParseState, payload: dict[str, Any], timestamp: str | None, - anonymizer: Anonymizer, include_thinking: bool, ) -> None: content = payload.get("message") msg: dict[str, Any] = {"role": "assistant"} if isinstance(content, str) and content.strip(): - msg["content"] = anonymizer.text(content.strip()) + msg["content"] = content.strip() if state.pending_thinking and include_thinking: msg["thinking"] = "\n\n".join(state.pending_thinking) if state.pending_tool_uses: diff --git a/dataclaw/parsers/common.py b/dataclaw/parsers/common.py index 411c0d2..46bc8f0 100644 --- a/dataclaw/parsers/common.py +++ b/dataclaw/parsers/common.py @@ -7,27 +7,32 @@ from .. import _json as json from ..anonymizer import Anonymizer -from ..secrets import redact_text, should_skip_large_binary_string +from ..secrets import should_skip_large_binary_string, should_skip_structured_string_transform logger = logging.getLogger(__name__) -_PATH_KEYS = frozenset( +_NON_ANON_STRING_KEYS = frozenset( { - "file_path", - "filePath", - "path", - "dir", - "dir_path", - "cwd", - "outputFile", - "workdir", - "targetFile", - "targetDirectory", - "relativeWorkspacePath", - "rootDir", + "session_id", + "model", + "git_branch", + "start_time", + "end_time", + "role", + "timestamp", + "tool", + "status", + "type", + "media_type", + "mime_type", + "id", + "tool_use_id", + "sourceToolAssistantUUID", + "source", + "project", + "wall_time", } ) -_CMD_KEYS = frozenset({"command", "cmd"}) def iter_jsonl(filepath: Path): @@ -58,10 +63,11 @@ def make_session_result( metadata: dict[str, Any], messages: list[dict[str, Any]], stats: dict[str, int], + anonymizer: Anonymizer | None = None, ) -> dict[str, Any] | None: if not messages: return None - return { + session = { "session_id": metadata["session_id"], "model": metadata["model"], "git_branch": metadata["git_branch"], @@ -70,6 +76,9 @@ def make_session_result( "messages": messages, "stats": stats, } + if anonymizer is None: + return session + return anonymize_session(session, anonymizer) def update_time_bounds(metadata: dict[str, Any], timestamp: str | None) -> None: @@ -139,29 +148,68 @@ def normalize_timestamp(value: Any) -> str | None: return None -def anonymize_value(key: str, value: Any, anonymizer: Anonymizer) -> Any: +def _should_skip_anonymizing_string(key: str | None, value: str, parent_dict: dict[str, Any] | None) -> bool: + if key in _NON_ANON_STRING_KEYS: + return True + return should_skip_structured_string_transform(key, value, parent_dict) + + +def _anonymize_session_value( + key: str | None, + value: Any, + anonymizer: Anonymizer, + parent_dict: dict[str, Any] | None = None, +) -> tuple[Any, bool]: if isinstance(value, str): + if _should_skip_anonymizing_string(key, value, parent_dict): + return value, False if should_skip_large_binary_string(value): - return value - if key in _PATH_KEYS: - return anonymizer.path(value) - if key in _CMD_KEYS: - redacted, _ = redact_text(value) - return anonymizer.text(redacted) - return anonymizer.text(value) + return value, False + anonymized = anonymizer.text(value) + return anonymized, anonymized != value + if isinstance(value, dict): - return {k: anonymize_value(k, v, anonymizer) for k, v in value.items()} + out: dict[str, Any] | None = None + for child_key, child_value in value.items(): + anonymized_child, changed = _anonymize_session_value(child_key, child_value, anonymizer, value) + if not changed: + continue + if out is None: + out = dict(value) + out[child_key] = anonymized_child + if out is None: + return value, False + return out, True + if isinstance(value, list): - return [anonymize_value(key, item, anonymizer) for item in value] - return value + out_list: list[Any] | None = None + for idx, item in enumerate(value): + anonymized_item, changed = _anonymize_session_value(key, item, anonymizer, parent_dict) + if not changed: + continue + if out_list is None: + out_list = list(value) + out_list[idx] = anonymized_item + if out_list is None: + return value, False + return out_list, True + + return value, False + + +def anonymize_session(session: dict[str, Any], anonymizer: Anonymizer) -> dict[str, Any]: + anonymized, _changed = _anonymize_session_value(None, session, anonymizer) + if isinstance(anonymized, dict): + return anonymized + return session -def parse_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonymizer) -> dict: - """Return a structured dict for a tool's input args, with paths/content anonymized.""" +def parse_tool_input(input_data: Any) -> dict: + """Return a structured dict for a tool's input args without anonymizing it yet.""" if not isinstance(input_data, dict): - return {"raw": anonymizer.text(str(input_data))} + return {"raw": str(input_data)} - return {k: anonymize_value(k, v, anonymizer) for k, v in input_data.items()} + return input_data def get_cached_index( diff --git a/dataclaw/parsers/cursor.py b/dataclaw/parsers/cursor.py index f70276f..189d390 100644 --- a/dataclaw/parsers/cursor.py +++ b/dataclaw/parsers/cursor.py @@ -7,7 +7,6 @@ from .. import _json as json from ..anonymizer import Anonymizer from ..export_tasks import ExportSessionTask -from ..secrets import redact_text from .common import ( build_prefixed_project_name, build_projects_from_index, @@ -285,7 +284,6 @@ def parse_session( metadata: dict[str, Any] = { "session_id": composer_id, - "cwd": None, "git_branch": None, "model": None, "start_time": None, @@ -303,14 +301,6 @@ def parse_session( if isinstance(timestamp, (int, float)): timestamp = normalize_timestamp(timestamp) - if metadata["cwd"] is None: - wuris = bubble.get("workspaceUris", []) - if wuris and isinstance(wuris, list) and wuris[0]: - uri = wuris[0] - if uri.startswith("file://"): - uri = uri[7:] - metadata["cwd"] = anonymizer.path(uri) - model_info = bubble.get("modelInfo") if isinstance(model_info, dict) and metadata["model"] is None: model_name = model_info.get("modelName") @@ -323,11 +313,10 @@ def parse_session( text = (bubble.get("text") or "").strip() if not text: continue - redacted, _ = redact_text(text) messages.append( { "role": "user", - "content": anonymizer.text(redacted), + "content": text, "timestamp": timestamp, } ) @@ -348,23 +337,16 @@ def parse_session( if isinstance(inner, dict): params_raw = inner - tool_input = parse_tool_input( - tool_name, - params_raw if isinstance(params_raw, dict) else {}, - anonymizer, - ) + tool_input = parse_tool_input(params_raw if isinstance(params_raw, dict) else {}) result_raw = _try_parse_json(tfd.get("result")) tool_output: dict[str, Any] = {} if isinstance(result_raw, str) and result_raw.strip(): - redacted_out, _ = redact_text(result_raw) - tool_output = {"text": anonymizer.text(redacted_out)} + tool_output = {"text": result_raw} elif isinstance(result_raw, dict): - tool_output = { - k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in result_raw.items() - } + tool_output = {k: str(v) if isinstance(v, str) else v for k, v in result_raw.items()} elif result_raw is not None: - tool_output = {"text": anonymizer.text(str(result_raw))} + tool_output = {"text": str(result_raw)} status_val = tfd.get("status", "unknown") if isinstance(status_val, dict): @@ -389,12 +371,11 @@ def parse_session( if include_thinking and isinstance(thinking, dict): think_text = (thinking.get("text") or "").strip() if think_text: - msg["thinking"] = anonymizer.text(think_text) + msg["thinking"] = think_text text = (bubble.get("text") or "").strip() if text: - redacted, _ = redact_text(text) - msg["content"] = anonymizer.text(redacted) + msg["content"] = text messages.append(msg) stats["assistant_messages"] += 1 @@ -412,10 +393,9 @@ def parse_session( msg = {"role": "assistant", "timestamp": timestamp} if text: - redacted, _ = redact_text(text) - msg["content"] = anonymizer.text(redacted) + msg["content"] = text if think_text: - msg["thinking"] = anonymizer.text(think_text) + msg["thinking"] = think_text messages.append(msg) stats["assistant_messages"] += 1 @@ -429,4 +409,4 @@ def parse_session( if metadata["model"] is None: metadata["model"] = "cursor-unknown" - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) diff --git a/dataclaw/parsers/custom.py b/dataclaw/parsers/custom.py index 280811b..f9a4ee4 100644 --- a/dataclaw/parsers/custom.py +++ b/dataclaw/parsers/custom.py @@ -5,7 +5,7 @@ from .. import _json as json from ..anonymizer import Anonymizer from ..export_tasks import ExportSessionTask -from ..secrets import redact_text +from .common import anonymize_session logger = logging.getLogger(__name__) @@ -154,8 +154,4 @@ def parse_session_bytes(project_dir_name: str, raw_line: bytes | str, anonymizer session["project"] = f"custom:{project_dir_name}" session["source"] = SOURCE - for msg in session.get("messages", []): - if "content" in msg and isinstance(msg["content"], str): - redacted, _ = redact_text(msg["content"]) - msg["content"] = anonymizer.text(redacted) - return session + return anonymize_session(session, anonymizer) diff --git a/dataclaw/parsers/gemini.py b/dataclaw/parsers/gemini.py index f0684d8..725961c 100644 --- a/dataclaw/parsers/gemini.py +++ b/dataclaw/parsers/gemini.py @@ -11,7 +11,6 @@ from ..export_tasks import ExportSessionTask from ..secrets import should_skip_large_binary_string from .common import ( - anonymize_value, collect_project_sessions, count_existing_paths_and_sizes, make_session_result, @@ -204,7 +203,7 @@ def parse_export_session_task( return parse_session_file(Path(task.file_path), anonymizer, include_thinking) -def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: +def parse_tool_call(tool_call: dict) -> dict: """Parse a Gemini tool call into a structured dict with input/output/status.""" name = tool_call.get("name") args = tool_call.get("args", {}) @@ -223,40 +222,40 @@ def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: extra_texts.append(item["text"]) if name == "read_file": - inp = {"file_path": anonymizer.path(args.get("file_path", ""))} + inp = {"file_path": args.get("file_path", "")} elif name == "write_file": inp = { - "file_path": anonymizer.path(args.get("file_path", "")), - "content": anonymizer.text(args.get("content", "")), + "file_path": args.get("file_path", ""), + "content": args.get("content", ""), } elif name == "replace": inp = { - "file_path": anonymizer.path(args.get("file_path", "")), - "old_string": anonymizer.text(args.get("old_string", "")), - "new_string": anonymizer.text(args.get("new_string", "")), + "file_path": args.get("file_path", ""), + "old_string": args.get("old_string", ""), + "new_string": args.get("new_string", ""), "expected_replacements": args.get("expected_replacements"), - "instruction": (anonymizer.text(args.get("instruction", "")) if args.get("instruction") else None), + "instruction": (args.get("instruction", "") if args.get("instruction") else None), } inp = {k: v for k, v in inp.items() if v is not None} elif name == "run_shell_command": - inp = {"command": anonymizer.text(args.get("command", ""))} + inp = {"command": args.get("command", "")} elif name == "read_many_files": - inp = {"paths": [anonymizer.path(path) for path in args.get("paths", [])]} + inp = {"paths": list(args.get("paths", []))} elif name in ("search_file_content", "grep_search"): - inp = {k: anonymizer.text(str(v)) for k, v in args.items()} + inp = {k: str(v) for k, v in args.items()} elif name == "list_directory": - inp = {"dir_path": anonymizer.path(args.get("dir_path", ""))} + inp = {"dir_path": args.get("dir_path", "")} if args.get("ignore"): if isinstance(args["ignore"], list): - inp["ignore"] = [anonymizer.text(str(path)) for path in args["ignore"]] + inp["ignore"] = [str(path) for path in args["ignore"]] else: - inp["ignore"] = anonymizer.text(str(args["ignore"])) + inp["ignore"] = str(args["ignore"]) elif name == "glob": inp = {"pattern": args.get("pattern", "")} elif name in ("google_web_search", "web_fetch", "codebase_investigator"): - inp = {k: anonymizer.text(str(v)) for k, v in args.items()} + inp = {k: str(v) for k, v in args.items()} else: - inp = {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in args.items()} + inp = {k: str(v) if isinstance(v, str) else v for k, v in args.items()} if name == "read_many_files": files: list[dict] = [] @@ -269,8 +268,8 @@ def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: if current_path is not None: files.append( { - "path": anonymizer.path(current_path), - "content": anonymizer.text("\n".join(content_lines).strip()), + "path": current_path, + "content": "\n".join(content_lines).strip(), } ) current_path = line[4:-4].strip() @@ -280,8 +279,8 @@ def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: if current_path is not None: files.append( { - "path": anonymizer.path(current_path), - "content": anonymizer.text("\n".join(content_lines).strip()), + "path": current_path, + "content": "\n".join(content_lines).strip(), } ) out: dict[str, Any] = {"files": files} @@ -311,29 +310,17 @@ def parse_tool_call(tool_call: dict, anonymizer: Anonymizer) -> dict: try: parsed["exit_code"] = int(parsed["exit_code"]) except ValueError: - parsed["exit_code"] = anonymizer.text(parsed["exit_code"]) - if "command" in parsed: - parsed["command"] = anonymizer.text(parsed["command"]) - if "directory" in parsed: - parsed["directory"] = anonymizer.path(parsed["directory"]) - if "output" in parsed: - parsed["output"] = anonymizer.text(parsed["output"]) + pass out = parsed elif output_text is not None: - out = {"text": anonymizer.text(output_text)} + out = {"text": output_text} else: out = {} return {"tool": name, "input": inp, "output": out, "status": status} -def anonymize_text_preserving_blobs( - text: Any, - anonymizer: Anonymizer, - *, - strip: bool = False, - drop_empty: bool = True, -) -> str | None: +def normalize_text_preserving_blobs(text: Any, *, strip: bool = False, drop_empty: bool = True) -> str | None: if not isinstance(text, str): return None if should_skip_large_binary_string(text): @@ -341,7 +328,7 @@ def anonymize_text_preserving_blobs( normalized = text.strip() if strip else text if drop_empty and not normalized.strip(): return None - return anonymizer.text(normalized) + return normalized def build_gemini_call_id(name: str, args: Any, counters: dict[str, int]) -> str: @@ -349,22 +336,19 @@ def build_gemini_call_id(name: str, args: Any, counters: dict[str, int]) -> str: return f"fc_{name}_{counters[name]}" -def anonymize_file_uri(file_uri: Any, anonymizer: Anonymizer) -> str | None: +def normalize_file_uri(file_uri: Any) -> str | None: if not isinstance(file_uri, str): return None - if file_uri.startswith("file://"): - return f"file://{anonymizer.path(file_uri[7:])}" - return anonymizer.text(file_uri) + return file_uri def parse_gemini_user_part( part: Any, - anonymizer: Anonymizer, pending_call_ids: dict[str, deque[str]], call_counters: dict[str, int], ) -> tuple[str | None, dict[str, Any] | None]: if isinstance(part, str): - text = anonymize_text_preserving_blobs(part, anonymizer, drop_empty=False) + text = normalize_text_preserving_blobs(part, drop_empty=False) if text is None: return None, None if should_skip_large_binary_string(part): @@ -375,7 +359,7 @@ def parse_gemini_user_part( return None, None if "text" in part: - text = anonymize_text_preserving_blobs(part.get("text"), anonymizer, drop_empty=False) + text = normalize_text_preserving_blobs(part.get("text"), drop_empty=False) if text is None: return None, None if should_skip_large_binary_string(part.get("text", "")): @@ -397,7 +381,7 @@ def parse_gemini_user_part( file_data = part.get("fileData") if isinstance(file_data, dict): source: dict[str, Any] = {"type": "url"} - url = anonymize_file_uri(file_data.get("fileUri"), anonymizer) + url = normalize_file_uri(file_data.get("fileUri")) if url: source["url"] = url mime_type = file_data.get("mimeType") @@ -415,7 +399,7 @@ def parse_gemini_user_part( "type": "tool_use", "id": call_id, "name": name, - "input": parse_tool_input(name, args, anonymizer), + "input": parse_tool_input(args), } function_response = part.get("functionResponse") @@ -427,9 +411,9 @@ def parse_gemini_user_part( response = function_response.get("response") content: Any = None if isinstance(response, dict) and "output" in response: - content = anonymize_text_preserving_blobs(response.get("output"), anonymizer) + content = normalize_text_preserving_blobs(response.get("output")) elif response is not None: - content = anonymize_value("response", response, anonymizer) + content = response part_result: dict[str, Any] = {"type": "tool_result", "tool_use_id": tool_use_id} if content not in (None, "", [], {}): part_result["content"] = content @@ -438,9 +422,9 @@ def parse_gemini_user_part( return None, None -def parse_gemini_user_content(content: Any, anonymizer: Anonymizer) -> tuple[str | None, list[dict[str, Any]]]: +def parse_gemini_user_content(content: Any) -> tuple[str | None, list[dict[str, Any]]]: if isinstance(content, str): - text = anonymize_text_preserving_blobs(content, anonymizer, drop_empty=False) + text = normalize_text_preserving_blobs(content, drop_empty=False) if text is None: return None, [] if should_skip_large_binary_string(content): @@ -456,7 +440,7 @@ def parse_gemini_user_content(content: Any, anonymizer: Anonymizer) -> tuple[str call_counters: dict[str, int] = defaultdict(int) for part in content: - text, content_part = parse_gemini_user_part(part, anonymizer, pending_call_ids, call_counters) + text, content_part = parse_gemini_user_part(part, pending_call_ids, call_counters) if text is not None: text_parts.append(text) if content_part: @@ -484,7 +468,6 @@ def parse_session_file( messages = [] metadata = { "session_id": data.get("sessionId", filepath.stem), - "cwd": None, "git_branch": None, "model": None, "start_time": data.get("startTime"), @@ -497,7 +480,7 @@ def parse_session_file( timestamp = msg_data.get("timestamp") if msg_type == "user": - text, content_parts = parse_gemini_user_content(msg_data.get("content"), anonymizer) + text, content_parts = parse_gemini_user_content(msg_data.get("content")) if text is None and not content_parts: continue message: dict[str, Any] = {"role": "user", "timestamp": timestamp} @@ -524,7 +507,7 @@ def parse_session_file( content = msg_data.get("content") if isinstance(content, str) and content.strip(): - msg["content"] = anonymizer.text(content.strip()) + msg["content"] = content.strip() if include_thinking: thoughts = msg_data.get("thoughts", []) @@ -534,11 +517,11 @@ def parse_session_file( if "description" in thought and isinstance(thought["description"], str): thought_texts.append(thought["description"].strip()) if thought_texts: - msg["thinking"] = anonymizer.text("\n\n".join(thought_texts)) + msg["thinking"] = "\n\n".join(thought_texts) tool_uses = [] for tool_call in msg_data.get("toolCalls", []): - tool_uses.append(parse_tool_call(tool_call, anonymizer)) + tool_uses.append(parse_tool_call(tool_call)) if tool_uses: msg["tool_uses"] = tool_uses @@ -549,4 +532,4 @@ def parse_session_file( stats["assistant_messages"] += 1 update_time_bounds(metadata, timestamp) - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) diff --git a/dataclaw/parsers/kimi.py b/dataclaw/parsers/kimi.py index 165d154..b426637 100644 --- a/dataclaw/parsers/kimi.py +++ b/dataclaw/parsers/kimi.py @@ -185,7 +185,6 @@ def parse_session_file( messages: list[dict[str, Any]] = [] metadata: dict[str, Any] = { "session_id": filepath.parent.name, - "cwd": None, "git_branch": None, "model": None, "start_time": None, @@ -203,7 +202,7 @@ def parse_session_file( messages.append( { "role": "user", - "content": anonymizer.text(content.strip()), + "content": content.strip(), "timestamp": None, } ) @@ -224,11 +223,11 @@ def parse_session_file( if block_type == "text": text = block.get("text", "").strip() if text: - text_parts.append(anonymizer.text(text)) + text_parts.append(text) elif block_type == "think" and include_thinking: think = block.get("think", "").strip() if think: - thinking_parts.append(anonymizer.text(think)) + thinking_parts.append(think) if text_parts: msg["content"] = "\n\n".join(text_parts) @@ -257,7 +256,7 @@ def parse_session_file( tool_uses.append( { "tool": tool_name, - "input": parse_tool_input(tool_name, args, anonymizer), + "input": parse_tool_input(args), } ) @@ -278,4 +277,4 @@ def parse_session_file( logger.warning("Failed to read Kimi session file %s: %s", filepath, e) return None - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) diff --git a/dataclaw/parsers/openclaw.py b/dataclaw/parsers/openclaw.py index ddc7b0d..6be2301 100644 --- a/dataclaw/parsers/openclaw.py +++ b/dataclaw/parsers/openclaw.py @@ -158,15 +158,11 @@ def parse_session_file( metadata: dict[str, Any] = { "session_id": header.get("id", filepath.stem), - "cwd": None, "git_branch": None, "model": None, "start_time": header.get("timestamp"), "end_time": None, } - cwd = header.get("cwd") - if isinstance(cwd, str) and cwd.strip(): - metadata["cwd"] = anonymizer.path(cwd) messages: list[dict[str, Any]] = [] stats = make_stats() @@ -212,7 +208,7 @@ def parse_session_file( messages.append( { "role": "user", - "content": anonymizer.text(text.strip()), + "content": text.strip(), "timestamp": effective_ts, } ) @@ -246,19 +242,19 @@ def parse_session_file( if block_type == "text": text = block.get("text", "") if isinstance(text, str) and text.strip(): - text_parts.append(anonymizer.text(text.strip())) + text_parts.append(text.strip()) elif block_type == "thinking" and include_thinking: thinking = block.get("thinking", "") if isinstance(thinking, str) and thinking.strip(): - thinking_parts.append(anonymizer.text(thinking.strip())) + thinking_parts.append(thinking.strip()) elif block_type == "toolCall": tool_name = block.get("name") args = block.get("arguments", {}) tool_entry: dict[str, Any] = { "tool": tool_name, - "input": parse_tool_input(tool_name, args, anonymizer), + "input": parse_tool_input(args), } tool_call_id = block.get("id") if isinstance(tool_call_id, str) and tool_call_id: @@ -291,7 +287,7 @@ def parse_session_file( tool_call_id = msg_data.get("toolCallId") if not isinstance(tool_call_id, str) or not tool_call_id: continue - result = _build_openclaw_tool_result(msg_data, anonymizer) + result = _build_openclaw_tool_result(msg_data) matched_tool_uses = pending_tool_uses.pop(tool_call_id, []) if matched_tool_uses: for tool_use in matched_tool_uses: @@ -306,11 +302,11 @@ def parse_session_file( is_error = exit_code is not None and exit_code != 0 tool_entry: dict[str, Any] = { "tool": "bash", - "input": {"command": anonymizer.text(command)} if command else {}, + "input": {"command": command} if command else {}, } out_dict: dict[str, Any] = {} if output: - out_dict["text"] = anonymizer.text(output.strip()) + out_dict["text"] = output.strip() if exit_code is not None: out_dict["exit_code"] = exit_code if out_dict: @@ -333,10 +329,10 @@ def parse_session_file( if metadata["model"] is None: metadata["model"] = "openclaw-unknown" - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) -def _build_openclaw_tool_result(msg_data: dict[str, Any], anonymizer: Anonymizer) -> dict[str, Any]: +def _build_openclaw_tool_result(msg_data: dict[str, Any]) -> dict[str, Any]: is_error = bool(msg_data.get("isError")) content = msg_data.get("content", []) if isinstance(content, list): @@ -349,7 +345,7 @@ def _build_openclaw_tool_result(msg_data: dict[str, Any], anonymizer: Anonymizer else: output_text = "" return { - "output": {"text": anonymizer.text(output_text)} if output_text else {}, + "output": {"text": output_text} if output_text else {}, "status": "error" if is_error else "success", } diff --git a/dataclaw/parsers/opencode.py b/dataclaw/parsers/opencode.py index a6ba5ce..0569a51 100644 --- a/dataclaw/parsers/opencode.py +++ b/dataclaw/parsers/opencode.py @@ -209,7 +209,6 @@ def _parse_session_with_connection( messages: list[dict[str, Any]] = [] metadata: dict[str, Any] = { "session_id": session_id, - "cwd": None, "git_branch": None, "model": None, "start_time": None, @@ -229,7 +228,6 @@ def _parse_session_with_connection( if isinstance(raw_cwd, str) and raw_cwd.strip(): if raw_cwd != target_cwd: return None - metadata["cwd"] = anonymizer.path(raw_cwd) elif target_cwd != UNKNOWN_OPENCODE_CWD: return None @@ -253,14 +251,14 @@ def _parse_session_with_connection( parts = iter_message_parts(conn, message_row["id"]) if role == "user": - msg = extract_user_message(parts, anonymizer) + msg = extract_user_message(parts) if msg is not None: msg["timestamp"] = timestamp messages.append(msg) stats["user_messages"] += 1 update_time_bounds(metadata, timestamp) elif role == "assistant": - msg = extract_assistant_content(parts, anonymizer, include_thinking) + msg = extract_assistant_content(parts, include_thinking) if msg: msg["timestamp"] = timestamp messages.append(msg) @@ -282,7 +280,7 @@ def _parse_session_with_connection( if metadata["model"] is None: metadata["model"] = "opencode-unknown" - return make_session_result(metadata, messages, stats) + return make_session_result(metadata, messages, stats, anonymizer=anonymizer) def extract_model(message_data: dict[str, Any]) -> str | None: @@ -307,7 +305,7 @@ def iter_message_parts(conn: sqlite3.Connection, message_id: str) -> Iterator[di yield load_json_field(part_row["data"]) -def build_opencode_file_source(url: Any, mime: Any, anonymizer: Anonymizer) -> dict[str, Any] | None: +def build_opencode_file_source(url: Any, mime: Any) -> dict[str, Any] | None: if not isinstance(url, str) or not url: return None @@ -323,18 +321,18 @@ def build_opencode_file_source(url: Any, mime: Any, anonymizer: Anonymizer) -> d if url.startswith("file://"): source: dict[str, Any] = { "type": "url", - "url": f"file://{anonymizer.path(url[7:])}", + "url": url, } else: - source = {"type": "url", "url": anonymizer.text(url)} + source = {"type": "url", "url": url} if isinstance(mime, str) and mime: source["media_type"] = mime return source -def extract_opencode_file_part(part: dict[str, Any], anonymizer: Anonymizer) -> dict[str, Any] | None: - source = build_opencode_file_source(part.get("url"), part.get("mime"), anonymizer) +def extract_opencode_file_part(part: dict[str, Any]) -> dict[str, Any] | None: + source = build_opencode_file_source(part.get("url"), part.get("mime")) if source is None: return None @@ -344,7 +342,7 @@ def extract_opencode_file_part(part: dict[str, Any], anonymizer: Anonymizer) -> return {"type": "document", "source": source} -def extract_user_message(parts: Iterable[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, Any] | None: +def extract_user_message(parts: Iterable[dict[str, Any]]) -> dict[str, Any] | None: text_parts: list[str] = [] content_parts: list[dict[str, Any]] = [] for part in parts: @@ -354,9 +352,9 @@ def extract_user_message(parts: Iterable[dict[str, Any]], anonymizer: Anonymizer if part_type == "text": text = part.get("text") if isinstance(text, str) and text.strip(): - text_parts.append(anonymizer.text(text.strip())) + text_parts.append(text.strip()) elif part_type == "file": - content_part = extract_opencode_file_part(part, anonymizer) + content_part = extract_opencode_file_part(part) if content_part is not None: content_parts.append(content_part) @@ -373,7 +371,6 @@ def extract_user_message(parts: Iterable[dict[str, Any]], anonymizer: Anonymizer def extract_assistant_content( parts: Iterable[dict[str, Any]], - anonymizer: Anonymizer, include_thinking: bool, ) -> dict[str, Any] | None: text_parts: list[str] = [] @@ -388,18 +385,18 @@ def extract_assistant_content( if part_type == "text": text = part.get("text") if isinstance(text, str) and text.strip(): - text_parts.append(anonymizer.text(text.strip())) + text_parts.append(text.strip()) elif part_type == "reasoning" and include_thinking: text = part.get("text") if isinstance(text, str) and text.strip(): - thinking_parts.append(anonymizer.text(text.strip())) + thinking_parts.append(text.strip()) elif part_type == "tool": tool_name = part.get("tool") state = part.get("state", {}) tool_input = state.get("input", {}) if isinstance(state, dict) else {} tool_use: dict[str, Any] = { "tool": tool_name, - "input": parse_tool_input(tool_name, tool_input, anonymizer), + "input": parse_tool_input(tool_input), } if isinstance(state, dict): status = state.get("status") @@ -407,7 +404,7 @@ def extract_assistant_content( tool_use["status"] = "success" if status == "completed" else status output = state.get("output") if isinstance(output, str) and output: - tool_use["output"] = {"text": anonymizer.text(output)} + tool_use["output"] = {"text": output} elif output is not None: tool_use["output"] = {} tool_uses.append(tool_use) diff --git a/dataclaw/secrets.py b/dataclaw/secrets.py index 337783e..fb407f8 100644 --- a/dataclaw/secrets.py +++ b/dataclaw/secrets.py @@ -4,6 +4,8 @@ import re from typing import Any +import ahocorasick + REDACTED = "[REDACTED]" _GENERIC_SECRET_SUFFIXES = ( @@ -18,6 +20,33 @@ _GENERIC_SECRET_MARKERS = tuple( f"{suffix}{delimiter}" for suffix in _GENERIC_SECRET_SUFFIXES for delimiter in ("=", ":", '"', "'", " ") ) +_FAST_PATH_CASE_MARKERS = ( + "eyJ", + "sk-ant-", + "sk-", + "AIzaSy", + "gsk_", + "fm1_", + "fm2_", + "0x", + "hf_", + "ghp_", + "gho_", + "ghs_", + "ghr_", + "github_pat_", + "pypi-", + "npm_", + "AKIA", + "xox", + "discord", + "PRIVATE KEY", + "Bearer", + "密码", +) +_FAST_PATH_LOWER_MARKERS = tuple( + dict.fromkeys(("postgres", "secret_key", "aws_secret_access_key", "password", "passwd") + _GENERIC_SECRET_MARKERS) +) # Ordered from most specific to least specific SECRET_PATTERNS = [ @@ -143,6 +172,9 @@ _ANSI_ESCAPE_RE = re.compile(r"\x1b\[[0-9;?]*[ -/]*[@-~]") _BINARY_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0E-\x1F]") _WHITESPACE_RE = re.compile(r"\s+") +_DIGIT_RE = re.compile(r"\d") +_TELEGRAM_PREFIX_RE = re.compile(r"\b\d{8,10}:") +_IPV4_CANDIDATE_RE = re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b") def should_skip_large_binary_string(text: str) -> bool: @@ -170,6 +202,19 @@ def should_skip_large_binary_string(text: str) -> bool: return _BASE64_BLOB_RE.fullmatch(compact) is not None +def should_skip_structured_string_transform( + key: str | None, + value: str, + parent_dict: dict[str, Any] | None, +) -> bool: + """Return True for schema-marked binary/document payload strings we should not rewrite.""" + if key == "data" and isinstance(parent_dict, dict) and parent_dict.get("type") == "base64": + return True + if key == "url" and value.startswith("data:"): + return True + return False + + def contains_large_binary_value(value: Any) -> bool: if isinstance(value, str): return should_skip_large_binary_string(value) @@ -209,61 +254,102 @@ def _has_mixed_char_types(s: str) -> bool: return has_upper and has_lower and has_digit -def _contains_any(text: str, needles: tuple[str, ...]) -> bool: - return any(needle in text for needle in needles) +def _build_marker_automaton(markers: tuple[str, ...]): + automaton = ahocorasick.Automaton() + for marker in markers: + automaton.add_word(marker, marker) + automaton.make_automaton() + return automaton + + +_FAST_PATH_CASE_AUTOMATON = _build_marker_automaton(_FAST_PATH_CASE_MARKERS) +_FAST_PATH_LOWER_AUTOMATON = _build_marker_automaton(_FAST_PATH_LOWER_MARKERS) -def _contains_general_secret_marker(text: str, lower_text: str | None, markers: tuple[str, ...]) -> tuple[bool, str]: - if lower_text is None: - lower_text = text.lower() - return _contains_any(lower_text, markers), lower_text +class _FastPathState: + __slots__ = ("text", "_case_markers", "_lower_markers", "_has_digit", "_has_telegram_prefix", "_has_ipv4_candidate") + def __init__(self, text: str): + self.text = text + self._case_markers: set[str] | None = None + self._lower_markers: set[str] | None = None + self._has_digit: bool | None = None + self._has_telegram_prefix: bool | None = None + self._has_ipv4_candidate: bool | None = None -def _pattern_may_match(name: str, text: str, lower_text: str | None) -> tuple[bool, str | None]: + def case_markers(self) -> set[str]: + if self._case_markers is None: + self._case_markers = {marker for _, marker in _FAST_PATH_CASE_AUTOMATON.iter(self.text)} + return self._case_markers + + def lower_markers(self) -> set[str]: + if self._lower_markers is None: + self._lower_markers = {marker for _, marker in _FAST_PATH_LOWER_AUTOMATON.iter(self.text.lower())} + return self._lower_markers + + def has_digit(self) -> bool: + if self._has_digit is None: + self._has_digit = _DIGIT_RE.search(self.text) is not None + return self._has_digit + + def has_telegram_prefix(self) -> bool: + if self._has_telegram_prefix is None: + self._has_telegram_prefix = _TELEGRAM_PREFIX_RE.search(self.text) is not None + return self._has_telegram_prefix + + def has_ipv4_candidate(self) -> bool: + if self._has_ipv4_candidate is None: + self._has_ipv4_candidate = _IPV4_CANDIDATE_RE.search(self.text) is not None + return self._has_ipv4_candidate + + +def _has_any_marker(found_markers: set[str], needles: tuple[str, ...]) -> bool: + return any(needle in found_markers for needle in needles) + + +def _pattern_may_match(name: str, state: _FastPathState) -> bool: + text = state.text if name in ("jwt", "jwt_partial"): - return "eyJ" in text, lower_text + return "eyJ" in state.case_markers() if name == "db_url": - if lower_text is None: - lower_text = text.lower() - return "postgres" in lower_text, lower_text + return "postgres" in state.lower_markers() if name == "anthropic_key": - return "sk-ant-" in text, lower_text + return "sk-ant-" in state.case_markers() if name == "openai_key": - return "sk-" in text, lower_text + return "sk-" in state.case_markers() if name == "google_api_key": - return "AIzaSy" in text, lower_text + return "AIzaSy" in state.case_markers() if name == "groq_key": - return "gsk_" in text, lower_text + return "gsk_" in state.case_markers() if name == "telegram_token": - return ":" in text, lower_text + return len(text) >= 44 and ":" in text and state.has_telegram_prefix() if name == "flyio_token": - return "fm1_" in text or "fm2_" in text, lower_text + case_markers = state.case_markers() + return "fm1_" in case_markers or "fm2_" in case_markers if name == "eth_private_key": - return "0x" in text, lower_text + return "0x" in state.case_markers() if name == "hf_token": - return "hf_" in text, lower_text + return "hf_" in state.case_markers() if name == "github_token": - return _contains_any(text, ("ghp_", "gho_", "ghs_", "ghr_")), lower_text + return _has_any_marker(state.case_markers(), ("ghp_", "gho_", "ghs_", "ghr_")) if name == "github_pat_token": - return "github_pat_" in text, lower_text + return "github_pat_" in state.case_markers() if name == "pypi_token": - return "pypi-" in text, lower_text + return "pypi-" in state.case_markers() if name == "npm_token": - return "npm_" in text, lower_text + return "npm_" in state.case_markers() if name == "aws_key": - return "AKIA" in text, lower_text + return "AKIA" in state.case_markers() if name == "aws_secret": if "=" not in text and ":" not in text: - return False, lower_text - if lower_text is None: - lower_text = text.lower() - return _contains_any(lower_text, ("secret_key", "aws_secret_access_key")), lower_text + return False + return _has_any_marker(state.lower_markers(), ("secret_key", "aws_secret_access_key")) if name == "slack_token": - return "xox" in text, lower_text + return "xox" in state.case_markers() if name == "discord_webhook": - return "discord" in text, lower_text + return "discord" in state.case_markers() and "/api/webhooks/" in text if name == "private_key": - return "PRIVATE KEY" in text, lower_text + return "PRIVATE KEY" in state.case_markers() and "-----BEGIN" in text and "-----END" in text if name == "generic_secret": if ( "-" not in text @@ -273,21 +359,21 @@ def _pattern_may_match(name: str, text: str, lower_text: str | None) -> tuple[bo and "&" not in text and " " not in text ): - return False, lower_text - return _contains_general_secret_marker(text, lower_text, _GENERIC_SECRET_MARKERS) + return False + return _has_any_marker(state.lower_markers(), _GENERIC_SECRET_MARKERS) if name == "bearer": - return "Bearer" in text, lower_text + return "Bearer" in state.case_markers() and len(text) >= 27 if name == "ip_address": - return "." in text, lower_text + return "." in text and state.has_ipv4_candidate() if name == "password_value": - if lower_text is None: - lower_text = text.lower() - return "password" in lower_text or "passwd" in lower_text or "密码" in text, lower_text + case_markers = state.case_markers() + lower_markers = state.lower_markers() + return "password" in lower_markers or "passwd" in lower_markers or "密码" in case_markers if name == "email": - return "@" in text, lower_text + return "@" in text and "." in text if name == "high_entropy": - return '"' in text or "'" in text, lower_text - return True, lower_text + return ('"' in text or "'" in text) and state.has_digit() + return True def scan_text(text: str) -> list[dict]: @@ -295,10 +381,9 @@ def scan_text(text: str) -> list[dict]: return [] findings = [] - lower_text: str | None = None + fast_path_state = _FastPathState(text) for name, pattern in SECRET_PATTERNS: - may_match, lower_text = _pattern_may_match(name, text, lower_text) - if not may_match: + if not _pattern_may_match(name, fast_path_state): continue for match in pattern.finditer(text): matched_text = match.group(0) @@ -374,9 +459,16 @@ def redact_custom_strings(text: str, strings: list[str]) -> tuple[str, int]: return text, count -def _redact_value(value: Any, custom_strings: list[str] | None = None) -> tuple[Any, int]: +def _redact_value( + value: Any, + custom_strings: list[str] | None = None, + key: str | None = None, + parent_dict: dict[str, Any] | None = None, +) -> tuple[Any, int]: """Recursively redact secrets from a string, list, or dict value.""" if isinstance(value, str): + if should_skip_structured_string_transform(key, value, parent_dict): + return value, 0 if should_skip_large_binary_string(value): return value, 0 result, count = redact_text(value) @@ -388,7 +480,7 @@ def _redact_value(value: Any, custom_strings: list[str] | None = None) -> tuple[ total = 0 out: dict[Any, Any] | None = None for k, v in value.items(): - redacted, n = _redact_value(v, custom_strings) + redacted, n = _redact_value(v, custom_strings, k, value) total += n if out is None: if n == 0 and redacted is v: @@ -402,7 +494,7 @@ def _redact_value(value: Any, custom_strings: list[str] | None = None) -> tuple[ total = 0 out_list: list[Any] | None = None for idx, item in enumerate(value): - redacted, n = _redact_value(item, custom_strings) + redacted, n = _redact_value(item, custom_strings, key, parent_dict) total += n if out_list is None: if n == 0 and redacted is item: diff --git a/pyproject.toml b/pyproject.toml index d47d5a9..968e9a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ classifiers = [ dependencies = [ "huggingface_hub>=0.35.0", "orjson>=3.0.0", + "pyahocorasick>=2.3.0", "PyYAML>=6.0.0", ] diff --git a/tests/test_parser_claude.py b/tests/test_parser_claude.py index 83361db..4a07bd3 100644 --- a/tests/test_parser_claude.py +++ b/tests/test_parser_claude.py @@ -69,7 +69,7 @@ def test_hyphens_preserved_in_project_name(self): class TestExtractUserContent: def test_string_content(self, mock_anonymizer): entry = {"message": {"content": "Fix the bug"}} - result = extract_user_content(entry, mock_anonymizer) + result = extract_user_content(entry) assert result == "Fix the bug" def test_list_content(self, mock_anonymizer): @@ -81,21 +81,21 @@ def test_list_content(self, mock_anonymizer): ] } } - result = extract_user_content(entry, mock_anonymizer) + result = extract_user_content(entry) assert "Hello" in result assert "World" in result def test_empty_content(self, mock_anonymizer): entry = {"message": {"content": ""}} - assert extract_user_content(entry, mock_anonymizer) is None + assert extract_user_content(entry) is None def test_whitespace_content(self, mock_anonymizer): entry = {"message": {"content": " \n "}} - assert extract_user_content(entry, mock_anonymizer) is None + assert extract_user_content(entry) is None def test_missing_message(self, mock_anonymizer): entry = {} - assert extract_user_content(entry, mock_anonymizer) is None + assert extract_user_content(entry) is None class TestExtractAssistantContent: @@ -108,7 +108,7 @@ def test_text_blocks(self, mock_anonymizer): ] } } - result = extract_assistant_content(entry, mock_anonymizer, True) + result = extract_assistant_content(entry, True) assert result is not None assert result["role"] == "assistant" assert "Part 1" in result["content"] @@ -123,7 +123,7 @@ def test_thinking_included(self, mock_anonymizer): ] } } - result = extract_assistant_content(entry, mock_anonymizer, True) + result = extract_assistant_content(entry, True) assert result is not None assert "thinking" in result assert "Need to inspect files." in result["thinking"] @@ -137,7 +137,7 @@ def test_thinking_excluded(self, mock_anonymizer): ] } } - result = extract_assistant_content(entry, mock_anonymizer, False) + result = extract_assistant_content(entry, False) assert result is not None assert "thinking" not in result assert result["content"] == "Visible." @@ -155,7 +155,7 @@ def test_tool_use_parsed(self, mock_anonymizer): ] } } - result = extract_assistant_content(entry, mock_anonymizer, True) + result = extract_assistant_content(entry, True) assert result is not None assert len(result["tool_uses"]) == 1 assert result["tool_uses"][0]["tool"] == "Read" @@ -174,10 +174,7 @@ def test_tool_use_with_result_map(self, mock_anonymizer): } } result = extract_assistant_content( - entry, - mock_anonymizer, - True, - {"tool-1": {"output": {"text": "file.txt"}, "status": "success"}}, + entry, True, {"tool-1": {"output": {"text": "file.txt"}, "status": "success"}} ) assert result is not None tool_use = result["tool_uses"][0] @@ -186,11 +183,11 @@ def test_tool_use_with_result_map(self, mock_anonymizer): def test_empty_blocks_returns_none(self, mock_anonymizer): entry = {"message": {"content": []}} - assert extract_assistant_content(entry, mock_anonymizer, True) is None + assert extract_assistant_content(entry, True) is None def test_non_list_content_returns_none(self, mock_anonymizer): entry = {"message": {"content": "not-a-list"}} - assert extract_assistant_content(entry, mock_anonymizer, True) is None + assert extract_assistant_content(entry, True) is None def test_ignores_non_dict_blocks(self, mock_anonymizer): entry = { @@ -201,7 +198,7 @@ def test_ignores_non_dict_blocks(self, mock_anonymizer): ] } } - result = extract_assistant_content(entry, mock_anonymizer, True) + result = extract_assistant_content(entry, True) assert result is not None assert result["content"] == "Valid." @@ -211,7 +208,6 @@ def _run(self, entry, anonymizer, include_thinking=True): messages = [] metadata = { "session_id": "test", - "cwd": None, "git_branch": None, "claude_version": None, "model": None, @@ -225,7 +221,7 @@ def _run(self, entry, anonymizer, include_thinking=True): "input_tokens": 0, "output_tokens": 0, } - process_entry(entry, messages, metadata, stats, anonymizer, include_thinking) + process_entry(entry, messages, metadata, stats, include_thinking) return messages, metadata, stats def test_user_entry(self, mock_anonymizer, sample_user_entry): @@ -250,7 +246,6 @@ def test_unknown_type(self, mock_anonymizer): def test_metadata_extraction(self, mock_anonymizer, sample_user_entry): _, metadata, _ = self._run(sample_user_entry, mock_anonymizer) - assert metadata["cwd"] is not None assert metadata["claude_version"] == "1.0.0" assert metadata["start_time"] is not None @@ -655,7 +650,7 @@ def test_basic_string_output(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "tu-1" in result assert result["tu-1"]["status"] == "success" assert result["tu-1"]["output"]["text"] == "file contents here" @@ -676,7 +671,7 @@ def test_error_result(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert result["tu-2"]["status"] == "error" def test_list_content(self, mock_anonymizer): @@ -697,7 +692,7 @@ def test_list_content(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "Part one" in result["tu-3"]["output"]["text"] assert "Part two" in result["tu-3"]["output"]["text"] @@ -708,7 +703,7 @@ def test_empty_content_gives_empty_output(self, mock_anonymizer): "message": {"content": [{"type": "tool_result", "tool_use_id": "tu-4", "content": ""}]}, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert result["tu-4"]["output"] == {} def test_structured_tool_result_keeps_extra_fields_without_dup_text(self, mock_anonymizer): @@ -734,7 +729,7 @@ def test_structured_tool_result_keeps_extra_fields_without_dup_text(self, mock_a }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-structured"]["output"] assert output["text"] == "command output" assert "stdout" not in output["raw"] @@ -767,12 +762,12 @@ def test_file_tool_result_omits_duplicate_file_content(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) raw = result["tu-file"]["output"]["raw"] assert raw["type"] == "text" assert raw["file"]["numLines"] == 2 assert "content" not in raw["file"] - assert "testuser" not in raw["file"]["filePath"] + assert raw["file"]["filePath"] == "/Users/testuser/Documents/myproject/out.txt" def test_file_tool_result_omits_duplicate_content_with_line_prefixes(self, mock_anonymizer): entries = [ @@ -797,7 +792,7 @@ def test_file_tool_result_omits_duplicate_content_with_line_prefixes(self, mock_ }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) raw = result["tu-file-numbered"]["output"]["raw"] assert raw["type"] == "text" assert "content" not in raw["file"] @@ -827,7 +822,7 @@ def test_file_tool_result_omits_duplicate_content_when_output_wraps_it(self, moc }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) raw = result["tu-file-wrapped"]["output"]["raw"] assert raw["type"] == "text" assert "content" not in raw["file"] @@ -853,7 +848,7 @@ def test_non_text_tool_result_blocks_preserved(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-image"]["output"] assert "text" not in output assert output["raw"]["content"][0]["type"] == "image" @@ -875,7 +870,7 @@ def test_large_string_blob_content_preserved_verbatim_in_raw(self, mock_anonymiz }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-blob"]["output"] assert "text" not in output assert output["raw"]["content"] == blob @@ -898,7 +893,7 @@ def test_large_string_tool_use_result_preserved_verbatim_in_raw(self, mock_anony "sourceToolAssistantUUID": "assistant-blob", } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-blob-result"]["output"] assert "text" not in output assert output["raw"]["content"] == blob @@ -925,7 +920,7 @@ def test_long_ansi_terminal_output_is_preserved_as_text(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-ansi"]["output"] assert output["text"].startswith("Exit code 1") assert "Successfully preprocessed" in output["text"] @@ -952,9 +947,9 @@ def test_edit_tool_result_preserves_raw_payload(self, mock_anonymizer): "sourceToolAssistantUUID": "assistant-123", } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) raw = result["tu-edit"]["output"]["raw"] - assert raw["filePath"] != "/Users/testuser/Documents/myproject/app.py" + assert raw["filePath"] == "/Users/testuser/Documents/myproject/app.py" assert "oldString" not in raw assert "newString" not in raw assert "structuredPatch" not in raw @@ -981,11 +976,11 @@ def test_create_tool_result_drops_duplicate_created_file_content(self, mock_anon "sourceToolAssistantUUID": "assistant-create", } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) output = result["tu-create"]["output"] assert output["text"].startswith("File created successfully at:") assert output["raw"]["type"] == "create" - assert output["raw"]["filePath"] != "/Users/testuser/Documents/myproject/out.txt" + assert output["raw"]["filePath"] == "/Users/testuser/Documents/myproject/out.txt" assert "content" not in output["raw"] assert output["raw"]["sourceToolAssistantUUID"] == "assistant-create" @@ -996,7 +991,7 @@ def test_non_user_entries_ignored(self, mock_anonymizer): "message": {"content": [{"type": "tool_result", "tool_use_id": "tu-5", "content": "ignored"}]}, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "tu-5" not in result def test_tool_output_attached_in_session(self, tmp_path, mock_anonymizer): diff --git a/tests/test_parser_codex.py b/tests/test_parser_codex.py index 4e6c5b5..7a91eac 100644 --- a/tests/test_parser_codex.py +++ b/tests/test_parser_codex.py @@ -416,7 +416,7 @@ def test_function_call_output(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "call-1" in result assert result["call-1"]["status"] == "success" assert result["call-1"]["output"]["exit_code"] == 0 @@ -439,7 +439,7 @@ def test_custom_tool_call_output(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "call-2" in result assert result["call-2"]["output"]["exit_code"] == 0 assert "Successfully applied patch" in result["call-2"]["output"]["output"] @@ -456,7 +456,7 @@ def test_non_response_item_ignored(self, mock_anonymizer): }, } ] - result = build_tool_result_map(entries, mock_anonymizer) + result = build_tool_result_map(entries) assert "call-3" not in result def test_output_attached_end_to_end(self, tmp_path, monkeypatch, mock_anonymizer): diff --git a/tests/test_parser_common.py b/tests/test_parser_common.py index ba43d66..0adb2ec 100644 --- a/tests/test_parser_common.py +++ b/tests/test_parser_common.py @@ -1,7 +1,8 @@ """Tests for shared parser helpers.""" from dataclaw import _json as json -from dataclaw.parsers.common import load_json_field, normalize_timestamp, parse_tool_input +from dataclaw.parsers.common import load_json_field, make_session_result, normalize_timestamp, parse_tool_input +from dataclaw.secrets import REDACTED class TestNormalizeTimestamp: @@ -30,116 +31,86 @@ def test_other_type_returns_none(self): class TestParseToolInput: def test_read_tool(self, mock_anonymizer): - result = parse_tool_input("Read", {"file_path": "/tmp/test.py"}, mock_anonymizer) + result = parse_tool_input({"file_path": "/tmp/test.py"}) assert isinstance(result, dict) assert "file_path" in result assert "test.py" in result["file_path"] def test_write_tool(self, mock_anonymizer): - result = parse_tool_input( - "Write", - {"file_path": "/tmp/test.py", "content": "abc"}, - mock_anonymizer, - ) + result = parse_tool_input({"file_path": "/tmp/test.py", "content": "abc"}) assert isinstance(result, dict) assert "file_path" in result assert "content" in result def test_bash_tool(self, mock_anonymizer): - result = parse_tool_input("Bash", {"command": "ls -la"}, mock_anonymizer) + result = parse_tool_input({"command": "ls -la"}) assert isinstance(result, dict) assert result["command"] == "ls -la" def test_grep_tool(self, mock_anonymizer): - result = parse_tool_input( - "Grep", - {"pattern": "TODO", "path": "/tmp"}, - mock_anonymizer, - ) + result = parse_tool_input({"pattern": "TODO", "path": "/tmp"}) assert isinstance(result, dict) assert "pattern" in result assert "path" in result def test_glob_tool(self, mock_anonymizer): - result = parse_tool_input( - "Glob", - {"pattern": "*.py", "path": "/tmp"}, - mock_anonymizer, - ) + result = parse_tool_input({"pattern": "*.py", "path": "/tmp"}) assert isinstance(result, dict) assert result["pattern"] == "*.py" def test_task_tool(self, mock_anonymizer): - result = parse_tool_input( - "Task", - {"prompt": "Search for bugs"}, - mock_anonymizer, - ) + result = parse_tool_input({"prompt": "Search for bugs"}) assert isinstance(result, dict) assert "Search for bugs" in result["prompt"] def test_websearch_tool(self, mock_anonymizer): - result = parse_tool_input( - "WebSearch", - {"query": "python async"}, - mock_anonymizer, - ) + result = parse_tool_input({"query": "python async"}) assert isinstance(result, dict) assert result["query"] == "python async" def test_webfetch_tool(self, mock_anonymizer): - result = parse_tool_input( - "WebFetch", - {"url": "https://example.com"}, - mock_anonymizer, - ) + result = parse_tool_input({"url": "https://example.com"}) assert isinstance(result, dict) assert result["url"] == "https://example.com" def test_edit_tool(self, mock_anonymizer): - result = parse_tool_input( - "Edit", - {"file_path": "/tmp/test.py"}, - mock_anonymizer, - ) + result = parse_tool_input({"file_path": "/tmp/test.py"}) assert isinstance(result, dict) assert "file_path" in result def test_exec_command_tool(self, mock_anonymizer): - result = parse_tool_input("exec_command", {"cmd": "ls -la"}, mock_anonymizer) + result = parse_tool_input({"cmd": "ls -la"}) assert isinstance(result, dict) assert result["cmd"] == "ls -la" def test_shell_command_tool(self, mock_anonymizer): - result = parse_tool_input( - "shell_command", - {"command": "ls", "workdir": "/tmp"}, - mock_anonymizer, - ) + result = parse_tool_input({"command": "ls", "workdir": "/tmp"}) assert isinstance(result, dict) assert result["command"] == "ls" assert "workdir" in result + def test_command_field_is_not_pre_redacted(self, mock_anonymizer): + secret = "sk-ant-abcdefghijklmnopqrstuvwxyz123456" + result = parse_tool_input({"command": f"export ANTHROPIC_API_KEY={secret}"}) + assert secret in result["command"] + assert REDACTED not in result["command"] + def test_update_plan_tool(self, mock_anonymizer): - result = parse_tool_input( - "update_plan", - {"explanation": "New plan", "plan": [{"step": "do it", "status": "pending"}]}, - mock_anonymizer, - ) + result = parse_tool_input({"explanation": "New plan", "plan": [{"step": "do it", "status": "pending"}]}) assert isinstance(result, dict) assert "explanation" in result assert "plan" in result def test_unknown_tool(self, mock_anonymizer): - result = parse_tool_input("CustomTool", {"foo": "bar"}, mock_anonymizer) + result = parse_tool_input({"foo": "bar"}) assert isinstance(result, dict) def test_none_tool_name(self, mock_anonymizer): - result = parse_tool_input(None, {"data": "value"}, mock_anonymizer) + result = parse_tool_input({"data": "value"}) assert isinstance(result, dict) def test_non_dict_input(self, mock_anonymizer): - result = parse_tool_input("Read", "just a string", mock_anonymizer) + result = parse_tool_input("just a string") assert isinstance(result, dict) assert "raw" in result @@ -155,3 +126,38 @@ def test_surrogate_escapes_are_sanitized_for_export(self): "nested": [r"\xe1"], } assert json.dumps(result) + + +class TestMakeSessionResult: + def test_centralized_anonymization_skips_base64_data(self, mock_anonymizer): + session = make_session_result( + { + "session_id": "s1", + "model": "m", + "git_branch": None, + "start_time": None, + "end_time": None, + }, + [ + { + "role": "user", + "content": "hello testuser at /Users/testuser/project", + "content_parts": [ + { + "type": "document", + "source": { + "type": "base64", + "media_type": "text/plain", + "data": "testuserbase64payload", + }, + } + ], + } + ], + {"user_messages": 1, "assistant_messages": 0, "tool_uses": 0, "input_tokens": 0, "output_tokens": 0}, + anonymizer=mock_anonymizer, + ) + + assert session is not None + assert "testuser" not in session["messages"][0]["content"] + assert session["messages"][0]["content_parts"][0]["source"]["data"] == "testuserbase64payload" diff --git a/tests/test_parser_cursor.py b/tests/test_parser_cursor.py index fc5c2d2..b463742 100644 --- a/tests/test_parser_cursor.py +++ b/tests/test_parser_cursor.py @@ -4,6 +4,7 @@ from dataclaw import _json as json from dataclaw.parser import discover_projects, parse_project_sessions +from dataclaw.secrets import REDACTED from tests.parser_helpers import disable_other_providers, insert_cursor_conversation, write_cursor_db @@ -384,3 +385,37 @@ def test_nested_json_params_unwrapped(self, tmp_path, monkeypatch, mock_anonymiz monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") assert "file_path" in sessions[0]["messages"][1]["tool_uses"][0]["input"] + + def test_parser_does_not_pre_redact_message_content(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"cursor"}) + cwd = "/Users/testuser/work/myapp" + db_path = tmp_path / "state.vscdb" + conn = write_cursor_db(db_path) + secret = "sk-ant-abcdefghijklmnopqrstuvwxyz123456" + insert_cursor_conversation( + conn, + "conv-secret", + [ + { + "id": "b1", + "type": 1, + "text": f"Use this key: {secret}", + "createdAt": 1706000000000, + "workspaceUris": [f"file://{cwd}"], + }, + { + "id": "b2", + "type": 2, + "text": "Noted.", + "createdAt": 1706000001000, + }, + ], + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.cursor.CURSOR_DB", db_path) + sessions = parse_project_sessions(cwd, mock_anonymizer, source="cursor") + content = sessions[0]["messages"][0]["content"] + assert secret in content + assert REDACTED not in content diff --git a/tests/test_parser_custom.py b/tests/test_parser_custom.py index 1e4b657..eb37a52 100644 --- a/tests/test_parser_custom.py +++ b/tests/test_parser_custom.py @@ -2,6 +2,7 @@ from dataclaw import _json as json from dataclaw.parser import discover_projects, parse_project_sessions +from dataclaw.secrets import REDACTED from tests.parser_helpers import disable_other_providers @@ -107,3 +108,15 @@ def test_parse_nonexistent_project(self, tmp_path, monkeypatch, mock_anonymizer) custom_dir.mkdir(parents=True) monkeypatch.setattr("dataclaw.parsers.custom.CUSTOM_DIR", custom_dir) assert parse_project_sessions("nope", mock_anonymizer, source="custom") == [] + + def test_parser_does_not_pre_redact_message_content(self, tmp_path, monkeypatch, mock_anonymizer): + custom_dir = tmp_path / "custom" + project_dir = custom_dir / "test-proj" + project_dir.mkdir(parents=True) + secret = "sk-ant-abcdefghijklmnopqrstuvwxyz123456" + (project_dir / "data.jsonl").write_text(self._make_valid_session("s1", content=f"token={secret}") + "\n") + monkeypatch.setattr("dataclaw.parsers.custom.CUSTOM_DIR", custom_dir) + sessions = parse_project_sessions("test-proj", mock_anonymizer, source="custom") + content = sessions[0]["messages"][0]["content"] + assert secret in content + assert REDACTED not in content diff --git a/tests/test_secrets.py b/tests/test_secrets.py index 90b4879..b842ecc 100644 --- a/tests/test_secrets.py +++ b/tests/test_secrets.py @@ -735,3 +735,37 @@ def test_redact_session_skips_large_base64_in_tool_output(self): assert result["messages"][0]["tool_uses"][0]["output"]["raw"]["content"][0]["source"]["data"] == blob assert result["messages"][0]["tool_uses"][0]["output"] is output assert count == 0 + + def test_redact_session_skips_short_base64_field_payload(self): + blob = "sk-ant-api03-abcdefghijklmnopqrstuvwxyz" + session = { + "messages": [ + { + "content_parts": [ + {"type": "tool_result", "content": f"Key: {blob}"}, + {"type": "image", "source": {"type": "base64", "data": blob}}, + ] + } + ] + } + result, count = redact_session(session) + assert REDACTED in result["messages"][0]["content_parts"][0]["content"] + assert result["messages"][0]["content_parts"][1]["source"]["data"] == blob + assert count >= 1 + + def test_redact_session_skips_data_url_source(self): + data_url = "data:text/plain;base64,sk-ant-api03-abcdefghijklmnopqrstuvwxyz" + session = { + "messages": [ + { + "content_parts": [ + {"type": "document", "source": {"type": "url", "url": data_url}}, + {"type": "tool_result", "content": "token: sk-ant-api03-abcdefghijklmnopqrstuvwxyz"}, + ] + } + ] + } + result, count = redact_session(session) + assert result["messages"][0]["content_parts"][0]["source"]["url"] == data_url + assert REDACTED in result["messages"][0]["content_parts"][1]["content"] + assert count >= 1