From e327abae9ee0454526f4f21269b9639d438f97ab Mon Sep 17 00:00:00 2001 From: woctordho Date: Sun, 19 Apr 2026 23:22:44 +0800 Subject: [PATCH 1/8] Fuse anonymizer and secret redaction traversal --- dataclaw/_cli/exporting.py | 10 +-- dataclaw/parsers/claude.py | 4 +- dataclaw/parsers/codex.py | 2 +- dataclaw/parsers/common.py | 87 +-------------------- dataclaw/parsers/cursor.py | 2 +- dataclaw/parsers/custom.py | 3 +- dataclaw/parsers/gemini.py | 2 +- dataclaw/parsers/kimi.py | 2 +- dataclaw/parsers/openclaw.py | 2 +- dataclaw/parsers/opencode.py | 2 +- dataclaw/secrets.py | 142 ++++++++++++++++++++++++++++------ tests/test_parser_codex.py | 4 +- tests/test_parser_common.py | 5 +- tests/test_parser_gemini.py | 2 +- tests/test_parser_opencode.py | 3 +- 15 files changed, 140 insertions(+), 132 deletions(-) diff --git a/dataclaw/_cli/exporting.py b/dataclaw/_cli/exporting.py index 3293ee1..6a068eb 100644 --- a/dataclaw/_cli/exporting.py +++ b/dataclaw/_cli/exporting.py @@ -17,7 +17,7 @@ from .._workers import configured_workers from ..anonymizer import Anonymizer from ..parser import iter_project_sessions -from ..secrets import redact_session +from ..secrets import transform_session from ..session_tasks import ExportSessionTask, build_export_session_tasks, parse_export_session_task from .common import HF_TAG, REPO_URL, SKILL_URL, _format_token_count, _provider_dataset_tags @@ -197,8 +197,8 @@ def _export_session_task_worker(payload) -> _WorkerSessionResult: if not model or model == "": return _WorkerSessionResult(project_index=task.project_index, skipped_model=True) + session, n_redacted = transform_session(session, anonymizer, custom_strings=custom_strings) fingerprint = _gemini_dedupe_fingerprint(session, task.source) - session, n_redacted = redact_session(session, custom_strings=custom_strings) stats = session.get("stats", {}) input_tokens, output_tokens = _token_totals(stats) has_token_stats = isinstance(stats, dict) and ("input_tokens" in stats or "output_tokens" in stats) @@ -283,13 +283,13 @@ def _export_to_jsonl_serial( skipped += 1 continue + session, n_redacted = transform_session(session, anonymizer, custom_strings=custom_strings) + total_redactions += n_redacted + fingerprint = _gemini_dedupe_fingerprint(session, source) if fingerprint is not None and fingerprint in seen_fingerprints: continue - session, n_redacted = redact_session(session, custom_strings=custom_strings) - total_redactions += n_redacted - if fingerprint is not None: seen_fingerprints.add(fingerprint) diff --git a/dataclaw/parsers/claude.py b/dataclaw/parsers/claude.py index 0274eb0..9608d26 100644 --- a/dataclaw/parsers/claude.py +++ b/dataclaw/parsers/claude.py @@ -422,7 +422,7 @@ def parse_session_file( except OSError: return None - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) def find_subagent_sessions(project_dir: Path) -> list[Path]: @@ -490,7 +490,7 @@ def parse_subagent_session( return None metadata["session_id"] = resolve_subagent_session_id(session_dir, metadata["session_id"]) - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) def resolve_subagent_session_id(session_dir: Path, session_id: str) -> str: diff --git a/dataclaw/parsers/codex.py b/dataclaw/parsers/codex.py index 55dbfa8..c5d7e00 100644 --- a/dataclaw/parsers/codex.py +++ b/dataclaw/parsers/codex.py @@ -283,7 +283,7 @@ def parse_session_file( else: state.metadata["model"] = 
"codex-unknown" - return make_session_result(state.metadata, state.messages, state.stats, anonymizer=anonymizer) + return make_session_result(state.metadata, state.messages, state.stats) def handle_session_meta( diff --git a/dataclaw/parsers/common.py b/dataclaw/parsers/common.py index 46bc8f0..852fae8 100644 --- a/dataclaw/parsers/common.py +++ b/dataclaw/parsers/common.py @@ -6,34 +6,9 @@ from typing import Any from .. import _json as json -from ..anonymizer import Anonymizer -from ..secrets import should_skip_large_binary_string, should_skip_structured_string_transform logger = logging.getLogger(__name__) -_NON_ANON_STRING_KEYS = frozenset( - { - "session_id", - "model", - "git_branch", - "start_time", - "end_time", - "role", - "timestamp", - "tool", - "status", - "type", - "media_type", - "mime_type", - "id", - "tool_use_id", - "sourceToolAssistantUUID", - "source", - "project", - "wall_time", - } -) - def iter_jsonl(filepath: Path): """Yield parsed JSON objects from a JSONL file, skipping blank/malformed lines.""" @@ -63,11 +38,10 @@ def make_session_result( metadata: dict[str, Any], messages: list[dict[str, Any]], stats: dict[str, int], - anonymizer: Anonymizer | None = None, ) -> dict[str, Any] | None: if not messages: return None - session = { + return { "session_id": metadata["session_id"], "model": metadata["model"], "git_branch": metadata["git_branch"], @@ -76,9 +50,6 @@ def make_session_result( "messages": messages, "stats": stats, } - if anonymizer is None: - return session - return anonymize_session(session, anonymizer) def update_time_bounds(metadata: dict[str, Any], timestamp: str | None) -> None: @@ -148,62 +119,6 @@ def normalize_timestamp(value: Any) -> str | None: return None -def _should_skip_anonymizing_string(key: str | None, value: str, parent_dict: dict[str, Any] | None) -> bool: - if key in _NON_ANON_STRING_KEYS: - return True - return should_skip_structured_string_transform(key, value, parent_dict) - - -def _anonymize_session_value( - key: str | None, - value: Any, - anonymizer: Anonymizer, - parent_dict: dict[str, Any] | None = None, -) -> tuple[Any, bool]: - if isinstance(value, str): - if _should_skip_anonymizing_string(key, value, parent_dict): - return value, False - if should_skip_large_binary_string(value): - return value, False - anonymized = anonymizer.text(value) - return anonymized, anonymized != value - - if isinstance(value, dict): - out: dict[str, Any] | None = None - for child_key, child_value in value.items(): - anonymized_child, changed = _anonymize_session_value(child_key, child_value, anonymizer, value) - if not changed: - continue - if out is None: - out = dict(value) - out[child_key] = anonymized_child - if out is None: - return value, False - return out, True - - if isinstance(value, list): - out_list: list[Any] | None = None - for idx, item in enumerate(value): - anonymized_item, changed = _anonymize_session_value(key, item, anonymizer, parent_dict) - if not changed: - continue - if out_list is None: - out_list = list(value) - out_list[idx] = anonymized_item - if out_list is None: - return value, False - return out_list, True - - return value, False - - -def anonymize_session(session: dict[str, Any], anonymizer: Anonymizer) -> dict[str, Any]: - anonymized, _changed = _anonymize_session_value(None, session, anonymizer) - if isinstance(anonymized, dict): - return anonymized - return session - - def parse_tool_input(input_data: Any) -> dict: """Return a structured dict for a tool's input args without anonymizing it yet.""" if not 
isinstance(input_data, dict): diff --git a/dataclaw/parsers/cursor.py b/dataclaw/parsers/cursor.py index 189d390..199f2b2 100644 --- a/dataclaw/parsers/cursor.py +++ b/dataclaw/parsers/cursor.py @@ -409,4 +409,4 @@ def parse_session( if metadata["model"] is None: metadata["model"] = "cursor-unknown" - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) diff --git a/dataclaw/parsers/custom.py b/dataclaw/parsers/custom.py index f9a4ee4..0e42990 100644 --- a/dataclaw/parsers/custom.py +++ b/dataclaw/parsers/custom.py @@ -5,7 +5,6 @@ from .. import _json as json from ..anonymizer import Anonymizer from ..export_tasks import ExportSessionTask -from .common import anonymize_session logger = logging.getLogger(__name__) @@ -154,4 +153,4 @@ def parse_session_bytes(project_dir_name: str, raw_line: bytes | str, anonymizer session["project"] = f"custom:{project_dir_name}" session["source"] = SOURCE - return anonymize_session(session, anonymizer) + return session diff --git a/dataclaw/parsers/gemini.py b/dataclaw/parsers/gemini.py index 725961c..c09611a 100644 --- a/dataclaw/parsers/gemini.py +++ b/dataclaw/parsers/gemini.py @@ -532,4 +532,4 @@ def parse_session_file( stats["assistant_messages"] += 1 update_time_bounds(metadata, timestamp) - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) diff --git a/dataclaw/parsers/kimi.py b/dataclaw/parsers/kimi.py index b426637..4744f22 100644 --- a/dataclaw/parsers/kimi.py +++ b/dataclaw/parsers/kimi.py @@ -277,4 +277,4 @@ def parse_session_file( logger.warning("Failed to read Kimi session file %s: %s", filepath, e) return None - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) diff --git a/dataclaw/parsers/openclaw.py b/dataclaw/parsers/openclaw.py index 6be2301..aff29ee 100644 --- a/dataclaw/parsers/openclaw.py +++ b/dataclaw/parsers/openclaw.py @@ -329,7 +329,7 @@ def parse_session_file( if metadata["model"] is None: metadata["model"] = "openclaw-unknown" - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) def _build_openclaw_tool_result(msg_data: dict[str, Any]) -> dict[str, Any]: diff --git a/dataclaw/parsers/opencode.py b/dataclaw/parsers/opencode.py index 0569a51..4c51985 100644 --- a/dataclaw/parsers/opencode.py +++ b/dataclaw/parsers/opencode.py @@ -280,7 +280,7 @@ def _parse_session_with_connection( if metadata["model"] is None: metadata["model"] = "opencode-unknown" - return make_session_result(metadata, messages, stats, anonymizer=anonymizer) + return make_session_result(metadata, messages, stats) def extract_model(message_data: dict[str, Any]) -> str | None: diff --git a/dataclaw/secrets.py b/dataclaw/secrets.py index fb407f8..764956a 100644 --- a/dataclaw/secrets.py +++ b/dataclaw/secrets.py @@ -6,6 +6,8 @@ import ahocorasick +from .anonymizer import Anonymizer + REDACTED = "[REDACTED]" _GENERIC_SECRET_SUFFIXES = ( @@ -47,6 +49,28 @@ _FAST_PATH_LOWER_MARKERS = tuple( dict.fromkeys(("postgres", "secret_key", "aws_secret_access_key", "password", "passwd") + _GENERIC_SECRET_MARKERS) ) +_NON_ANON_STRING_KEYS = frozenset( + { + "session_id", + "model", + "git_branch", + "start_time", + "end_time", + "role", + "timestamp", + "tool", + "status", + "type", + "media_type", + "mime_type", + "id", + "tool_use_id", + 
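        # id, tool_use_id, sourceToolAssistantUUID, and wall_time come from
        # provider raw payloads; PATCH 6/8 later scopes these per provider.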
"sourceToolAssistantUUID", + "source", + "project", + "wall_time", + } +) # Ordered from most specific to least specific SECRET_PATTERNS = [ @@ -459,52 +483,113 @@ def redact_custom_strings(text: str, strings: list[str]) -> tuple[str, int]: return text, count -def _redact_value( +def _transform_value( value: Any, + anonymizer: Anonymizer | None = None, custom_strings: list[str] | None = None, key: str | None = None, parent_dict: dict[str, Any] | None = None, -) -> tuple[Any, int]: - """Recursively redact secrets from a string, list, or dict value.""" +) -> tuple[Any, int, bool]: + """Recursively anonymize and/or redact a string, list, or dict value.""" if isinstance(value, str): if should_skip_structured_string_transform(key, value, parent_dict): - return value, 0 + return value, 0, False if should_skip_large_binary_string(value): - return value, 0 - result, count = redact_text(value) + return value, 0, False + + result = value + count = 0 + changed = False + + if anonymizer is not None and key not in _NON_ANON_STRING_KEYS: + anonymized = anonymizer.text(result) + if anonymized != result: + result = anonymized + changed = True + + result, count = redact_text(result) + if count > 0: + changed = True if custom_strings: result, n = redact_custom_strings(result, custom_strings) count += n - return result, count + if n > 0: + changed = True + return result, count, changed + if isinstance(value, dict): total = 0 out: dict[Any, Any] | None = None for k, v in value.items(): - redacted, n = _redact_value(v, custom_strings, k, value) + transformed, n, changed = _transform_value(v, anonymizer, custom_strings, k, value) total += n if out is None: - if n == 0 and redacted is v: + if not changed: continue out = dict(value) - out[k] = redacted + out[k] = transformed if out is None: - return value, 0 - return out, total + return value, 0, False + return out, total, True + if isinstance(value, list): total = 0 out_list: list[Any] | None = None for idx, item in enumerate(value): - redacted, n = _redact_value(item, custom_strings, key, parent_dict) + transformed, n, changed = _transform_value(item, anonymizer, custom_strings, key, parent_dict) total += n if out_list is None: - if n == 0 and redacted is item: + if not changed: continue out_list = list(value[:idx]) - out_list.append(redacted) + out_list.append(transformed) if out_list is None: - return value, 0 - return out_list, total - return value, 0 + return value, 0, False + return out_list, total, True + return value, 0, False + + +def transform_session( + session: dict, + anonymizer: Anonymizer, + custom_strings: list[str] | None = None, +) -> tuple[dict, int]: + """Anonymize and redact all exported session content in one pass.""" + total = 0 + + for msg in session.get("messages", []): + for field in ("content", "thinking"): + if msg.get(field): + msg[field], count, _changed = _transform_value( + msg[field], + anonymizer, + custom_strings, + field, + msg, + ) + total += count + if msg.get("content_parts"): + msg["content_parts"], count, _changed = _transform_value( + msg["content_parts"], + anonymizer, + custom_strings, + "content_parts", + msg, + ) + total += count + for tool_use in msg.get("tool_uses", []): + for field in ("input", "output"): + if tool_use.get(field): + tool_use[field], count, _changed = _transform_value( + tool_use[field], + anonymizer, + custom_strings, + field, + tool_use, + ) + total += count + + return session, total def redact_session(session: dict, custom_strings: list[str] | None = None) -> tuple[dict, int]: @@ -514,18 +599,27 
@@ def redact_session(session: dict, custom_strings: list[str] | None = None) -> tu for msg in session.get("messages", []): for field in ("content", "thinking"): if msg.get(field): - msg[field], count = redact_text(msg[field]) + msg[field], count, _changed = _transform_value( + msg[field], custom_strings=custom_strings, key=field, parent_dict=msg + ) total += count - if custom_strings: - msg[field], count = redact_custom_strings(msg[field], custom_strings) - total += count if msg.get("content_parts"): - msg["content_parts"], count = _redact_value(msg["content_parts"], custom_strings) + msg["content_parts"], count, _changed = _transform_value( + msg["content_parts"], + custom_strings=custom_strings, + key="content_parts", + parent_dict=msg, + ) total += count for tool_use in msg.get("tool_uses", []): for field in ("input", "output"): if tool_use.get(field): - tool_use[field], count = _redact_value(tool_use[field], custom_strings) + tool_use[field], count, _changed = _transform_value( + tool_use[field], + custom_strings=custom_strings, + key=field, + parent_dict=tool_use, + ) total += count return session, total diff --git a/tests/test_parser_codex.py b/tests/test_parser_codex.py index 7a91eac..14f17be 100644 --- a/tests/test_parser_codex.py +++ b/tests/test_parser_codex.py @@ -329,7 +329,9 @@ def test_codex_user_local_image_fallback(self, tmp_path, monkeypatch, mock_anony assert user_message["content"] == "Please inspect this image." assert user_message["content_parts"][0]["type"] == "image" assert user_message["content_parts"][0]["source"]["type"] == "url" - assert "testuser" not in user_message["content_parts"][0]["source"]["url"] + assert ( + user_message["content_parts"][0]["source"]["url"] == "file:///Users/testuser/Documents/myrepo/tmp/image.png" + ) assert ( user_message["content_parts"][0]["source"]["url"] .replace("\\", "/") diff --git a/tests/test_parser_common.py b/tests/test_parser_common.py index 0adb2ec..c04612c 100644 --- a/tests/test_parser_common.py +++ b/tests/test_parser_common.py @@ -129,7 +129,7 @@ def test_surrogate_escapes_are_sanitized_for_export(self): class TestMakeSessionResult: - def test_centralized_anonymization_skips_base64_data(self, mock_anonymizer): + def test_make_session_result_leaves_content_raw(self, mock_anonymizer): session = make_session_result( { "session_id": "s1", @@ -155,9 +155,8 @@ def test_centralized_anonymization_skips_base64_data(self, mock_anonymizer): } ], {"user_messages": 1, "assistant_messages": 0, "tool_uses": 0, "input_tokens": 0, "output_tokens": 0}, - anonymizer=mock_anonymizer, ) assert session is not None - assert "testuser" not in session["messages"][0]["content"] + assert session["messages"][0]["content"] == "hello testuser at /Users/testuser/project" assert session["messages"][0]["content_parts"][0]["source"]["data"] == "testuserbase64payload" diff --git a/tests/test_parser_gemini.py b/tests/test_parser_gemini.py index 9e4865b..73581fa 100644 --- a/tests/test_parser_gemini.py +++ b/tests/test_parser_gemini.py @@ -190,7 +190,7 @@ def test_user_function_parts_preserved_and_linked(self, tmp_path, mock_anonymize tool_use, tool_result = message["content_parts"] assert tool_use["type"] == "tool_use" assert tool_use["name"] == "read_file" - assert "testuser" not in tool_use["input"]["file_path"] + assert tool_use["input"]["file_path"] == "/Users/testuser/Documents/myproject/src/app.py" assert tool_result == { "type": "tool_result", "tool_use_id": tool_use["id"], diff --git a/tests/test_parser_opencode.py 
b/tests/test_parser_opencode.py index f2069ff..766da84 100644 --- a/tests/test_parser_opencode.py +++ b/tests/test_parser_opencode.py @@ -189,8 +189,7 @@ def test_parse_opencode_user_file_parts(self, tmp_path, monkeypatch, mock_anonym assert message["content_parts"][1]["type"] == "document" assert message["content_parts"][1]["source"]["type"] == "url" assert message["content_parts"][1]["source"]["media_type"] == "text/plain" - assert "testuser" not in message["content_parts"][1]["source"]["url"] - assert message["content_parts"][1]["source"]["url"].startswith("file:///Users/user_") + assert message["content_parts"][1]["source"]["url"].startswith("file:///Users/testuser") assert message["content_parts"][1]["source"]["url"].endswith("/work/repo/notes.txt") def test_parse_opencode_user_file_only_message(self, tmp_path, monkeypatch, mock_anonymizer): From cf5b39d73709559caa1dc18ffd59a9d496be5a7b Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 10:22:51 +0800 Subject: [PATCH 2/8] Fix image handling for Codex CLI in both user message and tool call --- dataclaw/parsers/codex.py | 77 ++++++++++++++-- tests/test_parser_codex.py | 177 +++++++++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+), 5 deletions(-) diff --git a/dataclaw/parsers/codex.py b/dataclaw/parsers/codex.py index c5d7e00..27e3170 100644 --- a/dataclaw/parsers/codex.py +++ b/dataclaw/parsers/codex.py @@ -1,7 +1,8 @@ import dataclasses import logging +import posixpath from collections.abc import Iterable -from pathlib import Path +from pathlib import Path, PurePosixPath, PureWindowsPath from typing import Any from .. import _json as json @@ -174,6 +175,11 @@ def _build_codex_tool_result(payload: dict[str, Any]) -> dict[str, Any] | None: if payload_type == "function_call_output": raw = payload.get("output", "") + if isinstance(raw, list): + return {"output": _build_codex_structured_tool_output(raw), "status": "success"} + if not isinstance(raw, str): + return {"output": {"raw": raw}, "status": "success"} + out: dict[str, Any] = {} lines = raw.splitlines() output_lines: list[str] = [] @@ -215,6 +221,40 @@ def _build_codex_tool_result(payload: dict[str, Any]) -> dict[str, Any] | None: return None +def _build_codex_structured_tool_output(parts: list[Any]) -> dict[str, Any]: + out: dict[str, Any] = {} + text_parts: list[str] = [] + raw_parts: list[Any] = [] + + for part in parts: + if not isinstance(part, dict): + raw_parts.append(part) + continue + + part_type = part.get("type") + if part_type in {"text", "output_text"}: + text = part.get("text") + if isinstance(text, str) and text.strip(): + text_parts.append(text.strip()) + raw_parts.append(part) + continue + + if part_type == "input_image": + image_url = part.get("image_url") + if isinstance(image_url, str) and image_url: + image_part = _build_codex_image_part(image_url) + if image_part is not None: + raw_parts.append(image_part) + continue + raw_parts.append(part) + + if text_parts: + out["text"] = "\n\n".join(text_parts) + if raw_parts: + out["raw"] = {"content": raw_parts} + return out + + def parse_session_file( filepath: Path, anonymizer: Anonymizer, @@ -353,18 +393,45 @@ def _build_codex_image_part(image_url: str) -> dict[str, Any] | None: def _build_codex_local_image_part(image_path: str, state: CodexParseState) -> dict[str, Any]: - path = Path(image_path) - if not path.is_absolute() and state.raw_cwd != UNKNOWN_CODEX_CWD: - path = Path(state.raw_cwd) / path + path = _resolve_codex_local_path(image_path, state.raw_cwd) return { "type": "image", 
"source": { "type": "url", - "url": f"file://{path}", + "url": _codex_file_url(path), }, } +def _is_windows_absolute_path(path: str) -> bool: + return PureWindowsPath(path).is_absolute() + + +def _is_posix_absolute_path(path: str) -> bool: + return PurePosixPath(path).is_absolute() + + +def _resolve_codex_local_path(image_path: str, cwd: str) -> str: + if _is_windows_absolute_path(image_path) or _is_posix_absolute_path(image_path): + return image_path + if cwd == UNKNOWN_CODEX_CWD: + return image_path + if _is_windows_absolute_path(cwd): + return str(PureWindowsPath(cwd) / image_path) + if _is_posix_absolute_path(cwd): + return posixpath.join(cwd, image_path.replace("\\", "/")) + return str(Path(cwd) / image_path) + + +def _codex_file_url(path: str) -> str: + if _is_windows_absolute_path(path): + return PureWindowsPath(path).as_uri() + if _is_posix_absolute_path(path): + return PurePosixPath(path.replace("\\", "/")).as_uri() + normalized_path = path.replace("\\", "/") + return f"file://{normalized_path}" + + def _extract_response_user_content_parts(payload: dict[str, Any]) -> list[dict[str, Any]]: content_parts: list[dict[str, Any]] = [] for part in payload.get("content", []): diff --git a/tests/test_parser_codex.py b/tests/test_parser_codex.py index 14f17be..9350a44 100644 --- a/tests/test_parser_codex.py +++ b/tests/test_parser_codex.py @@ -338,6 +338,65 @@ def test_codex_user_local_image_fallback(self, tmp_path, monkeypatch, mock_anony .endswith("/Documents/myrepo/tmp/image.png") ) + def test_codex_user_windows_local_image_fallback(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"codex"}) + codex_sessions = tmp_path / "codex-sessions" / "2026" / "05" / "02" + codex_sessions.mkdir(parents=True) + session_file = codex_sessions / "rollout-windows-local-image.jsonl" + lines = [ + { + "timestamp": "2026-05-02T02:12:26.890Z", + "type": "session_meta", + "payload": { + "id": "session-windows-local-image", + "cwd": "C:\\tmp\\test_codex", + "model_provider": "custom", + }, + }, + { + "timestamp": "2026-05-02T02:12:26.891Z", + "type": "turn_context", + "payload": { + "cwd": "C:\\tmp\\test_codex", + "model": "gpt-5.5", + }, + }, + { + "timestamp": "2026-05-02T02:12:27.079Z", + "type": "event_msg", + "payload": { + "type": "user_message", + "message": "Let's test the image read tool again. 
Read [Image #1] and describe it.", + "images": [], + "local_images": ["in.png"], + "text_elements": [{"placeholder": "[Image #1]"}], + }, + }, + ] + session_file.write_text("\n".join(json.dumps(line) for line in lines) + "\n") + + monkeypatch.setattr("dataclaw.parsers.codex.CODEX_SESSIONS_DIR", tmp_path / "codex-sessions") + monkeypatch.setattr("dataclaw.parsers.codex.CODEX_ARCHIVED_DIR", tmp_path / "codex-archived") + + result = parse_session_file( + session_file, + mock_anonymizer, + include_thinking=True, + target_cwd="C:\\tmp\\test_codex", + ) + + assert result is not None + user_message = result["messages"][0] + assert user_message["content_parts"] == [ + { + "type": "image", + "source": { + "type": "url", + "url": "file:///C:/tmp/test_codex/in.png", + }, + } + ] + def test_codex_image_only_response_item_flushes_user_message(self, tmp_path, monkeypatch, mock_anonymizer): disable_other_providers(monkeypatch, tmp_path, keep={"codex"}) codex_sessions = tmp_path / "codex-sessions" / "2026" / "04" / "02" @@ -425,6 +484,43 @@ def test_function_call_output(self, mock_anonymizer): assert result["call-1"]["output"]["wall_time"] == "1 seconds" assert "hello world" in result["call-1"]["output"]["output"] + def test_view_image_function_call_output_preserves_image_data(self, mock_anonymizer): + entries = [ + { + "type": "response_item", + "payload": { + "type": "function_call_output", + "call_id": "call-image", + "output": [ + { + "type": "input_image", + "image_url": "data:image/png;base64,QUJDRA==", + } + ], + }, + } + ] + + result = build_tool_result_map(entries) + + assert result["call-image"] == { + "status": "success", + "output": { + "raw": { + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + }, + } + ] + } + }, + } + def test_custom_tool_call_output(self, mock_anonymizer): entries = [ { @@ -521,3 +617,84 @@ def test_output_attached_end_to_end(self, tmp_path, monkeypatch, mock_anonymizer assert tool_use["status"] == "success" assert tool_use["output"]["exit_code"] == 0 assert "foo.py" in tool_use["output"]["output"] + + def test_view_image_output_attached_end_to_end(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"codex"}) + codex_sessions = tmp_path / "codex-sessions" / "2026" / "05" / "02" + codex_sessions.mkdir(parents=True) + session_file = codex_sessions / "rollout-view-image.jsonl" + lines = [ + { + "timestamp": "2026-05-02T02:07:05.022Z", + "type": "session_meta", + "payload": {"id": "s-view-image", "cwd": "C:\\tmp\\test_codex", "model_provider": "custom"}, + }, + { + "timestamp": "2026-05-02T02:07:05.024Z", + "type": "turn_context", + "payload": {"cwd": "C:\\tmp\\test_codex", "model": "gpt-5.5"}, + }, + { + "timestamp": "2026-05-02T02:07:05.025Z", + "type": "event_msg", + "payload": { + "type": "user_message", + "message": "Let's test the image read tool. 
Read in.png in this folder and describe it.", + "images": [], + "local_images": [], + "text_elements": [], + }, + }, + { + "timestamp": "2026-05-02T02:07:25.553Z", + "type": "response_item", + "payload": { + "type": "function_call", + "name": "view_image", + "call_id": "call-image", + "arguments": json.dumps({"path": "C:\\tmp\\test_codex\\in.png", "detail": "original"}), + }, + }, + { + "timestamp": "2026-05-02T02:07:25.676Z", + "type": "response_item", + "payload": { + "type": "function_call_output", + "call_id": "call-image", + "output": [ + { + "type": "input_image", + "image_url": "data:image/png;base64,QUJDRA==", + } + ], + }, + }, + { + "timestamp": "2026-05-02T02:07:40.596Z", + "type": "event_msg", + "payload": {"type": "agent_message", "message": "It is a screenshot."}, + }, + ] + session_file.write_text("\n".join(json.dumps(line) for line in lines) + "\n") + + monkeypatch.setattr("dataclaw.parsers.codex.CODEX_SESSIONS_DIR", tmp_path / "codex-sessions") + monkeypatch.setattr("dataclaw.parsers.codex.CODEX_ARCHIVED_DIR", tmp_path / "codex-archived") + + result = parse_session_file( + session_file, + mock_anonymizer, + include_thinking=True, + target_cwd="C:\\tmp\\test_codex", + ) + + assert result is not None + assistant_messages = [message for message in result["messages"] if message["role"] == "assistant"] + tool_use = assistant_messages[0]["tool_uses"][0] + assert tool_use["tool"] == "view_image" + assert tool_use["input"] == {"path": "C:\\tmp\\test_codex\\in.png", "detail": "original"} + assert tool_use["status"] == "success" + assert tool_use["output"]["raw"]["content"][0]["source"] == { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + } From ba826e666fc8ba011f4292c15a15b22e9793cab3 Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 11:30:47 +0800 Subject: [PATCH 3/8] Fix image handling for Gemini CLI in both user message and tool call with new jsonl format --- dataclaw/parsers/gemini.py | 63 ++++++++++++-- tests/test_parser_gemini.py | 168 +++++++++++++++++++++++++++++++++++- 2 files changed, 222 insertions(+), 9 deletions(-) diff --git a/dataclaw/parsers/gemini.py b/dataclaw/parsers/gemini.py index c09611a..175f1b6 100644 --- a/dataclaw/parsers/gemini.py +++ b/dataclaw/parsers/gemini.py @@ -13,6 +13,7 @@ from .common import ( collect_project_sessions, count_existing_paths_and_sizes, + iter_jsonl, make_session_result, make_stats, parse_tool_input, @@ -27,6 +28,47 @@ _HASH_MAP: dict[str, str] = {} +def iter_gemini_session_files(chats_dir: Path) -> list[Path]: + return sorted([*chats_dir.glob("session-*.json"), *chats_dir.glob("session-*.jsonl")]) + + +def load_gemini_session_data(filepath: Path) -> dict[str, Any]: + if filepath.suffix == ".jsonl": + return load_gemini_jsonl_session_data(filepath) + + with open(filepath, "rb") as f: + return json.load(f) + + +def load_gemini_jsonl_session_data(filepath: Path) -> dict[str, Any]: + data: dict[str, Any] = {"messages": []} + message_positions: dict[str, int] = {} + + for entry in iter_jsonl(filepath): + update = entry.get("$set") + if isinstance(update, dict): + data.update(update) + continue + + if entry.get("type") in ("user", "gemini"): + message_id = entry.get("id") + if isinstance(message_id, str) and message_id: + position = message_positions.get(message_id) + if position is not None: + data["messages"][position] = entry + continue + message_positions[message_id] = len(data["messages"]) + data["messages"].append(entry) + continue + + if isinstance(entry.get("sessionId"), str): + for key in 
("sessionId", "startTime", "lastUpdated"): + if key in entry: + data[key] = entry[key] + + return data + + def build_hash_map() -> dict[str, str]: """Build a mapping from SHA-256 hash prefix to directory path.""" result: dict[str, str] = {} @@ -54,10 +96,9 @@ def extract_project_path_from_sessions(project_hash: str, gemini_dir: Path) -> s if not chats_dir.exists(): return None - for session_file in sorted(chats_dir.glob("session-*.json"), reverse=True): + for session_file in sorted(iter_gemini_session_files(chats_dir), reverse=True): try: - with open(session_file, "rb") as f: - data = json.load(f) + data = load_gemini_session_data(session_file) except json.JSONDecodeError as e: logger.warning("Failed to parse JSON in %s: %s", session_file, e) continue @@ -139,7 +180,7 @@ def discover_projects( chats_dir = project_dir / "chats" if not chats_dir.exists(): continue - session_count, total_size = count_existing_paths_and_sizes(chats_dir.glob("session-*.json")) + session_count, total_size = count_existing_paths_and_sizes(iter_gemini_session_files(chats_dir)) if session_count == 0: continue projects.append( @@ -164,7 +205,7 @@ def parse_project_sessions( return () return collect_project_sessions( - sorted(project_path.glob("session-*.json")), + iter_gemini_session_files(project_path), lambda session_file: parse_session_file(session_file, anonymizer, include_thinking), build_project_name(project_dir_name), SOURCE, @@ -177,7 +218,7 @@ def build_export_session_tasks(project_index: int, project: dict) -> list[Export return [] tasks: list[ExportSessionTask] = [] - for task_index, session_file in enumerate(sorted(project_path.glob("session-*.json"))): + for task_index, session_file in enumerate(iter_gemini_session_files(project_path)): tasks.append( ExportSessionTask( source=SOURCE, @@ -212,6 +253,7 @@ def parse_tool_call(tool_call: dict) -> dict: output_text: str | None = None extra_texts: list[str] = [] + raw_parts: list[dict[str, Any]] = [] for item in result_list: if not isinstance(item, dict): continue @@ -220,6 +262,10 @@ def parse_tool_call(tool_call: dict) -> dict: output_text = resp.get("output") elif "text" in item: extra_texts.append(item["text"]) + elif "inlineData" in item or "fileData" in item: + _text, content_part = parse_gemini_user_part(item, defaultdict(deque), defaultdict(int)) + if content_part is not None: + raw_parts.append(content_part) if name == "read_file": inp = {"file_path": args.get("file_path", "")} @@ -316,6 +362,8 @@ def parse_tool_call(tool_call: dict) -> dict: out = {"text": output_text} else: out = {} + if raw_parts: + out["raw"] = {"content": raw_parts} return {"tool": name, "input": inp, "output": out, "status": status} @@ -456,8 +504,7 @@ def parse_session_file( include_thinking: bool = True, ) -> dict | None: try: - with open(filepath, "rb") as f: - data = json.load(f) + data = load_gemini_session_data(filepath) except json.JSONDecodeError as e: logger.warning("Failed to parse JSON in %s: %s", filepath, e) return None diff --git a/tests/test_parser_gemini.py b/tests/test_parser_gemini.py index 73581fa..7147ff9 100644 --- a/tests/test_parser_gemini.py +++ b/tests/test_parser_gemini.py @@ -1,7 +1,12 @@ """Tests for Gemini parser behavior.""" from dataclaw import _json as json -from dataclaw.parsers.gemini import discover_projects, parse_session_file +from dataclaw.parsers.gemini import ( + build_export_session_tasks, + discover_projects, + parse_project_sessions, + parse_session_file, +) class TestDiscoverGeminiProjects: @@ -20,6 +25,54 @@ def 
test_discovers_sessions_without_materializing_file_list(self, tmp_path, monk assert projects[0]["display_name"] == "gemini:resolved-project" assert projects[0]["session_count"] == 2 + def test_discovers_jsonl_sessions(self, tmp_path, monkeypatch): + gemini_dir = tmp_path / "tmp" + chats_dir = gemini_dir / "project-hash" / "chats" + chats_dir.mkdir(parents=True) + (chats_dir / "session-1.jsonl").write_text("{}\n", encoding="utf-8") + + monkeypatch.setattr("dataclaw.parsers.gemini.GEMINI_DIR", gemini_dir) + + projects = discover_projects(resolve_hash_fn=lambda _hash: "resolved-project") + + assert len(projects) == 1 + assert projects[0]["session_count"] == 1 + + def test_parse_project_sessions_reads_jsonl_sessions(self, tmp_path, monkeypatch, mock_anonymizer): + gemini_dir = tmp_path / "tmp" + chats_dir = gemini_dir / "project-hash" / "chats" + chats_dir.mkdir(parents=True) + session_file = chats_dir / "session-2026-05-02T02-38-5c45ceef.jsonl" + session_file.write_text( + "\n".join( + json.dumps(line) + for line in [ + { + "sessionId": "gemini-jsonl-session", + "startTime": "2026-05-02T02:38:45.721Z", + "lastUpdated": "2026-05-02T02:38:45.721Z", + }, + { + "id": "message-1", + "timestamp": "2026-05-02T02:39:06.404Z", + "type": "user", + "content": [{"text": "hello"}], + }, + ] + ) + + "\n", + encoding="utf-8", + ) + monkeypatch.setattr("dataclaw.parsers.gemini.GEMINI_DIR", gemini_dir) + + sessions = list(parse_project_sessions("project-hash", mock_anonymizer)) + tasks = build_export_session_tasks(0, {"dir_name": "project-hash", "display_name": "gemini:resolved-project"}) + + assert len(sessions) == 1 + assert sessions[0]["session_id"] == "gemini-jsonl-session" + assert len(tasks) == 1 + assert tasks[0].file_path == str(session_file) + class TestParseGeminiUserContentParts: def test_user_text_parts_preserve_whitespace_and_empty_parts(self, tmp_path, mock_anonymizer): @@ -254,3 +307,116 @@ def test_multi_mb_inline_data_preserved_verbatim(self, tmp_path, mock_anonymizer assert result is not None message = result["messages"][0] assert message["content_parts"][0]["source"]["data"] == blob + + def test_jsonl_user_inline_data_preserved(self, tmp_path, mock_anonymizer): + session_file = tmp_path / "session-gemini.jsonl" + session_file.write_text( + "\n".join( + json.dumps(line) + for line in [ + { + "sessionId": "gemini-jsonl-inline", + "startTime": "2026-05-02T02:38:45.721Z", + "lastUpdated": "2026-05-02T02:38:45.721Z", + }, + { + "id": "message-1", + "timestamp": "2026-05-02T02:39:06.404Z", + "type": "user", + "content": [ + {"text": "Let's test the image read tool again. Read @in.png and describe it."}, + {"text": "\n--- Content from referenced files ---"}, + {"inlineData": {"mimeType": "image/png", "data": "QUJDRA=="}}, + ], + }, + ] + ) + + "\n", + encoding="utf-8", + ) + + result = parse_session_file(session_file, mock_anonymizer) + + assert result is not None + message = result["messages"][0] + assert message["content"] == ( + "Let's test the image read tool again. 
Read @in.png and describe it.\n\n--- Content from referenced files ---" + ) + assert message["content_parts"] == [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + }, + } + ] + + def test_jsonl_read_file_binary_tool_output_preserved(self, tmp_path, mock_anonymizer): + session_file = tmp_path / "session-gemini.jsonl" + lines = [ + { + "sessionId": "gemini-jsonl-read-image", + "startTime": "2026-05-02T02:37:49.732Z", + "lastUpdated": "2026-05-02T02:37:49.732Z", + }, + { + "id": "message-1", + "timestamp": "2026-05-02T02:37:52.741Z", + "type": "user", + "content": [{"text": "Let's test the image read tool. Read in.png in this folder and describe it."}], + }, + { + "id": "message-2", + "timestamp": "2026-05-02T02:38:05.274Z", + "type": "gemini", + "content": "", + "thoughts": [{"description": "Reading image contents"}], + "tokens": {"input": 7, "cached": 2, "output": 3}, + "model": "gemini-3.1-pro-preview", + "toolCalls": [ + { + "id": "read_file_1", + "name": "read_file", + "args": {"file_path": "C:\\tmp\\test_codex\\in.png"}, + "result": [ + { + "functionResponse": { + "id": "read_file_1", + "name": "read_file", + "response": {"output": "Binary content provided (1 item(s))."}, + } + }, + {"inlineData": {"mimeType": "image/png", "data": "QUJDRA=="}}, + ], + "status": "success", + } + ], + }, + {"$set": {"lastUpdated": "2026-05-02T02:38:05.275Z"}}, + ] + session_file.write_text("\n".join(json.dumps(line) for line in lines) + "\n", encoding="utf-8") + + result = parse_session_file(session_file, mock_anonymizer) + + assert result is not None + assistant_message = result["messages"][1] + tool_use = assistant_message["tool_uses"][0] + assert tool_use["tool"] == "read_file" + assert tool_use["input"] == {"file_path": "C:\\tmp\\test_codex\\in.png"} + assert tool_use["output"] == { + "text": "Binary content provided (1 item(s)).", + "raw": { + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + }, + } + ] + }, + } From f2ffc7af8c159d0efdd7ee27bfec592874c0ff98 Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 11:58:03 +0800 Subject: [PATCH 4/8] Fix image handling for OpenCode in both user message and tool call --- README.md | 2 +- dataclaw/parsers/opencode.py | 35 +++++- tests/test_parser_opencode.py | 213 ++++++++++++++++++++++++++++++++++ 3 files changed, 245 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 16f81d2..0b13eb1 100644 --- a/README.md +++ b/README.md @@ -293,7 +293,7 @@ The auto-generated HF README includes: - Did you export all data, especially: - tool call inputs and outputs - long inputs and outputs that may be saved somewhere else - - binary content (may be encoded as base64) such as images. We do not apply anonymizer on binary content + - binary content (may be encoded as base64) such as images, in both user messages and tool calls. We do not apply anonymizer on binary content - subagents - Does the coding agent automatically delete old sessions? How to prevent this? 
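
For context, a minimal sketch (mirroring the tests added in this patch) of
the normalized tool output that the new build_opencode_tool_output helper
below produces when a completed `read` call carries one base64 image
attachment; the state dict shape is assumed from OpenCode's part records:

    state = {
        "status": "completed",
        "input": {"filePath": "C:\\tmp\\test_codex\\in.png"},
        "output": "Image read successfully",
        "attachments": [
            {"type": "file", "mime": "image/png", "url": "data:image/png;base64,QUJDRA=="},
        ],
    }
    # build_opencode_tool_output(state) keeps the text and lifts each
    # attachment into raw content:
    # {
    #     "text": "Image read successfully",
    #     "raw": {"content": [{"type": "image", "source": {
    #         "type": "base64", "media_type": "image/png", "data": "QUJDRA=="}}]},
    # }
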
diff --git a/dataclaw/parsers/opencode.py b/dataclaw/parsers/opencode.py index 4c51985..b1d9b95 100644 --- a/dataclaw/parsers/opencode.py +++ b/dataclaw/parsers/opencode.py @@ -342,6 +342,31 @@ def extract_opencode_file_part(part: dict[str, Any]) -> dict[str, Any] | None: return {"type": "document", "source": source} +def extract_opencode_tool_attachment(attachment: Any) -> dict[str, Any] | None: + if not isinstance(attachment, dict): + return None + return extract_opencode_file_part(attachment) + + +def build_opencode_tool_output(state: dict[str, Any]) -> dict[str, Any]: + output: dict[str, Any] = {} + text = state.get("output") + if isinstance(text, str) and text: + output["text"] = text + elif isinstance(state.get("error"), str) and state["error"]: + output["text"] = state["error"] + + attachments = state.get("attachments") + if isinstance(attachments, list): + content = [ + part for attachment in attachments if (part := extract_opencode_tool_attachment(attachment)) is not None + ] + if content: + output["raw"] = {"content": content} + + return output + + def extract_user_message(parts: Iterable[dict[str, Any]]) -> dict[str, Any] | None: text_parts: list[str] = [] content_parts: list[dict[str, Any]] = [] @@ -350,6 +375,8 @@ def extract_user_message(parts: Iterable[dict[str, Any]]) -> dict[str, Any] | No continue part_type = part.get("type") if part_type == "text": + if part.get("synthetic") is True: + continue text = part.get("text") if isinstance(text, str) and text.strip(): text_parts.append(text.strip()) @@ -403,10 +430,10 @@ def extract_assistant_content( if isinstance(status, str): tool_use["status"] = "success" if status == "completed" else status output = state.get("output") - if isinstance(output, str) and output: - tool_use["output"] = {"text": output} - elif output is not None: - tool_use["output"] = {} + attachments = state.get("attachments") + error = state.get("error") + if output is not None or attachments is not None or error is not None: + tool_use["output"] = build_opencode_tool_output(state) tool_uses.append(tool_use) if not text_parts and not thinking_parts and not tool_uses: diff --git a/tests/test_parser_opencode.py b/tests/test_parser_opencode.py index 766da84..e778177 100644 --- a/tests/test_parser_opencode.py +++ b/tests/test_parser_opencode.py @@ -250,6 +250,219 @@ def test_parse_opencode_user_file_only_message(self, tmp_path, monkeypatch, mock } ] + def test_parse_opencode_user_synthetic_image_file(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = write_opencode_db(db_path) + + session_id = "ses_synthetic_file" + cwd = "C:\\tmp\\test_codex" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_user", + session_id, + 1706000001000, + json.dumps({"role": "user", "model": {"providerID": "openai", "modelID": "gpt-5.5"}}), + ), + ) + parts = [ + {"id": "prt_text", "data": {"type": "text", "text": "Let's test the image read tool again."}}, + { + "id": "prt_synthetic_call", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_codex\\\\in.png"}', + }, + }, + { + "id": "prt_synthetic_output", + "data": {"type": "text", "synthetic": True, "text": "Image read 
successfully"}, + }, + { + "id": "prt_image", + "data": { + "type": "file", + "mime": "image/png", + "url": "data:image/png;base64,QUJDRA==", + "synthetic": True, + "filename": "in.png", + }, + }, + ] + for index, part in enumerate(parts): + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + (part["id"], "msg_user", 1706000001001 + index, json.dumps(part["data"])), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + message = sessions[0]["messages"][0] + assert message["content"] == "Let's test the image read tool again." + assert message["content_parts"] == [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + }, + } + ] + + def test_parse_opencode_tool_image_attachments(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = write_opencode_db(db_path) + + session_id = "ses_tool_attachment" + cwd = "C:\\tmp\\test_codex" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_user", + session_id, + 1706000001000, + json.dumps({"role": "user", "model": {"providerID": "openai", "modelID": "gpt-5.5"}}), + ), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_assistant", + session_id, + 1706000002000, + json.dumps({"role": "assistant", "providerID": "openai", "modelID": "gpt-5.5"}), + ), + ) + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + ("prt_user", "msg_user", 1706000001001, json.dumps({"type": "text", "text": "Read in.png"})), + ) + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "prt_read", + "msg_assistant", + 1706000002001, + json.dumps( + { + "type": "tool", + "tool": "read", + "state": { + "status": "completed", + "input": {"filePath": "C:\\tmp\\test_codex\\in.png", "limit": 2000, "offset": 1}, + "output": "Image read successfully", + "attachments": [ + { + "type": "file", + "mime": "image/png", + "url": "data:image/png;base64,QUJDRA==", + "id": "prt_attachment", + "sessionID": session_id, + "messageID": "msg_assistant", + } + ], + }, + } + ), + ), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + tool_use = sessions[0]["messages"][1]["tool_uses"][0] + assert tool_use["status"] == "success" + assert tool_use["output"] == { + "text": "Image read successfully", + "raw": { + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "QUJDRA==", + }, + } + ] + }, + } + + def test_parse_opencode_tool_error_output(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = 
write_opencode_db(db_path) + + session_id = "ses_tool_error" + cwd = "C:\\tmp\\test_codex" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_assistant", + session_id, + 1706000002000, + json.dumps({"role": "assistant", "providerID": "openai", "modelID": "gpt-5.5"}), + ), + ) + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "prt_read_error", + "msg_assistant", + 1706000002001, + json.dumps( + { + "type": "tool", + "tool": "read", + "state": { + "status": "error", + "input": {"filePath": "C:\\tmp\\test_codex\\in.png", "limit": 2000, "offset": 0}, + "error": "offset must be greater than or equal to 1", + }, + } + ), + ), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + tool_use = sessions[0]["messages"][0]["tool_uses"][0] + assert tool_use["status"] == "error" + assert tool_use["output"] == {"text": "offset must be greater than or equal to 1"} + def test_parse_project_sessions_reuses_single_db_connection(self, tmp_path, monkeypatch, mock_anonymizer): disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) db_path = tmp_path / "opencode.db" From a48232e2d3e11dab3d79fb3f3df50a43836ebd8e Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 12:21:56 +0800 Subject: [PATCH 5/8] Deduplicate image data for Claude Code --- dataclaw/parsers/claude.py | 38 ++++++++++++++++++++++++++++-- tests/test_parser_claude.py | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 2 deletions(-) diff --git a/dataclaw/parsers/claude.py b/dataclaw/parsers/claude.py index 9608d26..12f5dc4 100644 --- a/dataclaw/parsers/claude.py +++ b/dataclaw/parsers/claude.py @@ -181,7 +181,7 @@ def build_tool_result_output( if text is None: text = extract_tool_result_text(entry.get("toolUseResult")) - raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text) + raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text, raw_content) source_tool_uuid = entry.get("sourceToolAssistantUUID") if isinstance(source_tool_uuid, str) and source_tool_uuid: if raw_result is None: @@ -283,6 +283,7 @@ def normalize_tool_result_text(value: Any) -> str | None: def sanitize_tool_use_result( tool_use_result: Any, text: str | None, + raw_content: Any = None, ) -> dict[str, Any] | None: if tool_use_result is None: return None @@ -295,7 +296,7 @@ def sanitize_tool_use_result( return None return {"text": sanitized_text} - sanitized = tool_use_result + sanitized = drop_duplicate_tool_result_blobs(tool_use_result, raw_content) sanitized = drop_redundant_result_fields(sanitized) sanitized = drop_duplicate_text_fields(sanitized, text) pruned = prune_empty_values(sanitized) @@ -306,6 +307,39 @@ def sanitize_tool_use_result( return {"value": pruned} +def drop_duplicate_tool_result_blobs(tool_use_result: Any, raw_content: Any) -> Any: + duplicate_blobs = collect_tool_result_blobs(raw_content) + if not duplicate_blobs: + return tool_use_result + return drop_matching_base64_fields(tool_use_result, duplicate_blobs) + + +def collect_tool_result_blobs(value: Any) -> set[str]: + blobs: set[str] 
= set() + if isinstance(value, dict): + source = value.get("source") + if isinstance(source, dict) and source.get("type") == "base64": + data = source.get("data") + if isinstance(data, str) and data: + blobs.add(data) + for item in value.values(): + blobs.update(collect_tool_result_blobs(item)) + elif isinstance(value, list): + for item in value: + blobs.update(collect_tool_result_blobs(item)) + return blobs + + +def drop_matching_base64_fields(value: Any, duplicate_blobs: set[str], key: str | None = None) -> Any: + if isinstance(value, dict): + return {k: drop_matching_base64_fields(v, duplicate_blobs, k) for k, v in value.items()} + if isinstance(value, list): + return [drop_matching_base64_fields(item, duplicate_blobs) for item in value] + if isinstance(value, str) and key == "base64" and value in duplicate_blobs: + return None + return value + + def drop_redundant_result_fields(value: Any) -> Any: if isinstance(value, dict): redundant_keys = set() diff --git a/tests/test_parser_claude.py b/tests/test_parser_claude.py index 4a07bd3..c5d8ab2 100644 --- a/tests/test_parser_claude.py +++ b/tests/test_parser_claude.py @@ -854,6 +854,52 @@ def test_non_text_tool_result_blocks_preserved(self, mock_anonymizer): assert output["raw"]["content"][0]["type"] == "image" assert output["raw"]["content"][0]["source"]["data"] == image_data + def test_image_tool_result_drops_duplicate_tool_use_result_base64(self, mock_anonymizer): + image_data = "A" * 5000 + entries = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tu-image", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "data": image_data, + "media_type": "image/jpeg", + }, + } + ], + } + ] + }, + "toolUseResult": { + "type": "image", + "file": { + "base64": image_data, + "type": "image/jpeg", + "originalSize": 1567019, + }, + }, + "sourceToolAssistantUUID": "assistant-image", + } + ] + + result = build_tool_result_map(entries) + raw = result["tu-image"]["output"]["raw"] + + assert raw["content"][0]["source"]["data"] == image_data + assert raw["toolUseResult"]["type"] == "image" + assert raw["toolUseResult"]["file"] == { + "type": "image/jpeg", + "originalSize": 1567019, + } + assert raw["toolUseResult"]["sourceToolAssistantUUID"] == "assistant-image" + def test_large_string_blob_content_preserved_verbatim_in_raw(self, mock_anonymizer): blob = "A" * 5000 entries = [ From 6e5669b385226a41590c397b2d245ab3e055d8ac Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 12:41:56 +0800 Subject: [PATCH 6/8] Let each provider define own keys exempted from anonymization --- dataclaw/_cli/exporting.py | 15 ++++++++++++-- dataclaw/parsers/claude.py | 1 + dataclaw/parsers/codex.py | 1 + dataclaw/parsers/gemini.py | 1 + dataclaw/providers.py | 13 +++++++++++++ dataclaw/secrets.py | 27 +++++++++++++++++-------- tests/test_secrets.py | 40 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 10 deletions(-) diff --git a/dataclaw/_cli/exporting.py b/dataclaw/_cli/exporting.py index 6a068eb..f647c72 100644 --- a/dataclaw/_cli/exporting.py +++ b/dataclaw/_cli/exporting.py @@ -17,6 +17,7 @@ from .._workers import configured_workers from ..anonymizer import Anonymizer from ..parser import iter_project_sessions +from ..providers import get_provider_non_anon_string_keys from ..secrets import transform_session from ..session_tasks import ExportSessionTask, build_export_session_tasks, parse_export_session_task from .common import HF_TAG, REPO_URL, SKILL_URL, 
_format_token_count, _provider_dataset_tags @@ -197,7 +198,12 @@ def _export_session_task_worker(payload) -> _WorkerSessionResult: if not model or model == "": return _WorkerSessionResult(project_index=task.project_index, skipped_model=True) - session, n_redacted = transform_session(session, anonymizer, custom_strings=custom_strings) + session, n_redacted = transform_session( + session, + anonymizer, + custom_strings=custom_strings, + non_anon_string_keys=get_provider_non_anon_string_keys(task.source), + ) fingerprint = _gemini_dedupe_fingerprint(session, task.source) stats = session.get("stats", {}) input_tokens, output_tokens = _token_totals(stats) @@ -283,7 +289,12 @@ def _export_to_jsonl_serial( skipped += 1 continue - session, n_redacted = transform_session(session, anonymizer, custom_strings=custom_strings) + session, n_redacted = transform_session( + session, + anonymizer, + custom_strings=custom_strings, + non_anon_string_keys=get_provider_non_anon_string_keys(source), + ) total_redactions += n_redacted fingerprint = _gemini_dedupe_fingerprint(session, source) diff --git a/dataclaw/parsers/claude.py b/dataclaw/parsers/claude.py index 12f5dc4..030d5b0 100644 --- a/dataclaw/parsers/claude.py +++ b/dataclaw/parsers/claude.py @@ -22,6 +22,7 @@ SOURCE = "claude" CLAUDE_DIR = Path.home() / ".claude" PROJECTS_DIR = CLAUDE_DIR / "projects" +NON_ANON_STRING_KEYS = frozenset({"sourceToolAssistantUUID"}) def discover_projects(projects_dir: Path | None = None) -> list[dict]: diff --git a/dataclaw/parsers/codex.py b/dataclaw/parsers/codex.py index 27e3170..9f89a8f 100644 --- a/dataclaw/parsers/codex.py +++ b/dataclaw/parsers/codex.py @@ -29,6 +29,7 @@ CODEX_DIR = Path.home() / ".codex" CODEX_SESSIONS_DIR = CODEX_DIR / "sessions" CODEX_ARCHIVED_DIR = CODEX_DIR / "archived_sessions" +NON_ANON_STRING_KEYS = frozenset({"wall_time"}) UNKNOWN_CODEX_CWD = "" _PROJECT_INDEX: dict[str, list[Path]] = {} diff --git a/dataclaw/parsers/gemini.py b/dataclaw/parsers/gemini.py index 175f1b6..919b9ea 100644 --- a/dataclaw/parsers/gemini.py +++ b/dataclaw/parsers/gemini.py @@ -24,6 +24,7 @@ SOURCE = "gemini" GEMINI_DIR = Path.home() / ".gemini" / "tmp" +NON_ANON_STRING_KEYS = frozenset({"id", "name", "tool_use_id"}) _HASH_MAP: dict[str, str] = {} diff --git a/dataclaw/providers.py b/dataclaw/providers.py index e434614..8780387 100644 --- a/dataclaw/providers.py +++ b/dataclaw/providers.py @@ -51,6 +51,9 @@ def has_session_source(self) -> bool: def missing_source_message(self) -> str: return f"{self.source_path} was not found." 
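+    # Base default: a provider exempts no extra keys from anonymization.
+    # ModuleProvider overrides this by reading NON_ANON_STRING_KEYS from
+    # its parser module when the module defines it.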
+ def non_anon_string_keys(self) -> frozenset[str]: + return frozenset() + @dataclass(frozen=True) class ModuleProvider(Provider): @@ -87,6 +90,9 @@ def parse_export_session_task( ) -> dict | None: return self.module.parse_export_session_task(task, anonymizer, include_thinking) + def non_anon_string_keys(self) -> frozenset[str]: + return frozenset(getattr(self.module, "NON_ANON_STRING_KEYS", ())) + PROVIDERS: dict[str, Provider] = { _claude_mod.SOURCE: ModuleProvider.from_module( @@ -138,5 +144,12 @@ def get_provider(source: str) -> Provider: return PROVIDERS[source] +def get_provider_non_anon_string_keys(source: str) -> frozenset[str]: + provider = PROVIDERS.get(source) + if provider is None: + return frozenset() + return provider.non_anon_string_keys() + + def iter_providers() -> tuple[Provider, ...]: return PROVIDER_ORDER diff --git a/dataclaw/secrets.py b/dataclaw/secrets.py index 764956a..b192375 100644 --- a/dataclaw/secrets.py +++ b/dataclaw/secrets.py @@ -51,6 +51,9 @@ ) _NON_ANON_STRING_KEYS = frozenset( { + # Keep this list limited to keys in DataClaw's normalized export schema. + # Providers that preserve source-specific raw payload fields can expose + # NON_ANON_STRING_KEYS in their parser module; export passes those here. "session_id", "model", "git_branch", @@ -62,13 +65,8 @@ "status", "type", "media_type", - "mime_type", - "id", - "tool_use_id", - "sourceToolAssistantUUID", "source", "project", - "wall_time", } ) @@ -489,6 +487,7 @@ def _transform_value( custom_strings: list[str] | None = None, key: str | None = None, parent_dict: dict[str, Any] | None = None, + non_anon_string_keys: frozenset[str] = _NON_ANON_STRING_KEYS, ) -> tuple[Any, int, bool]: """Recursively anonymize and/or redact a string, list, or dict value.""" if isinstance(value, str): @@ -501,7 +500,7 @@ def _transform_value( count = 0 changed = False - if anonymizer is not None and key not in _NON_ANON_STRING_KEYS: + if anonymizer is not None and key not in non_anon_string_keys: anonymized = anonymizer.text(result) if anonymized != result: result = anonymized @@ -521,7 +520,7 @@ def _transform_value( total = 0 out: dict[Any, Any] | None = None for k, v in value.items(): - transformed, n, changed = _transform_value(v, anonymizer, custom_strings, k, value) + transformed, n, changed = _transform_value(v, anonymizer, custom_strings, k, value, non_anon_string_keys) total += n if out is None: if not changed: @@ -536,7 +535,14 @@ def _transform_value( total = 0 out_list: list[Any] | None = None for idx, item in enumerate(value): - transformed, n, changed = _transform_value(item, anonymizer, custom_strings, key, parent_dict) + transformed, n, changed = _transform_value( + item, + anonymizer, + custom_strings, + key, + parent_dict, + non_anon_string_keys, + ) total += n if out_list is None: if not changed: @@ -553,9 +559,11 @@ def transform_session( session: dict, anonymizer: Anonymizer, custom_strings: list[str] | None = None, + non_anon_string_keys: frozenset[str] | set[str] | None = None, ) -> tuple[dict, int]: """Anonymize and redact all exported session content in one pass.""" total = 0 + effective_non_anon_string_keys = _NON_ANON_STRING_KEYS | frozenset(non_anon_string_keys or ()) for msg in session.get("messages", []): for field in ("content", "thinking"): @@ -566,6 +574,7 @@ def transform_session( custom_strings, field, msg, + effective_non_anon_string_keys, ) total += count if msg.get("content_parts"): @@ -575,6 +584,7 @@ def transform_session( custom_strings, "content_parts", msg, + 
effective_non_anon_string_keys, ) total += count for tool_use in msg.get("tool_uses", []): @@ -586,6 +596,7 @@ def transform_session( custom_strings, field, tool_use, + effective_non_anon_string_keys, ) total += count diff --git a/tests/test_secrets.py b/tests/test_secrets.py index b842ecc..2683135 100644 --- a/tests/test_secrets.py +++ b/tests/test_secrets.py @@ -3,6 +3,7 @@ import pytest from dataclaw.secrets import ( + _NON_ANON_STRING_KEYS, REDACTED, _has_mixed_char_types, _shannon_entropy, @@ -11,6 +12,7 @@ redact_text, scan_text, should_skip_large_binary_string, + transform_session, ) # --- _shannon_entropy --- @@ -769,3 +771,41 @@ def test_redact_session_skips_data_url_source(self): assert result["messages"][0]["content_parts"][0]["source"]["url"] == data_url assert REDACTED in result["messages"][0]["content_parts"][1]["content"] assert count >= 1 + + +class TestTransformSession: + def test_common_non_anon_keys_do_not_include_provider_raw_keys(self): + assert "sourceToolAssistantUUID" not in _NON_ANON_STRING_KEYS + assert "wall_time" not in _NON_ANON_STRING_KEYS + assert "tool_use_id" not in _NON_ANON_STRING_KEYS + + def test_provider_can_exempt_raw_payload_keys(self, mock_anonymizer): + session = { + "messages": [ + { + "role": "assistant", + "tool_uses": [ + { + "tool": "Read", + "output": { + "raw": { + "sourceToolAssistantUUID": "assistant-testuser", + "note": "path owned by testuser", + } + }, + } + ], + } + ] + } + + result, count = transform_session( + session, + mock_anonymizer, + non_anon_string_keys={"sourceToolAssistantUUID"}, + ) + + raw = result["messages"][0]["tool_uses"][0]["output"]["raw"] + assert count == 0 + assert raw["sourceToolAssistantUUID"] == "assistant-testuser" + assert raw["note"] == f"path owned by {mock_anonymizer.username_hash}" From 2fc90b327ed23ddcd6d4c06fb5e2d139ce7d851f Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 13:02:20 +0800 Subject: [PATCH 7/8] Keep text file content in user messages for OpenCode --- dataclaw/parsers/opencode.py | 48 ++++- tests/test_parser_opencode.py | 320 ++++++++++++++++++++++++++++++++++ 2 files changed, 359 insertions(+), 9 deletions(-) diff --git a/dataclaw/parsers/opencode.py b/dataclaw/parsers/opencode.py index b1d9b95..f73449a 100644 --- a/dataclaw/parsers/opencode.py +++ b/dataclaw/parsers/opencode.py @@ -26,6 +26,8 @@ OPENCODE_DIR = Path.home() / ".local" / "share" / "opencode" OPENCODE_DB_PATH = OPENCODE_DIR / "opencode.db" UNKNOWN_OPENCODE_CWD = "" +READ_TOOL_SYNTHETIC_PREFIX = "Called the Read tool with the following input: " +IMAGE_READ_SUCCESS_TEXT = "Image read successfully" _PROJECT_INDEX: dict[str, list[str]] = {} _SESSION_SIZE_MAP: dict[str, int] = {} @@ -331,15 +333,23 @@ def build_opencode_file_source(url: Any, mime: Any) -> dict[str, Any] | None: return source -def extract_opencode_file_part(part: dict[str, Any]) -> dict[str, Any] | None: +def extract_opencode_file_part(part: dict[str, Any], file_path: str | None = None) -> dict[str, Any] | None: source = build_opencode_file_source(part.get("url"), part.get("mime")) if source is None: return None mime = part.get("mime") if isinstance(mime, str) and mime.startswith("image/"): - return {"type": "image", "source": source} - return {"type": "document", "source": source} + content_part = {"type": "image", "source": source} + else: + content_part = {"type": "document", "source": source} + + if source.get("type") == "base64": + if file_path: + content_part["path"] = file_path + elif isinstance(part.get("filename"), str) and part["filename"]: 
+ content_part["filename"] = part["filename"] + return content_part def extract_opencode_tool_attachment(attachment: Any) -> dict[str, Any] | None: @@ -348,6 +358,16 @@ def extract_opencode_tool_attachment(attachment: Any) -> dict[str, Any] | None: return extract_opencode_file_part(attachment) +def extract_synthetic_read_file_path(text: Any) -> str | None: + if not isinstance(text, str) or not text.startswith(READ_TOOL_SYNTHETIC_PREFIX): + return None + args = load_json_field(text.removeprefix(READ_TOOL_SYNTHETIC_PREFIX)) + file_path = args.get("filePath") + if isinstance(file_path, str) and file_path: + return file_path + return None + + def build_opencode_tool_output(state: dict[str, Any]) -> dict[str, Any]: output: dict[str, Any] = {} text = state.get("output") @@ -368,22 +388,32 @@ def build_opencode_tool_output(state: dict[str, Any]) -> dict[str, Any]: def extract_user_message(parts: Iterable[dict[str, Any]]) -> dict[str, Any] | None: + part_list = [part for part in parts if isinstance(part, dict)] + pending_file_path: str | None = None + text_parts: list[str] = [] content_parts: list[dict[str, Any]] = [] - for part in parts: - if not isinstance(part, dict): - continue + for part in part_list: part_type = part.get("type") if part_type == "text": - if part.get("synthetic") is True: - continue text = part.get("text") + if part.get("synthetic") is True: + file_path = extract_synthetic_read_file_path(text) + if file_path is not None: + pending_file_path = file_path + continue + if text == IMAGE_READ_SUCCESS_TEXT: + continue if isinstance(text, str) and text.strip(): text_parts.append(text.strip()) elif part_type == "file": - content_part = extract_opencode_file_part(part) + content_part = extract_opencode_file_part(part, pending_file_path) if content_part is not None: content_parts.append(content_part) + pending_file_path = None + else: + if part.get("synthetic") is True: + continue if not text_parts and not content_parts: return None diff --git a/tests/test_parser_opencode.py b/tests/test_parser_opencode.py index e778177..2f7f036 100644 --- a/tests/test_parser_opencode.py +++ b/tests/test_parser_opencode.py @@ -180,6 +180,7 @@ def test_parse_opencode_user_file_parts(self, tmp_path, monkeypatch, mock_anonym assert message["content"] == "Please inspect these files." 
assert message["content_parts"][0] == { "type": "image", + "filename": "plot.png", "source": { "type": "base64", "media_type": "image/png", @@ -242,6 +243,7 @@ def test_parse_opencode_user_file_only_message(self, tmp_path, monkeypatch, mock assert message["content_parts"] == [ { "type": "image", + "filename": "plot.png", "source": { "type": "base64", "media_type": "image/png", @@ -314,6 +316,7 @@ def test_parse_opencode_user_synthetic_image_file(self, tmp_path, monkeypatch, m assert message["content_parts"] == [ { "type": "image", + "path": "C:\\tmp\\test_codex\\in.png", "source": { "type": "base64", "media_type": "image/png", @@ -322,6 +325,323 @@ def test_parse_opencode_user_synthetic_image_file(self, tmp_path, monkeypatch, m } ] + def test_parse_opencode_user_synthetic_text_file_content(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = write_opencode_db(db_path) + + session_id = "ses_synthetic_text_file" + cwd = "C:\\dataclaw" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_user", + session_id, + 1706000001000, + json.dumps({"role": "user", "model": {"providerID": "openai", "modelID": "gpt-5.5"}}), + ), + ) + parts = [ + {"id": "prt_text", "data": {"type": "text", "text": "Check @dataclaw\\cli.py"}}, + { + "id": "prt_synthetic_call", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\dataclaw\\\\dataclaw\\\\cli.py"}', + }, + }, + { + "id": "prt_synthetic_content", + "data": { + "type": "text", + "synthetic": True, + "text": 'C:\\dataclaw\\dataclaw\\cli.py\nfile\n\n1: """CLI facade for DataClaw."""\n', + }, + }, + { + "id": "prt_file", + "data": { + "type": "file", + "mime": "text/plain", + "filename": "dataclaw\\cli.py", + "url": "file:///C:/dataclaw/dataclaw/cli.py", + }, + }, + ] + for index, part in enumerate(parts): + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + (part["id"], "msg_user", 1706000001001 + index, json.dumps(part["data"])), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + message = sessions[0]["messages"][0] + assert message["content"] == ( + 'Check @dataclaw\\cli.py\n\nC:\\dataclaw\\dataclaw\\cli.py\nfile\n\n1: """CLI facade for DataClaw."""\n' + ) + assert message["content_parts"] == [ + { + "type": "document", + "source": { + "type": "url", + "url": "file:///C:/dataclaw/dataclaw/cli.py", + "media_type": "text/plain", + }, + } + ] + + def test_parse_opencode_user_synthetic_legacy_text_file_contents(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = write_opencode_db(db_path) + + session_id = "ses_legacy_text_files" + cwd = "C:\\tmp\\test_html" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, 
data) VALUES (?, ?, ?, ?)", + ( + "msg_user", + session_id, + 1706000001000, + json.dumps({"role": "user", "model": {"providerID": "openai", "modelID": "gpt-5.5"}}), + ), + ) + parts = [ + { + "id": "prt_text", + "data": {"type": "text", "text": "Read @abs.html and @abs_expected.json"}, + }, + { + "id": "prt_doc_html", + "data": { + "type": "file", + "mime": "text/plain", + "filename": "abs.html", + "url": "file://C:\\tmp\\test_html/abs.html", + }, + }, + { + "id": "prt_doc_json", + "data": { + "type": "file", + "mime": "text/plain", + "filename": "abs_expected.json", + "url": "file://C:\\tmp\\test_html/abs_expected.json", + }, + }, + { + "id": "prt_synthetic_call_json", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_html\\\\abs_expected.json"}', + }, + }, + { + "id": "prt_synthetic_call_html", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_html\\\\abs.html"}', + }, + }, + { + "id": "prt_synthetic_html", + "data": {"type": "text", "synthetic": True, "text": "\n00001| \n"}, + }, + { + "id": "prt_synthetic_json", + "data": {"type": "text", "synthetic": True, "text": '\n00001| {"Input": {}}\n'}, + }, + ] + for index, part in enumerate(parts): + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + (part["id"], "msg_user", 1706000001001 + index, json.dumps(part["data"])), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + message = sessions[0]["messages"][0] + assert "Called the Read tool" not in message["content"] + assert message["content"] == ( + 'Read @abs.html and @abs_expected.json\n\n\n00001| \n\n\n\n00001| {"Input": {}}\n' + ) + assert message["content_parts"] == [ + { + "type": "document", + "source": { + "type": "url", + "url": "file://C:\\tmp\\test_html/abs.html", + "media_type": "text/plain", + }, + }, + { + "type": "document", + "source": { + "type": "url", + "url": "file://C:\\tmp\\test_html/abs_expected.json", + "media_type": "text/plain", + }, + }, + ] + + def test_parse_opencode_user_synthetic_multiple_image_and_text_files(self, tmp_path, monkeypatch, mock_anonymizer): + disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) + db_path = tmp_path / "opencode.db" + conn = write_opencode_db(db_path) + + session_id = "ses_multiple_files" + cwd = "C:\\tmp\\test_codex" + conn.execute( + "INSERT INTO session (id, directory, time_created, time_updated) VALUES (?, ?, ?, ?)", + (session_id, cwd, 1706000000000, 1706000005000), + ) + conn.execute( + "INSERT INTO message (id, session_id, time_created, data) VALUES (?, ?, ?, ?)", + ( + "msg_user", + session_id, + 1706000001000, + json.dumps({"role": "user", "model": {"providerID": "openai", "modelID": "gpt-5.5"}}), + ), + ) + parts = [ + {"id": "prt_text", "data": {"type": "text", "text": "Read @in1.png, @in2.png, and @in3.txt"}}, + { + "id": "prt_synthetic_call_image1", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_codex\\\\in1.png"}', + }, + }, + { + "id": "prt_image_success1", + "data": {"type": "text", "synthetic": True, "text": "Image read successfully"}, + }, + { + 
"id": "prt_image1", + "data": { + "type": "file", + "mime": "image/png", + "filename": "in1.png", + "url": "data:image/png;base64,SU1HMQ==", + "synthetic": True, + }, + }, + { + "id": "prt_synthetic_call_image2", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_codex\\\\in2.png"}', + }, + }, + { + "id": "prt_image_success2", + "data": {"type": "text", "synthetic": True, "text": "Image read successfully"}, + }, + { + "id": "prt_image2", + "data": { + "type": "file", + "mime": "image/png", + "filename": "in2.png", + "url": "data:image/png;base64,SU1HMg==", + "synthetic": True, + }, + }, + { + "id": "prt_synthetic_call_text", + "data": { + "type": "text", + "synthetic": True, + "text": 'Called the Read tool with the following input: {"filePath":"C:\\\\tmp\\\\test_codex\\\\in3.txt"}', + }, + }, + { + "id": "prt_synthetic_text_content", + "data": { + "type": "text", + "synthetic": True, + "text": "C:\\tmp\\test_codex\\in3.txt\nfile\n\n1: hello\n", + }, + }, + { + "id": "prt_doc", + "data": { + "type": "file", + "mime": "text/plain", + "filename": "in3.txt", + "url": "file://C:\\tmp\\test_codex/in3.txt", + }, + }, + ] + for index, part in enumerate(parts): + conn.execute( + "INSERT INTO part (id, message_id, time_created, data) VALUES (?, ?, ?, ?)", + (part["id"], "msg_user", 1706000001001 + index, json.dumps(part["data"])), + ) + conn.commit() + conn.close() + + monkeypatch.setattr("dataclaw.parsers.opencode.OPENCODE_DB_PATH", db_path) + monkeypatch.setattr("dataclaw.parsers.opencode._PROJECT_INDEX", {}) + + sessions = parse_project_sessions(cwd, mock_anonymizer, source="opencode") + + assert len(sessions) == 1 + message = sessions[0]["messages"][0] + assert "Called the Read tool" not in message["content"] + assert "Image read successfully" not in message["content"] + assert message["content"] == ( + "Read @in1.png, @in2.png, and @in3.txt\n\n" + "C:\\tmp\\test_codex\\in3.txt\nfile\n\n1: hello\n" + ) + assert message["content_parts"] == [ + { + "type": "image", + "path": "C:\\tmp\\test_codex\\in1.png", + "source": {"type": "base64", "media_type": "image/png", "data": "SU1HMQ=="}, + }, + { + "type": "image", + "path": "C:\\tmp\\test_codex\\in2.png", + "source": {"type": "base64", "media_type": "image/png", "data": "SU1HMg=="}, + }, + { + "type": "document", + "source": { + "type": "url", + "url": "file://C:\\tmp\\test_codex/in3.txt", + "media_type": "text/plain", + }, + }, + ] + def test_parse_opencode_tool_image_attachments(self, tmp_path, monkeypatch, mock_anonymizer): disable_other_providers(monkeypatch, tmp_path, keep={"opencode"}) db_path = tmp_path / "opencode.db" From af6c4fc28ee3614b87289cd03235c49fd5616499 Mon Sep 17 00:00:00 2001 From: woctordho Date: Sat, 2 May 2026 14:51:22 +0800 Subject: [PATCH 8/8] Remove RELEASE_NOTES.md . Just write it in GitHub Releases. --- RELEASE_NOTES.md | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 RELEASE_NOTES.md diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md deleted file mode 100644 index 1992bfa..0000000 --- a/RELEASE_NOTES.md +++ /dev/null @@ -1,20 +0,0 @@ -# DataClaw 0.4.2 - -This release adds the macOS menu-bar app distribution path for DataClaw. - -## What's New - -- Signed macOS app release workflow for Apple Silicon Macs. -- Direct GitHub Release download for Apple Silicon Macs: - - `DataClaw-macOS-Apple-Silicon.dmg` -- Bundled PyInstaller sidecar so Mac app users do not need to install Python or the CLI separately. 
-- Tauri updater support through a signed `latest.json` release asset. -- Release documentation covering signing, notarization, updater credentials, and verification. - -## Install - -Download the latest Apple Silicon DMG: - -https://github.com/peteromallet/dataclaw/releases/latest/download/DataClaw-macOS-Apple-Silicon.dmg - -Intel Mac users can use the CLI install for now.
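Usage note for the fused traversal this series introduces: a provider module opts its raw payload keys out of anonymization simply by defining a module-level NON_ANON_STRING_KEYS frozenset, and the exporter threads those keys through get_provider_non_anon_string_keys into the single anonymize-plus-redact pass. A minimal sketch of driving that API directly follows; the Anonymizer constructor arguments and the secret-looking token are illustrative assumptions, not taken from these patches.

    from dataclaw.anonymizer import Anonymizer
    from dataclaw.providers import get_provider_non_anon_string_keys
    from dataclaw.secrets import transform_session

    # Assumption: Anonymizer can be constructed with defaults for local use.
    anonymizer = Anonymizer()

    session = {
        "messages": [
            {
                "role": "user",
                # Hypothetical token; a high-entropy string like this should
                # be caught by the secret scanner and replaced with REDACTED.
                "content": "export API_KEY=sk-abc123DEF456ghi789JKL012",
            }
        ]
    }

    # One walk over the session both anonymizes identifying strings and
    # redacts secrets; provider-specific raw keys (e.g. Claude's
    # sourceToolAssistantUUID) are merged into the common skip set so
    # those stable IDs survive the pass unchanged.
    session, n_redacted = transform_session(
        session,
        anonymizer,
        non_anon_string_keys=get_provider_non_anon_string_keys("claude"),
    )

Passing non_anon_string_keys explicitly, as the export paths now do, keeps the shared _NON_ANON_STRING_KEYS list limited to DataClaw's normalized schema while still letting each parser protect its own source-specific fields.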