Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ The auto-generated HF README includes:
- Did you export all data, especially:
- tool call inputs and outputs
- long inputs and outputs that may be saved somewhere else
- binary content (may be encoded as base64) such as images. We do not apply anonymizer on binary content
- binary content (may be encoded as base64) such as images, in both user messages and tool calls. We do not apply anonymizer on binary content
- subagents
- Does the coding agent automatically delete old sessions? How to prevent this?

Expand Down
20 changes: 0 additions & 20 deletions RELEASE_NOTES.md

This file was deleted.

21 changes: 16 additions & 5 deletions dataclaw/_cli/exporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
from .._workers import configured_workers
from ..anonymizer import Anonymizer
from ..parser import iter_project_sessions
from ..secrets import redact_session
from ..providers import get_provider_non_anon_string_keys
from ..secrets import transform_session
from ..session_tasks import ExportSessionTask, build_export_session_tasks, parse_export_session_task
from .common import HF_TAG, REPO_URL, SKILL_URL, _format_token_count, _provider_dataset_tags

Expand Down Expand Up @@ -197,8 +198,13 @@ def _export_session_task_worker(payload) -> _WorkerSessionResult:
if not model or model == "<synthetic>":
return _WorkerSessionResult(project_index=task.project_index, skipped_model=True)

session, n_redacted = transform_session(
session,
anonymizer,
custom_strings=custom_strings,
non_anon_string_keys=get_provider_non_anon_string_keys(task.source),
)
fingerprint = _gemini_dedupe_fingerprint(session, task.source)
session, n_redacted = redact_session(session, custom_strings=custom_strings)
stats = session.get("stats", {})
input_tokens, output_tokens = _token_totals(stats)
has_token_stats = isinstance(stats, dict) and ("input_tokens" in stats or "output_tokens" in stats)
Expand Down Expand Up @@ -283,13 +289,18 @@ def _export_to_jsonl_serial(
skipped += 1
continue

session, n_redacted = transform_session(
session,
anonymizer,
custom_strings=custom_strings,
non_anon_string_keys=get_provider_non_anon_string_keys(source),
)
total_redactions += n_redacted

fingerprint = _gemini_dedupe_fingerprint(session, source)
if fingerprint is not None and fingerprint in seen_fingerprints:
continue

session, n_redacted = redact_session(session, custom_strings=custom_strings)
total_redactions += n_redacted

if fingerprint is not None:
seen_fingerprints.add(fingerprint)

Expand Down
43 changes: 39 additions & 4 deletions dataclaw/parsers/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
SOURCE = "claude"
CLAUDE_DIR = Path.home() / ".claude"
PROJECTS_DIR = CLAUDE_DIR / "projects"
NON_ANON_STRING_KEYS = frozenset({"sourceToolAssistantUUID"})


def discover_projects(projects_dir: Path | None = None) -> list[dict]:
Expand Down Expand Up @@ -181,7 +182,7 @@ def build_tool_result_output(
if text is None:
text = extract_tool_result_text(entry.get("toolUseResult"))

raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text)
raw_result = sanitize_tool_use_result(entry.get("toolUseResult"), text, raw_content)
source_tool_uuid = entry.get("sourceToolAssistantUUID")
if isinstance(source_tool_uuid, str) and source_tool_uuid:
if raw_result is None:
Expand Down Expand Up @@ -283,6 +284,7 @@ def normalize_tool_result_text(value: Any) -> str | None:
def sanitize_tool_use_result(
tool_use_result: Any,
text: str | None,
raw_content: Any = None,
) -> dict[str, Any] | None:
if tool_use_result is None:
return None
Expand All @@ -295,7 +297,7 @@ def sanitize_tool_use_result(
return None
return {"text": sanitized_text}

sanitized = tool_use_result
sanitized = drop_duplicate_tool_result_blobs(tool_use_result, raw_content)
sanitized = drop_redundant_result_fields(sanitized)
sanitized = drop_duplicate_text_fields(sanitized, text)
pruned = prune_empty_values(sanitized)
Expand All @@ -306,6 +308,39 @@ def sanitize_tool_use_result(
return {"value": pruned}


def drop_duplicate_tool_result_blobs(tool_use_result: Any, raw_content: Any) -> Any:
    """Strip base64 payloads from *tool_use_result* that already appear in *raw_content*.

    When *raw_content* carries no base64 blobs the input is returned untouched;
    otherwise a copy of *tool_use_result* with the duplicated blobs nulled out
    is returned.
    """
    known_blobs = collect_tool_result_blobs(raw_content)
    if known_blobs:
        return drop_matching_base64_fields(tool_use_result, known_blobs)
    return tool_use_result


def collect_tool_result_blobs(value: Any) -> set[str]:
    """Recursively gather every non-empty base64 ``data`` string found inside
    ``{"source": {"type": "base64", "data": ...}}`` shapes anywhere in *value*.
    """
    found: set[str] = set()
    if isinstance(value, list):
        for element in value:
            found |= collect_tool_result_blobs(element)
    elif isinstance(value, dict):
        src = value.get("source")
        if isinstance(src, dict) and src.get("type") == "base64":
            payload = src.get("data")
            if isinstance(payload, str) and payload:
                found.add(payload)
        # Descend into every value (including the "source" dict itself) to
        # pick up blobs nested deeper in the structure.
        for element in value.values():
            found |= collect_tool_result_blobs(element)
    return found


def drop_matching_base64_fields(value: Any, duplicate_blobs: set[str], key: str | None = None) -> Any:
if isinstance(value, dict):
return {k: drop_matching_base64_fields(v, duplicate_blobs, k) for k, v in value.items()}
if isinstance(value, list):
return [drop_matching_base64_fields(item, duplicate_blobs) for item in value]
if isinstance(value, str) and key == "base64" and value in duplicate_blobs:
return None
return value


def drop_redundant_result_fields(value: Any) -> Any:
if isinstance(value, dict):
redundant_keys = set()
Expand Down Expand Up @@ -422,7 +457,7 @@ def parse_session_file(
except OSError:
return None

return make_session_result(metadata, messages, stats, anonymizer=anonymizer)
return make_session_result(metadata, messages, stats)


def find_subagent_sessions(project_dir: Path) -> list[Path]:
Expand Down Expand Up @@ -490,7 +525,7 @@ def parse_subagent_session(
return None

metadata["session_id"] = resolve_subagent_session_id(session_dir, metadata["session_id"])
return make_session_result(metadata, messages, stats, anonymizer=anonymizer)
return make_session_result(metadata, messages, stats)


def resolve_subagent_session_id(session_dir: Path, session_id: str) -> str:
Expand Down
80 changes: 74 additions & 6 deletions dataclaw/parsers/codex.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import dataclasses
import logging
import posixpath
from collections.abc import Iterable
from pathlib import Path
from pathlib import Path, PurePosixPath, PureWindowsPath
from typing import Any

from .. import _json as json
Expand All @@ -28,6 +29,7 @@
CODEX_DIR = Path.home() / ".codex"
CODEX_SESSIONS_DIR = CODEX_DIR / "sessions"
CODEX_ARCHIVED_DIR = CODEX_DIR / "archived_sessions"
NON_ANON_STRING_KEYS = frozenset({"wall_time"})
UNKNOWN_CODEX_CWD = "<unknown-cwd>"

_PROJECT_INDEX: dict[str, list[Path]] = {}
Expand Down Expand Up @@ -174,6 +176,11 @@ def _build_codex_tool_result(payload: dict[str, Any]) -> dict[str, Any] | None:

if payload_type == "function_call_output":
raw = payload.get("output", "")
if isinstance(raw, list):
return {"output": _build_codex_structured_tool_output(raw), "status": "success"}
if not isinstance(raw, str):
return {"output": {"raw": raw}, "status": "success"}

out: dict[str, Any] = {}
lines = raw.splitlines()
output_lines: list[str] = []
Expand Down Expand Up @@ -215,6 +222,40 @@ def _build_codex_tool_result(payload: dict[str, Any]) -> dict[str, Any] | None:
return None


def _build_codex_structured_tool_output(parts: list[Any]) -> dict[str, Any]:
out: dict[str, Any] = {}
text_parts: list[str] = []
raw_parts: list[Any] = []

for part in parts:
if not isinstance(part, dict):
raw_parts.append(part)
continue

part_type = part.get("type")
if part_type in {"text", "output_text"}:
text = part.get("text")
if isinstance(text, str) and text.strip():
text_parts.append(text.strip())
raw_parts.append(part)
continue

if part_type == "input_image":
image_url = part.get("image_url")
if isinstance(image_url, str) and image_url:
image_part = _build_codex_image_part(image_url)
if image_part is not None:
raw_parts.append(image_part)
continue
raw_parts.append(part)

if text_parts:
out["text"] = "\n\n".join(text_parts)
if raw_parts:
out["raw"] = {"content": raw_parts}
return out


def parse_session_file(
filepath: Path,
anonymizer: Anonymizer,
Expand Down Expand Up @@ -283,7 +324,7 @@ def parse_session_file(
else:
state.metadata["model"] = "codex-unknown"

return make_session_result(state.metadata, state.messages, state.stats, anonymizer=anonymizer)
return make_session_result(state.metadata, state.messages, state.stats)


def handle_session_meta(
Expand Down Expand Up @@ -353,18 +394,45 @@ def _build_codex_image_part(image_url: str) -> dict[str, Any] | None:


def _build_codex_local_image_part(image_path: str, state: CodexParseState) -> dict[str, Any]:
path = Path(image_path)
if not path.is_absolute() and state.raw_cwd != UNKNOWN_CODEX_CWD:
path = Path(state.raw_cwd) / path
path = _resolve_codex_local_path(image_path, state.raw_cwd)
return {
"type": "image",
"source": {
"type": "url",
"url": f"file://{path}",
"url": _codex_file_url(path),
},
}


def _is_windows_absolute_path(path: str) -> bool:
return PureWindowsPath(path).is_absolute()


def _is_posix_absolute_path(path: str) -> bool:
return PurePosixPath(path).is_absolute()


def _resolve_codex_local_path(image_path: str, cwd: str) -> str:
    """Anchor a relative *image_path* against the session's *cwd*.

    Paths that are already absolute (in either POSIX or Windows flavour), or
    whose cwd is the unknown sentinel, are returned verbatim; otherwise the
    join follows the flavour of *cwd*.
    """
    already_absolute = _is_windows_absolute_path(image_path) or _is_posix_absolute_path(image_path)
    if already_absolute or cwd == UNKNOWN_CODEX_CWD:
        return image_path
    if _is_windows_absolute_path(cwd):
        return str(PureWindowsPath(cwd) / image_path)
    if _is_posix_absolute_path(cwd):
        # Normalise Windows separators so posixpath.join behaves.
        return posixpath.join(cwd, image_path.replace("\\", "/"))
    # Relative cwd: fall back to the host platform's join semantics.
    return str(Path(cwd) / image_path)


def _codex_file_url(path: str) -> str:
    """Render *path* as a ``file://`` URL, picking the flavour that matches it."""
    if _is_windows_absolute_path(path):
        return PureWindowsPath(path).as_uri()
    forward_slashed = path.replace("\\", "/")
    if _is_posix_absolute_path(path):
        return PurePosixPath(forward_slashed).as_uri()
    # Relative paths cannot form a standards-compliant file URI; keep the
    # original best-effort prefix form.
    return f"file://{forward_slashed}"


def _extract_response_user_content_parts(payload: dict[str, Any]) -> list[dict[str, Any]]:
content_parts: list[dict[str, Any]] = []
for part in payload.get("content", []):
Expand Down
Loading
Loading