Add trajectory capture to epochx-bench.

- core/runtime.py: the container setup script installs a bash DEBUG trap that
  appends every executed command to /.epochx/ssh_log.jsonl.
- runner.py: collect_task merges the agent-written trajectory.jsonl with that
  log (falling back to `git log` over SSH when both are empty) and stores the
  result in trajectory_collected.json; grade_task attaches it, together with
  the task output, to result.json.
- exporter.py / cli.py: forward both fields; the CLI caps output at 5000 chars.
- core/prompt_generator.py: tells the agent how to write trajectory.jsonl.

NOTE(review): leading whitespace of context lines was reconstructed; check it
against the tree before applying. The post-image blob ids on the runtime.py
and runner.py `index` lines predate the review fixes.

diff --git a/src/epochx/cli.py b/src/epochx/cli.py
index d4aa209..d9015d8 100644
--- a/src/epochx/cli.py
+++ b/src/epochx/cli.py
@@ -652,11 +652,17 @@ def bench_submit_run(
     task_results_list = []
     for task_id, result in task_results_data.items():
         external_id = task_id.split("/", 1)[1] if "/" in task_id else task_id
-        task_results_list.append({
+        item = {
             "task_id": external_id,
             "passed": result.get("passed", False),
             "score": result.get("score", 0.0),
-        })
+        }
+        # Attach trajectory and output if available
+        if result.get("trajectory"):
+            item["trajectory"] = result["trajectory"]
+        if result.get("output"):
+            item["output"] = result["output"][:5000]
+        task_results_list.append(item)
 
     payload = {
         "benchmark_name": stats.benchmark,
diff --git a/src/epochx/core/prompt_generator.py b/src/epochx/core/prompt_generator.py
index 846aad8..292ad7c 100644
--- a/src/epochx/core/prompt_generator.py
+++ b/src/epochx/core/prompt_generator.py
@@ -91,6 +91,19 @@ def generate_prompt(task: Task, ws_info: WorkspaceInfo) -> str:
     )
     sections.append("")
 
+    # Trajectory
+    sections.append("## Trajectory (optional)\n")
+    sections.append(
+        "Your SSH commands are logged automatically. "
+        "For richer trajectory data (reasoning, tool choices), "
+        "append JSONL to `/.epochx/trajectory.jsonl` inside the container:\n"
+    )
+    sections.append("```json")
+    sections.append('{"step":1,"type":"thought","content":"analyzing the issue..."}')
+    sections.append('{"step":2,"type":"tool_call","tool_name":"grep","tool_input":"grep -r pattern .","tool_output":"..."}')
+    sections.append("```")
+    sections.append("")
+
     # When Done
     sections.append("## When Done\n")
     sections.append(
diff --git a/src/epochx/core/runtime.py b/src/epochx/core/runtime.py
index 33ebe5e..fbf8599 100644
--- a/src/epochx/core/runtime.py
+++ b/src/epochx/core/runtime.py
@@ -175,6 +175,30 @@ def setup(self, task_id: str, workspace_spec: WorkspaceSpec) -> WorkspaceInfo:
 echo "{pubkey}" >> /root/.ssh/authorized_keys
 chmod 600 /root/.ssh/authorized_keys
 /usr/sbin/sshd 2>/dev/null || true
+
+# ── Trajectory: auto-log ALL bash commands (interactive + non-interactive) ──
+# Uses DEBUG trap which fires for every command in every bash session,
+# including non-interactive 'ssh host "cmd"' invocations by agents.
+cat > /etc/bash.epochx_log << 'LOGEOF'
+_epochx_trap() {{
+    local cmd="$BASH_COMMAND"
+    case "$cmd" in _epochx_trap*|true|false|"") return;; esac
+    [ -d "/.epochx" ] && printf '{{"ts":"%s","cmd":"%s"}}\\n' \
+        "$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)" \
+        "$(printf '%s' "$cmd" | head -c 2000 | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g' | tr '\\001-\\037' ' ')" \
+        >> /.epochx/ssh_log.jsonl 2>/dev/null
+}}
+trap '_epochx_trap' DEBUG
+LOGEOF
+# Inject into ALL bash startup paths so non-interactive SSH also picks it up
+for f in /etc/bash.bashrc /root/.bashrc; do
+    grep -q 'epochx_log' "$f" 2>/dev/null || echo '. /etc/bash.epochx_log' >> "$f" 2>/dev/null
+done
+# Set BASH_ENV in sshd so non-interactive 'ssh host "cmd"' also sources it
+# This is the critical line — without it, non-interactive SSH won't log commands.
+grep -q 'BASH_ENV=/etc/bash.epochx_log' /etc/ssh/sshd_config 2>/dev/null || echo 'SetEnv BASH_ENV=/etc/bash.epochx_log' >> /etc/ssh/sshd_config 2>/dev/null || true
+# Restart sshd to pick up the new config
+pkill sshd 2>/dev/null; /usr/sbin/sshd 2>/dev/null || true
 """
         container.exec_run(["bash", "-c", setup_script])
 
diff --git a/src/epochx/exporter.py b/src/epochx/exporter.py
index fb3c473..9bf0945 100644
--- a/src/epochx/exporter.py
+++ b/src/epochx/exporter.py
@@ -96,13 +96,18 @@ def get_task_results(self, benchmark: str | None = None) -> list[dict]:
         results = self.state.get_results(benchmark=benchmark)
         out = []
         for task_id, r in sorted(results.items()):
-            out.append({
+            item = {
                 "task_id": task_id,
                 "benchmark": r.get("benchmark", ""),
                 "passed": r.get("passed", False),
                 "score": r.get("score", 0.0),
                 "details": r.get("details", {}),
-            })
+            }
+            if r.get("trajectory"):
+                item["trajectory"] = r["trajectory"]
+            if r.get("output"):
+                item["output"] = r["output"]
+            out.append(item)
         return out
 
 
diff --git a/src/epochx/runner.py b/src/epochx/runner.py
index d94cfee..20dd897 100644
--- a/src/epochx/runner.py
+++ b/src/epochx/runner.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 import json
+import shlex
+import subprocess
 from dataclasses import asdict
 from datetime import datetime, timezone
 from pathlib import Path
@@ -154,15 +156,25 @@ def collect_task(self, task_id: str) -> dict:
         output = adapter.collect_output(env.workspace, task, env=env)
 
         # Write output to .epochx/output.txt
-        output_path = Path(env.workspace) / ".epochx" / "output.txt"
-        output_path.parent.mkdir(parents=True, exist_ok=True)
+        epochx_dir = Path(env.workspace) / ".epochx"
+        epochx_dir.mkdir(parents=True, exist_ok=True)
+        output_path = epochx_dir / "output.txt"
         output_path.write_text(output)
 
+        # Collect trajectory
+        trajectory = self._collect_trajectory(env)
+        if trajectory:
+            # ensure_ascii=False emits raw non-ASCII, so pin the file encoding
+            (epochx_dir / "trajectory_collected.json").write_text(
+                json.dumps(trajectory, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+
         # Update status
         self.state.update_status(task_id, TaskStatus.COLLECTING.value)
 
         truncated = output[:500] + ("..." if len(output) > 500 else "")
-        return {
+        result = {
             "status": "collected",
             "task_id": task_id,
             "output_type": task.output_spec.type.value,
@@ -170,6 +182,9 @@ def collect_task(self, task_id: str) -> dict:
             "saved_to": str(output_path),
             "next_command": f"epochx-bench grade {task_id}",
         }
+        if trajectory:
+            result["trajectory_steps"] = len(trajectory)
+        return result
 
     # ------------------------------------------------------------------
     # grade_task
@@ -202,6 +217,21 @@ def grade_task(self, task_id: str) -> dict:
         # Save result to task's own .epochx/result.json
         result_dict = asdict(result)
         result_dict["benchmark"] = env.benchmark
+
+        # Attach trajectory if collected
+        traj_path = Path(env.workspace) / ".epochx" / "trajectory_collected.json"
+        if traj_path.exists():
+            try:
+                result_dict["trajectory"] = json.loads(traj_path.read_text(encoding="utf-8"))
+            except (OSError, ValueError):
+                # Best-effort: an unreadable or corrupt file must not fail grading
+                pass
+
+        # Attach output
+        output_path_for_result = Path(env.workspace) / ".epochx" / "output.txt"
+        if output_path_for_result.exists():
+            result_dict["output"] = output_path_for_result.read_text()
+
         result_path = Path(env.workspace) / ".epochx" / "result.json"
         result_path.write_text(json.dumps(result_dict, indent=2))
 
@@ -290,6 +320,105 @@ def get_next_task(self, benchmark_name: str) -> dict:
             "start_command": f"epochx-bench run {benchmark_name} --task {next_task.external_id}",
         }
 
+    def _collect_trajectory(self, env: EnvironmentState) -> list[dict]:
+        """Collect the task's trajectory as a list of step dicts.
+
+        Sources:
+          1. .epochx/trajectory.jsonl: rich steps written by the agent
+          2. .epochx/ssh_log.jsonl: auto-logged shell commands, appended
+             after the agent steps
+          3. git log over SSH: fallback, used only when 1 and 2 are empty
+
+        Malformed lines and non-object JSON values are skipped. The git
+        fallback is best-effort and never raises.
+        """
+        epochx_dir = Path(env.workspace) / ".epochx"
+        trajectory: list[dict] = []
+
+        # Source 1: agent-written trajectory
+        trajectory.extend(self._read_jsonl(epochx_dir / "trajectory.jsonl"))
+
+        # Source 2: shell commands logged by the container's bash DEBUG trap
+        step = len(trajectory)
+        for entry in self._read_jsonl(epochx_dir / "ssh_log.jsonl"):
+            step += 1
+            trajectory.append({
+                "step": step,
+                "type": "tool_call",
+                "tool_name": "shell",
+                "tool_input": entry.get("cmd", ""),
+                # "output" may be absent or null: coerce before slicing
+                "tool_output": str(entry.get("output") or "")[:500],
+                "duration_ms": entry.get("ms"),
+                "content": f"[auto-logged] {entry.get('ts', '')}",
+                "source": "ssh_log",
+            })
+
+        # Source 3: git log fallback
+        if not trajectory and env.ssh_host:
+            # Quoted because it is interpolated into a remote shell command
+            workdir = shlex.quote(env.container_workdir or "/testbed")
+            try:
+                result = subprocess.run(
+                    ["ssh", env.ssh_host,
+                     f"cd {workdir} && git log --reverse --format='%H|%s' 2>/dev/null | tail -20"],
+                    capture_output=True, text=True, timeout=15,
+                )
+                step = 0
+                for line in result.stdout.strip().splitlines():
+                    # Split once so a "|" inside the subject is preserved
+                    commit, sep, subject = line.partition("|")
+                    if not sep:
+                        continue
+                    step += 1
+                    trajectory.append({
+                        "step": step,
+                        "type": "action",
+                        "tool_name": "git_commit",
+                        "content": subject,
+                        "tool_input": commit[:12],
+                        "source": "git_log",
+                    })
+                result2 = subprocess.run(
+                    ["ssh", env.ssh_host,
+                     f"cd {workdir} && git diff --stat HEAD~1 HEAD 2>/dev/null || true"],
+                    capture_output=True, text=True, timeout=15,
+                )
+                if result2.stdout.strip() and trajectory:
+                    trajectory.append({
+                        "step": step + 1,
+                        "type": "observation",
+                        "content": f"Changes: {result2.stdout.strip()}",
+                        "source": "git_log",
+                    })
+            except (OSError, ValueError, subprocess.SubprocessError):
+                # Best-effort: ssh missing, timeout, or undecodable output
+                pass
+
+        return trajectory
+
+    @staticmethod
+    def _read_jsonl(path: Path) -> list[dict]:
+        """Return the JSON objects stored one per line in *path*.
+
+        A missing file yields []. Blank lines, malformed JSON and non-object
+        values are skipped. Invalid UTF-8 is replaced rather than raised,
+        because the command logger truncates with `head -c`, which can cut
+        a multi-byte character in half.
+        """
+        if not path.exists():
+            return []
+        entries: list[dict] = []
+        text = path.read_text(encoding="utf-8", errors="replace")
+        for line in text.splitlines():
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(entry, dict):
+                entries.append(entry)
+        return entries
+
     @staticmethod
     def _to_external_id(task_id: str, benchmark: str) -> str:
         """Strip benchmark prefix from full task ID to get external_id.