Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions src/epochx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,11 +652,17 @@ def bench_submit_run(
task_results_list = []
for task_id, result in task_results_data.items():
external_id = task_id.split("/", 1)[1] if "/" in task_id else task_id
task_results_list.append({
item = {
"task_id": external_id,
"passed": result.get("passed", False),
"score": result.get("score", 0.0),
})
}
# Attach trajectory and output if available
if result.get("trajectory"):
item["trajectory"] = result["trajectory"]
if result.get("output"):
item["output"] = result["output"][:5000]
task_results_list.append(item)

payload = {
"benchmark_name": stats.benchmark,
Expand Down
13 changes: 13 additions & 0 deletions src/epochx/core/prompt_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,19 @@ def generate_prompt(task: Task, ws_info: WorkspaceInfo) -> str:
)
sections.append("")

# Trajectory
sections.append("## Trajectory (optional)\n")
sections.append(
"Your SSH commands are logged automatically. "
"For richer trajectory data (reasoning, tool choices), "
"append JSONL to `/.epochx/trajectory.jsonl` inside the container:\n"
)
sections.append("```json")
sections.append('{"step":1,"type":"thought","content":"analyzing the issue..."}')
sections.append('{"step":2,"type":"tool_call","tool_name":"grep","tool_input":"grep -r pattern .","tool_output":"..."}')
sections.append("```")
sections.append("")

# When Done
sections.append("## When Done\n")
sections.append(
Expand Down
24 changes: 24 additions & 0 deletions src/epochx/core/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,30 @@ def setup(self, task_id: str, workspace_spec: WorkspaceSpec) -> WorkspaceInfo:
echo "{pubkey}" >> /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys
/usr/sbin/sshd 2>/dev/null || true

# ── Trajectory: auto-log ALL bash commands (interactive + non-interactive) ──
# Uses DEBUG trap which fires for every command in every bash session,
# including non-interactive 'ssh host "cmd"' invocations by agents.
cat > /etc/bash.epochx_log << 'LOGEOF'
_epochx_trap() {{
local cmd="$BASH_COMMAND"
case "$cmd" in _epochx_trap*|true|false|"") return;; esac
[ -d "/.epochx" ] && printf '{{"ts":"%s","cmd":"%s"}}\\n' \
"$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)" \
"$(echo "$cmd" | head -c 2000 | sed 's/\\\\/\\\\\\\\/g; s/"/\\\\"/g' | tr '\\n' ' ')" \
>> /.epochx/ssh_log.jsonl 2>/dev/null
}}
trap '_epochx_trap' DEBUG
LOGEOF
# Inject into ALL bash startup paths so non-interactive SSH also picks it up
for f in /etc/bash.bashrc /root/.bashrc; do
grep -q 'epochx_log' "$f" 2>/dev/null || echo '. /etc/bash.epochx_log' >> "$f" 2>/dev/null
done
# Set BASH_ENV in sshd so non-interactive 'ssh host "cmd"' also sources it
# This is the critical line — without it, non-interactive SSH won't log commands.
echo 'SetEnv BASH_ENV=/etc/bash.epochx_log' >> /etc/ssh/sshd_config 2>/dev/null || true
# Restart sshd to pick up the new config
pkill sshd 2>/dev/null; /usr/sbin/sshd 2>/dev/null || true
"""
container.exec_run(["bash", "-c", setup_script])

Expand Down
9 changes: 7 additions & 2 deletions src/epochx/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,18 @@ def get_task_results(self, benchmark: str | None = None) -> list[dict]:
results = self.state.get_results(benchmark=benchmark)
out = []
for task_id, r in sorted(results.items()):
out.append({
item = {
"task_id": task_id,
"benchmark": r.get("benchmark", ""),
"passed": r.get("passed", False),
"score": r.get("score", 0.0),
"details": r.get("details", {}),
})
}
if r.get("trajectory"):
item["trajectory"] = r["trajectory"]
if r.get("output"):
item["output"] = r["output"]
out.append(item)
return out


Expand Down
119 changes: 116 additions & 3 deletions src/epochx/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import json
import subprocess
from dataclasses import asdict
from datetime import datetime, timezone
from pathlib import Path
Expand Down Expand Up @@ -154,22 +155,33 @@ def collect_task(self, task_id: str) -> dict:
output = adapter.collect_output(env.workspace, task, env=env)

# Write output to .epochx/output.txt
output_path = Path(env.workspace) / ".epochx" / "output.txt"
output_path.parent.mkdir(parents=True, exist_ok=True)
epochx_dir = Path(env.workspace) / ".epochx"
epochx_dir.mkdir(parents=True, exist_ok=True)
output_path = epochx_dir / "output.txt"
output_path.write_text(output)

# Collect trajectory
trajectory = self._collect_trajectory(env)
if trajectory:
(epochx_dir / "trajectory_collected.json").write_text(
json.dumps(trajectory, ensure_ascii=False, indent=2)
)

# Update status
self.state.update_status(task_id, TaskStatus.COLLECTING.value)

truncated = output[:500] + ("..." if len(output) > 500 else "")
return {
result = {
"status": "collected",
"task_id": task_id,
"output_type": task.output_spec.type.value,
"content": truncated,
"saved_to": str(output_path),
"next_command": f"epochx-bench grade {task_id}",
}
if trajectory:
result["trajectory_steps"] = len(trajectory)
return result

# ------------------------------------------------------------------
# grade_task
Expand Down Expand Up @@ -202,6 +214,20 @@ def grade_task(self, task_id: str) -> dict:
# Save result to task's own .epochx/result.json
result_dict = asdict(result)
result_dict["benchmark"] = env.benchmark

# Attach trajectory if collected
traj_path = Path(env.workspace) / ".epochx" / "trajectory_collected.json"
if traj_path.exists():
try:
result_dict["trajectory"] = json.loads(traj_path.read_text())
except Exception:
pass

# Attach output
output_path_for_result = Path(env.workspace) / ".epochx" / "output.txt"
if output_path_for_result.exists():
result_dict["output"] = output_path_for_result.read_text()

result_path = Path(env.workspace) / ".epochx" / "result.json"
result_path.write_text(json.dumps(result_dict, indent=2))

Expand Down Expand Up @@ -290,6 +316,93 @@ def get_next_task(self, benchmark_name: str) -> dict:
"start_command": f"epochx-bench run {benchmark_name} --task {next_task.external_id}",
}

def _collect_trajectory(self, env: EnvironmentState) -> list[dict]:
    """Collect trajectory steps for a task, merged into one list.

    Sources, in the order they are consulted:

    1. ``<workspace>/.epochx/trajectory.jsonl`` -- rich steps written by
       the agent; every JSON object line is kept as-is.
    2. ``<workspace>/.epochx/ssh_log.jsonl`` -- auto-recorded shell
       commands; each entry becomes a ``tool_call`` step numbered after
       the agent-written steps.
       NOTE(review): presumably this file is produced by the in-container
       bash logger and ``/.epochx`` maps to the workspace -- confirm.
    3. ``git log`` over SSH -- fallback, used only when both files yielded
       nothing and ``env.ssh_host`` is set.

    Malformed lines (invalid JSON, or JSON that is not an object) are
    skipped so that one bad line cannot abort collection.

    Args:
        env: Environment state; ``workspace``, ``ssh_host`` and
            ``container_workdir`` are read.

    Returns:
        The merged list of step dicts; empty when no source has data.
    """
    import shlex  # local import: only the SSH fallback needs it

    def _read_jsonl_objects(path):
        """Return the JSON objects found in *path*, one per line."""
        if not path.exists():
            return []
        objects = []
        # JSON is UTF-8 by specification; undecodable bytes are replaced
        # rather than raised so a corrupt log cannot break collection.
        text = path.read_text(encoding="utf-8", errors="replace")
        for raw_line in text.splitlines():
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                parsed = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, ...) cannot
            # be a step and would break ``.get`` lookups downstream.
            if isinstance(parsed, dict):
                objects.append(parsed)
        return objects

    epochx_dir = Path(env.workspace) / ".epochx"

    # Source 1: agent-written trajectory, kept verbatim.
    trajectory: list[dict] = _read_jsonl_objects(epochx_dir / "trajectory.jsonl")

    # Source 2: auto-recorded shell command log.
    ssh_entries = _read_jsonl_objects(epochx_dir / "ssh_log.jsonl")
    for step, entry in enumerate(ssh_entries, start=len(trajectory) + 1):
        raw_output = entry.get("output")
        if raw_output is None:
            output = ""
        elif isinstance(raw_output, str):
            output = raw_output
        else:
            output = str(raw_output)
        trajectory.append({
            "step": step,
            "type": "tool_call",
            "tool_name": "shell",
            "tool_input": entry.get("cmd", ""),
            "tool_output": output[:500],
            "duration_ms": entry.get("ms"),
            "content": f"[auto-logged] {entry.get('ts', '')}",
            "source": "ssh_log",
        })

    # Source 3: git log fallback, only when nothing else was recorded.
    if trajectory or not env.ssh_host:
        return trajectory

    # The command string is interpreted by the remote shell, so the
    # working directory must be quoted.
    workdir = shlex.quote(env.container_workdir or "/testbed")
    try:
        log_proc = subprocess.run(
            ["ssh", env.ssh_host,
             f"cd {workdir} && git log --oneline --reverse --format='%H|%s|%ai' 2>/dev/null | tail -20"],
            capture_output=True, text=True, timeout=15,
        )
        step = 0
        for line in log_proc.stdout.strip().splitlines():
            commit_hash, sep, rest = line.partition("|")
            if not sep:
                continue
            # The subject may itself contain '|'; the date is always the
            # last field, so cut at the final separator.
            subject = rest.rpartition("|")[0] if "|" in rest else rest
            step += 1
            trajectory.append({
                "step": step,
                "type": "action",
                "tool_name": "git_commit",
                "content": subject,
                "tool_input": commit_hash[:12],
                "source": "git_log",
            })
        diff_proc = subprocess.run(
            ["ssh", env.ssh_host,
             f"cd {workdir} && git diff --stat HEAD~1 HEAD 2>/dev/null || true"],
            capture_output=True, text=True, timeout=15,
        )
        diff_stat = diff_proc.stdout.strip()
        if diff_stat and trajectory:
            trajectory.append({
                "step": step + 1,
                "type": "observation",
                "content": f"Changes: {diff_stat}",
                "source": "git_log",
            })
    except (OSError, subprocess.SubprocessError):
        # Best-effort fallback: a missing ssh binary, a timeout, or a
        # failed spawn must not fail collection. Steps gathered before
        # the failure are still returned.
        pass

    return trajectory

@staticmethod
def _to_external_id(task_id: str, benchmark: str) -> str:
"""Strip benchmark prefix from full task ID to get external_id.
Expand Down