Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changes/319.fix
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
HTML report now correctly displays timed-out (superseded) phases. Sessions where a phase
was interrupted by a timeout and restarted at a higher generation (timeout-resume pattern)
now show a synthesized ``superseded`` phase entry in the timeline with a distinct status
dot. The phase timeline is also sorted correctly: post-superseded gen-1 phases (verify,
review, submit) appear after the replacement generation rather than before it.
75 changes: 75 additions & 0 deletions src/raki/adapters/session_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,83 @@ def _load_phases(
phases: list[PhaseResult] = []
for phase_name in PHASE_NAMES:
phases.extend(self._load_phase_files(source, phase_name, meta_raw, events))
phases.extend(self._synthesize_superseded_phases(phases, events))
return phases

@staticmethod
def _synthesize_superseded_phases(
    phases: list[PhaseResult],
    events: list[SessionEvent],
) -> list[PhaseResult]:
    """Detect and synthesize superseded PhaseResult entries from the event log.

    A phase at generation G is considered 'superseded' when ALL of these
    hold:

    1. A ``phase_started`` event exists for ``(phase=P, gen=G)``.
    2. The next ``phase_started`` event for P carries a strictly higher
       generation ``G' > G``, i.e. the same phase restarted at a higher
       generation.
    3. No ``phase_completed`` or ``phase_failed`` event exists for phase P
       *between* the two consecutive ``phase_started`` events.  This
       distinguishes a clean handoff (gen-1 completed, gen-2 is a rework
       cycle) from an interrupted handoff (gen-1 timed out, gen-2 resumed).
    4. No :class:`PhaseResult` for ``(phase=P, gen=G)`` is present in
       ``phases`` and none has been synthesized earlier in this call, so
       each ``(phase, generation)`` pair is emitted at most once.

    A ``phase_started`` event without a ``generation`` entry in its data is
    treated as generation 1; one whose ``generation`` is not an ``int`` is
    ignored.  Events whose ``phase`` is ``None`` are ignored.

    ``phases`` is the list of results already loaded and is not modified.
    ``events`` is the session event stream in chronological order.

    Returns the synthesized entries only, grouped by phase in order of first
    appearance in ``events``.  Each carries ``status="superseded"``, an
    empty ``output``, and no cost/duration/token metadata (none was
    captured).
    """
    # (phase name, generation) pairs that must not be emitted: everything
    # already loaded, plus whatever this call synthesizes along the way.
    known_pairs: set[tuple[str, int]] = {(phase.name, phase.generation) for phase in phases}

    # Bucket events per phase in a single pass.  Dict insertion order keeps
    # phases in first-appearance order and each bucket in stream order.
    events_by_phase: dict[str, list[SessionEvent]] = {}
    for event in events:
        if event.phase is not None:
            events_by_phase.setdefault(event.phase, []).append(event)

    synthesized: list[PhaseResult] = []
    for phase_name, phase_events in events_by_phase.items():
        # Generation of the most recent valid start, and whether a
        # completion/failure has been seen since that start.
        open_gen: int | None = None
        closed = False

        for event in phase_events:
            if event.kind in ("phase_completed", "phase_failed"):
                closed = True
                continue
            if event.kind != "phase_started":
                continue

            generation = event.data.get("generation", 1)
            if not isinstance(generation, int):
                # Malformed generation: do not treat this event as a start.
                continue

            # A restart at a higher generation with nothing finishing in
            # between means the previous attempt was interrupted.
            if (
                open_gen is not None
                and not closed
                and generation > open_gen
                and (phase_name, open_gen) not in known_pairs
            ):
                synthesized.append(
                    PhaseResult(
                        name=phase_name,
                        generation=open_gen,
                        status="superseded",
                        output="",
                    )
                )
                # Remember the pair so repeated start events cannot
                # produce a duplicate entry.
                known_pairs.add((phase_name, open_gen))

            open_gen = generation
            closed = False

    return synthesized

def _load_phase_files(
self,
source: Path,
Expand Down
2 changes: 1 addition & 1 deletion src/raki/model/phases.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class ToolCall(BaseModel):
class PhaseResult(BaseModel):
name: str
generation: int
status: Literal["completed", "failed", "skipped"]
status: Literal["completed", "failed", "skipped", "superseded"]
cost_usd: float | None = None
duration_ms: int | None = None
tokens_in: int | None = None
Expand Down
44 changes: 41 additions & 3 deletions src/raki/report/html_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,14 @@ def sort_phases(
pipeline sequence). If ``pipeline_phases`` contains names not in
``PHASE_ORDER`` (e.g. Alcove bridge steps), those are appended after
the known phases.

**Superseded-phase grouping** (timeout-resume pattern): when a phase at
generation 1 is marked ``status="superseded"``, all non-superseded
generation-1 phases that come *after* the superseded phase in
``PHASE_ORDER`` are bumped to ``max_non_superseded_generation + 1`` for
sort purposes. This preserves the chronological truth: verify/review/submit
at gen-1 actually executed *after* the replacement implement(gen-2), not
before it.
"""
order_index: dict[str, int] = {name: idx for idx, name in enumerate(PHASE_ORDER)}
if pipeline_phases:
Expand All @@ -235,11 +243,36 @@ def sort_phases(
order_index[name] = next_idx
next_idx += 1
fallback = max(order_index.values(), default=0) + 1
return sorted(
phases,
key=lambda phase: (phase.generation, order_index.get(phase.name, fallback)),

# Detect the superseded-phase boundary: the highest PHASE_ORDER index of
# any superseded phase. -1 when there are no superseded phases.
superseded_boundary: int = -1
for phase in phases:
if phase.status == "superseded":
phase_idx = order_index.get(phase.name, fallback)
if phase_idx > superseded_boundary:
superseded_boundary = phase_idx

# The highest generation seen in non-superseded phases (used to compute
# the effective generation for post-superseded gen-1 phases).
max_non_superseded_gen: int = max(
(phase.generation for phase in phases if phase.status != "superseded"),
default=1,
)

def _sort_key(phase: PhaseResult) -> tuple[int, int]:
phase_idx = order_index.get(phase.name, fallback)
if phase.status == "superseded":
# Superseded phases sort within their own generation.
return (phase.generation, phase_idx)
if phase.generation == 1 and superseded_boundary >= 0 and phase_idx > superseded_boundary:
# Gen-1 phases that come after a superseded phase in the pipeline
# actually ran *after* the replacement generation — sort them last.
return (max_non_superseded_gen + 1, phase_idx)
return (phase.generation, phase_idx)

return sorted(phases, key=_sort_key)


def _get_metric_meta(name: str) -> dict[str, str | bool]:
"""Look up metadata for a metric, falling back to sensible defaults."""
Expand Down Expand Up @@ -277,6 +310,11 @@ def determine_verdict(sample: EvalSample) -> Literal["pass", "rework", "fail"]:
"""Determine the verdict for a session sample.

Logic: failed phase -> fail, rework_cycles > 0 -> rework, else pass.

Note: ``status="superseded"`` phases (ticket #319 — timeout-resume pattern)
are intentionally NOT treated as failures. A superseded phase is an
interrupted-but-replaced attempt; only ``status="failed"`` triggers the
fail verdict.
"""
for phase in sample.phases:
if phase.status == "failed":
Expand Down
9 changes: 5 additions & 4 deletions src/raki/report/templates/report.html.j2
Original file line number Diff line number Diff line change
Expand Up @@ -641,10 +641,11 @@
border-radius: 50%;
}

.phase-status-completed { background: var(--green); }
.phase-status-rework { background: var(--yellow); }
.phase-status-failed { background: var(--red); }
.phase-status-skipped { background: var(--text-muted); }
.phase-status-completed { background: var(--green); }
.phase-status-rework { background: var(--yellow); }
.phase-status-failed { background: var(--red); }
.phase-status-skipped { background: var(--text-muted); }
.phase-status-superseded { background: var(--text-muted); opacity: 0.5; border: 1px solid var(--yellow); }

.phase-duration {
color: var(--text-muted);
Expand Down
14 changes: 13 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def make_sample(
patch_cycles: int = 0,
cost: float = 10.0,
verify_gen: int = 1,
verify_status: Literal["completed", "failed", "skipped"] = "completed",
verify_status: Literal["completed", "failed", "skipped", "superseded"] = "completed",
findings: list[ReviewFinding] | None = None,
duration_ms: int | None = None,
tokens_in: int | None = None,
Expand Down Expand Up @@ -151,6 +151,18 @@ def soda_session_dir(fixtures_dir: Path) -> Path:
return fixtures_dir / "soda-session"


@pytest.fixture
def timeout_resume_dir(sessions_dir: Path) -> Path:
    """Locate the ``timeout-resume`` session fixture directory.

    The session stored there records an implement gen-1 start that is
    followed directly by an implement gen-2 start, with no completion event
    in between and a single implement output file.  It is the reference
    input for the 'superseded' phase status and for the adapter's
    _synthesize_superseded_phases logic.
    """
    fixture_name = "timeout-resume"
    return sessions_dir.joinpath(fixture_name)


@pytest.fixture
def manifest_with_session(tmp_path: Path, pass_simple_dir: Path) -> tuple[Path, Path]:
"""Create a tmp_path with a manifest and a copied pass-simple session.
Expand Down
13 changes: 13 additions & 0 deletions tests/fixtures/sessions/timeout-resume/events.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{"timestamp":"2026-05-01T10:00:00Z","phase":"triage","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:00:25Z","phase":"triage","kind":"phase_completed","data":{"cost":0.30,"duration_ms":25000}}
{"timestamp":"2026-05-01T10:00:25Z","phase":"plan","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:01:25Z","phase":"plan","kind":"phase_completed","data":{"cost":0.80,"duration_ms":60000}}
{"timestamp":"2026-05-01T10:01:25Z","phase":"implement","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:05:25Z","phase":"implement","kind":"phase_started","data":{"generation":2}}
{"timestamp":"2026-05-01T10:09:25Z","phase":"implement","kind":"phase_completed","data":{"cost":3.20,"duration_ms":240000}}
{"timestamp":"2026-05-01T10:09:25Z","phase":"verify","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:10:55Z","phase":"verify","kind":"phase_completed","data":{"cost":1.10,"duration_ms":90000,"summary":"PASS"}}
{"timestamp":"2026-05-01T10:10:55Z","phase":"review","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:12:45Z","phase":"review","kind":"phase_completed","data":{"cost":1.50,"duration_ms":110000}}
{"timestamp":"2026-05-01T10:12:45Z","phase":"submit","kind":"phase_started","data":{"generation":1}}
{"timestamp":"2026-05-01T10:13:15Z","phase":"submit","kind":"phase_completed","data":{"cost":0.60,"duration_ms":30000}}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/implement.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "branch": "soda/timeout-01", "commits": [{"hash": "abc1234", "message": "feat: add retry logic for transient HTTP errors [#timeout-01]", "task_id": "T1"}], "files_changed": [{"path": "src/http_client.py", "action": "modified"}, {"path": "tests/test_http_client.py", "action": "created"}], "task_results": [{"task_id": "T1", "status": "completed"}, {"task_id": "T2", "status": "completed"}], "tests_passed": true, "test_output": "12 passed in 0.08s", "deviations": []}
16 changes: 16 additions & 0 deletions tests/fixtures/sessions/timeout-resume/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"ticket": "timeout-01",
"summary": "Fix: add retry logic for transient HTTP errors",
"branch": "soda/timeout-01",
"started_at": "2026-05-01T10:00:00Z",
"total_cost": 8.5,
"rework_cycles": 1,
"phases": {
"triage": {"status": "completed", "cost": 0.30, "duration_ms": 25000, "generation": 1},
"plan": {"status": "completed", "cost": 0.80, "duration_ms": 60000, "generation": 1},
"implement": {"status": "completed", "cost": 3.20, "duration_ms": 240000, "generation": 2},
"verify": {"status": "completed", "cost": 1.10, "duration_ms": 90000, "generation": 1},
"review": {"status": "completed", "cost": 1.50, "duration_ms": 110000, "generation": 1},
"submit": {"status": "completed", "cost": 0.60, "duration_ms": 30000, "generation": 1}
}
}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/plan.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "approach": "implement retry logic using tenacity library", "tasks": [{"task_id": "T1", "description": "Add retry decorator to HTTP client", "files": ["src/http_client.py"]}, {"task_id": "T2", "description": "Add unit tests", "files": ["tests/test_http_client.py"]}]}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/review.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "verdict": "approve", "findings": []}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/submit.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "branch": "soda/timeout-01", "target": "main", "pr_url": "https://github.com/example/repo/pull/42", "title": "feat: add retry logic for transient HTTP errors"}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/triage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "approach": "add retry decorator with exponential backoff", "complexity": "medium", "code_area": "network/http_client.py", "files": ["src/http_client.py", "tests/test_http_client.py"], "risks": ["regression in retry timing"]}
1 change: 1 addition & 0 deletions tests/fixtures/sessions/timeout-resume/verify.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"ticket_key": "timeout-01", "verdict": "PASS", "command_results": [{"command": "python -m pytest tests/", "exit_code": 0, "passed": true}], "criteria_results": [{"criterion": "retry on transient HTTP 503", "passed": true, "evidence": "test_retry_on_503 passes"}]}
69 changes: 67 additions & 2 deletions tests/test_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -1279,8 +1279,8 @@ def test_load_directory_with_valid_adapter_name(self, sessions_dir):
registry.register(SessionSchemaAdapter())
loader = DatasetLoader(registry)
dataset = loader.load_directory(sessions_dir, adapter_name="session-schema")
# pass-simple and rework-cycle are valid session dirs under sessions/
assert len(dataset.samples) == 2
# pass-simple, rework-cycle, and timeout-resume are valid session dirs
assert len(dataset.samples) == 3

def test_load_directory_with_invalid_adapter_name(self, sessions_dir):
registry = AdapterRegistry()
Expand Down Expand Up @@ -3565,3 +3565,68 @@ def test_session_meta_adapter_format_defaults_to_empty_string():
rework_cycles=0,
)
assert meta.adapter_format == ""


# --- Ticket #319: superseded phase synthesis ---


# NOTE(review): none of the ticket #319 tests visible below reference this
# constant; they all receive the directory through the ``timeout_resume_dir``
# fixture instead.  Confirm that this path is correct and still needed.
TIMEOUT_RESUME_FIXTURE = FIXTURES / "timeout-resume"


def test_superseded_phase_synthesized_for_timeout_resume(timeout_resume_dir: Path):
    """The timed-out gen-1 implement attempt surfaces as exactly one superseded phase."""
    loaded = SessionSchemaAdapter().load(timeout_resume_dir)
    flagged = [entry for entry in loaded.phases if entry.status == "superseded"]
    # Exactly one superseded entry is expected.
    assert len(flagged) == 1
    only = flagged[0]
    assert only.name == "implement"
    assert only.generation == 1


def test_superseded_phase_does_not_duplicate_existing_phase(timeout_resume_dir: Path):
    """Only the missing generation is synthesized; the loaded gen-2 implement stays completed."""
    loaded = SessionSchemaAdapter().load(timeout_resume_dir)
    implement_runs = [entry for entry in loaded.phases if entry.name == "implement"]
    # Two entries in total: gen-1 (synthesized, superseded) and gen-2 (loaded, completed).
    assert len(implement_runs) == 2
    status_by_generation = {}
    for entry in implement_runs:
        status_by_generation[entry.generation] = entry.status
    assert status_by_generation[1] == "superseded"
    assert status_by_generation[2] == "completed"


def test_no_spurious_superseded_on_normal_rework(rework_cycle_dir: Path):
    """A regular rework-cycle session yields no superseded phases at all."""
    loaded = SessionSchemaAdapter().load(rework_cycle_dir)
    superseded = list(filter(lambda entry: entry.status == "superseded", loaded.phases))
    assert superseded == [], f"Expected no superseded phases, got {superseded}"


def test_no_spurious_superseded_on_soda_session(soda_session_dir: Path):
    """The SODA rework session yields no superseded phases at all."""
    loaded = SessionSchemaAdapter().load(soda_session_dir)
    superseded = list(filter(lambda entry: entry.status == "superseded", loaded.phases))
    assert superseded == [], f"Expected no superseded phases, got {superseded}"


def test_superseded_phase_has_empty_output(timeout_resume_dir: Path):
    """The single synthesized superseded phase carries an empty output string."""
    loaded = SessionSchemaAdapter().load(timeout_resume_dir)
    flagged = [entry for entry in loaded.phases if entry.status == "superseded"]
    assert len(flagged) == 1
    only = flagged[0]
    assert only.output == ""


def test_superseded_synthesis_respects_completion_event(soda_session_dir: Path):
    """No implement phase in the soda-session fixture is reported as superseded."""
    # Per the fixture notes, soda-session records phase_completed for implement
    # gen-1 in events.jsonl and stores its output as implement.json.1, so the
    # synthesis step has nothing to add for the implement phase.
    loaded = SessionSchemaAdapter().load(soda_session_dir)
    superseded_implement = [
        entry
        for entry in loaded.phases
        if entry.name == "implement" and entry.status == "superseded"
    ]
    assert superseded_implement == []
Loading
Loading