Skip to content

Commit a4bbdcd

Browse files
committed
fix(tests): historical per-phase verdict display
Cells now show each phase's as-graded result. The two reworded flag plans inherit the records captured under their previous wording.
1 parent daf4bcb commit a4bbdcd

4 files changed

Lines changed: 793 additions & 9 deletions

File tree

app/tests/[testId]/TestDetailClient.tsx

Lines changed: 39 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,14 @@ export function TestDetailClient({
157157
const lb: LeaderboardJson = await fetch(`${CDN_BASE}/leaderboard.json`).then((r) =>
158158
r.json(),
159159
);
160+
// Real per-phase × per-plan verdict history, reassembled from each
161+
// scoring round's published score snapshots. Optional — when absent
162+
// the matrix falls back to the authoring-phase verdict plus explicit "not retained" gaps.
163+
const phaseHistory: {
164+
agents?: Record<string, Record<string, string>>;
165+
} | null = await fetch(`${CDN_BASE}/phase-history.json`)
166+
.then((r) => (r.ok ? r.json() : null))
167+
.catch(() => null);
160168
const slugs = lb.rankings.map((r) => r.agent_slug);
161169
const list: AgentVerdict[] = await Promise.all(
162170
slugs.map(async (slug) => {
@@ -200,6 +208,23 @@ export function TestDetailClient({
200208
const ownPhaseRun = runs.find((r) => runPhase(r) === planPhase);
201209
const ownVerdict = ownPhaseRun?.per_test_verdicts.find(matchVerdict)
202210
?.verdict as Verdict;
211+
// Per-phase history row for this (agent, plan) — a 10-char code
212+
// string (p/f/b/i, '.' = no record), keyed by full plan name with
213+
// the same prefix tolerance as verdict joins.
214+
const historyByName = phaseHistory?.agents?.[slug] ?? null;
215+
const historyRow = historyByName
216+
? (planName && historyByName[planName]) ??
217+
Object.entries(historyByName).find(([k]) =>
218+
planNameMatches(k, planName ?? ''),
219+
)?.[1] ??
220+
null
221+
: null;
222+
const HIST_CODE: Record<string, Verdict> = {
223+
p: 'passed',
224+
f: 'failed',
225+
b: 'blocked',
226+
i: 'inconclusive',
227+
} as Record<string, Verdict>;
203228
const columnCount = Math.max(TOTAL_PHASES, latestScoredPhase);
204229
const phase_history = Array.from({ length: columnCount }, (_, k) => {
205230
const ph = k + 1;
@@ -211,11 +236,18 @@ export function TestDetailClient({
211236
return { phase: ph, phase_label, verdict: undefined as Verdict, pending: false, not_applicable: true };
212237
}
213238
if (ph <= latestScoredPhase) {
214-
if (ph === planPhase) {
239+
// Real as-of-phase verdict from the score snapshots, when
240+
// recorded for this (agent, plan, phase).
241+
const code = historyRow?.[ph - 1];
242+
const hist = code ? HIST_CODE[code] : undefined;
243+
if (hist) {
244+
return { phase: ph, phase_label, verdict: hist, pending: false, not_applicable: false };
245+
}
246+
if (!historyRow && ph === planPhase) {
215247
return { phase: ph, phase_label, verdict: ownVerdict, pending: false, not_applicable: false };
216248
}
217-
// Re-run cumulatively in this phase, but the per-plan record
218-
// wasn't retained — an explicit gap, never a synthesized copy.
249+
// No per-plan record for this phase (plan reworded mid-event,
250+
// or record not retained) — an explicit gap, never a copy.
219251
return { phase: ph, phase_label, verdict: undefined as Verdict, pending: false, not_applicable: false, not_retained: true };
220252
}
221253
return { phase: ph, phase_label, verdict: undefined as Verdict, pending: true, not_applicable: false };
@@ -487,10 +519,10 @@ export function TestDetailClient({
487519
</div>
488520
<p className="history-note">
489521
Suites re-run cumulatively — every scored phase re-fires all
490-
earlier plans against that phase&apos;s deploy. A plan&apos;s verdict is
491-
recorded from its latest TestSprite run at its authoring phase;
492-
per-phase re-run records aren&apos;t retained (–). Aggregate per-phase
493-
results drive each agent&apos;s trajectory chart.
522+
earlier plans against that phase&apos;s deploy, and each cell shows
523+
the as-graded result from that phase&apos;s score snapshot. A dash
524+
(–) means no per-plan record exists for that phase (e.g. the
525+
plan was reworded mid-event).
494526
</p>
495527
</div>
496528
</div>

app/tests/data.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ export const TESTS: TestEntry[] = [
4747

4848
// a11y (2) — keyboard + alt text
4949
{ test_id: 'a3b4c5d6', phase: 1, category: 'a11y', path: 'a11y/01-keyboard-tab-focus.json', name: 'Keyboard Tab traversal surfaces visible focus on interactive elements', priority: 'p1' },
50-
{ test_id: 'b4c5d6e7', phase: 1, category: 'a11y', path: 'a11y/02-flag-images-alt.json', name: 'Team flag images in the bracket and group standings carry non-empty alt text', priority: 'p1' },
50+
{ test_id: 'b4c5d6e7', phase: 1, category: 'a11y', path: 'a11y/02-flag-images-alt.json', name: 'Screen-reader users can identify each team flag in the bracket and group standings', priority: 'p1' },
5151

5252
// visual (1) — hero + bracket layout
5353
{ test_id: 'c5d6e7f8', phase: 1, category: 'visual', path: 'visual/01-hero-and-bracket-present.json', name: 'Homepage shows a hero heading at the top and the bracket section below', priority: 'p1' },
@@ -66,7 +66,7 @@ export const TESTS: TestEntry[] = [
6666

6767
// details-data (4) — content rendered in the initial HTML
6868
{ test_id: 'bfab9939', phase: 2, category: 'details-data', path: 'details-data/01-team-name-rendered.json', name: 'Both team names render as readable text on match page', priority: 'p0' },
69-
{ test_id: '98805948', phase: 2, category: 'details-data', path: 'details-data/02-team-flags-present.json', name: 'Both team flag images render with non-empty alt text', priority: 'p0' },
69+
{ test_id: '98805948', phase: 2, category: 'details-data', path: 'details-data/02-team-flags-present.json', name: 'Both team flag images render the correct flags on the match detail page', priority: 'p0' },
7070
{ test_id: 'c4363884', phase: 2, category: 'details-data', path: 'details-data/03-kickoff-iso-or-local.json', name: 'Kickoff timestamp is rendered in human-readable form', priority: 'p1' },
7171
{ test_id: 'dc9c837d', phase: 2, category: 'details-data', path: 'details-data/04-stage-badge.json', name: 'Match page renders a stage badge for the fixture', priority: 'p1' },
7272

public/fixtures/phase-history.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)