From d053ae5e6a5d62488d63bfe527ca8a443ce88124 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 08:21:45 +0000 Subject: [PATCH 1/2] feat(studio): improve test details view layout (#1014) - Move scores/evaluator score cards inside Checks tab (hidden from other tabs) - Put tab navigation at the top so Files tab editor fills maximum height - Add grader-name pill to each assertion card (shows evaluator type) - Group assertions by evaluator when per-score assertion data is available - Compact header shows testId + metadata without score bars Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/studio/src/components/EvalDetail.tsx | 246 ++++++++++++++-------- 1 file changed, 157 insertions(+), 89 deletions(-) diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index ee4f9f485..380ca4ecd 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -1,9 +1,10 @@ /** - * Three-tab eval detail view: Checks (assertions), Files (artifact browser), + * Three-tab eval detail view: Checks (assertions + scores), Files (artifact browser), * and Feedback (review comments). * - * Shows the full evaluation result with score breakdown, assertions list, - * and a file tree browser for artifact files (input, output, grading, timing). + * Layout: compact header → tabs → full-height content area. + * Scores and assertions are only visible in the Checks tab. + * Each assertion card shows a grader-name pill identifying its evaluator. */ import { useState } from 'react'; @@ -11,7 +12,7 @@ import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api'; import { projectEvalFileContentOptions, projectEvalFilesOptions } from '~/lib/api'; -import type { EvalResult } from '~/lib/types'; +import type { AssertionEntry, EvalResult, ScoreEntry } from '~/lib/types'; import { FeedbackPanel } from './FeedbackPanel'; import type { FileNode } from './FileTree'; @@ -51,48 +52,24 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps) ]; return ( -
- {/* Score summary */} -
-
-
-

{result.testId}

-

- {result.target && Target: {result.target}} - {result.durationMs != null && ( - {(result.durationMs / 1000).toFixed(1)}s - )} - {result.costUsd != null && ${result.costUsd.toFixed(4)}} -

-
-
- -
+
+ {/* Compact header: test ID + metadata (no scores — scores live in Checks tab) */} +
+
+

{result.testId}

+

+ {result.target && Target: {result.target}} + {result.durationMs != null && ( + {(result.durationMs / 1000).toFixed(1)}s + )} + {result.costUsd != null && ${result.costUsd.toFixed(4)}} +

- {/* Per-evaluator scores */} - {result.scores && result.scores.length > 0 && ( -
-

Evaluator Scores

-
- {result.scores.map((s, i) => ( -
- - {s.name ?? s.type ?? `Score ${i + 1}`} - -
- -
-
- ))} -
-
- )} - - {/* Tab navigation */} + {/* Tab navigation — at the top so Files tab editor fills maximum height */}
-
+
{tabs.map((tab) => (
+
+ ); +} + +/** Pill showing the grader/evaluator name on an assertion card. */ +function GraderPill({ label }: { label: string }) { + return ( + + {label} + + ); +} + +/** A single assertion row, optionally annotated with its grader name. */ +function AssertionCard({ + assertion, + graderLabel, +}: { assertion: AssertionEntry; graderLabel?: string }) { + return ( +
+ + {assertion.passed ? '\u2713' : '\u2717'} + +
+
+

+ {assertion.text} + {assertion.durationMs != null && ( + + ({(assertion.durationMs / 1000).toFixed(1)}s) + + )} +

+ {graderLabel && } +
+ {assertion.evidence && ( +

{assertion.evidence}

+ )}
); } -function StepsTab({ result }: { result: EvalResult }) { +/** + * Checks tab: overall score → per-evaluator scores → assertions (with grader pills) → failure reasons. + * Assertions are grouped by evaluator when per-score assertion data is available. + */ +function ChecksTab({ result }: { result: EvalResult }) { const { data: config } = useStudioConfig(); const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8; - const assertions = result.assertions ?? []; + const hasFailed = !isPassing(result.score, passThreshold) || result.executionStatus === 'error' || result.executionStatus === 'failed'; - // Collect failure reasons from multiple sources + // Determine how to render assertions: + // If any score entry has nested assertions, use per-score grouping (enables grader pills). + // Otherwise fall back to the top-level assertions array. + const scoresWithAssertions = (result.scores ?? []).filter( + (s): s is ScoreEntry & { assertions: AssertionEntry[] } => + Array.isArray(s.assertions) && s.assertions.length > 0, + ); + const useGrouped = scoresWithAssertions.length > 0; + const topLevelAssertions = result.assertions ?? []; + + // Collect failure reasons const failureReasons: string[] = []; if (result.error) failureReasons.push(result.error); if (result.executionStatus === 'error' || result.executionStatus === 'failed') { failureReasons.push(`Execution status: ${result.executionStatus}`); } - // Add failed assertion details - const failedAssertions = assertions.filter((a) => !a.passed); - for (const a of failedAssertions) { + const assertionsForFailures = useGrouped + ? scoresWithAssertions.flatMap((s) => s.assertions) + : topLevelAssertions; + for (const a of assertionsForFailures.filter((a) => !a.passed)) { const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text; failureReasons.push(msg); } - // Also check per-evaluator scores for failure details if (result.scores) { for (const s of result.scores) { if (!isPassing(s.score, passThreshold) && s.details) { @@ -149,54 +199,72 @@ function StepsTab({ result }: { result: EvalResult }) { typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2); failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`); } - if (s.assertions) { - for (const a of s.assertions) { - if (!a.passed) { - const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text; - if (!failureReasons.includes(msg)) failureReasons.push(msg); - } - } - } } } return ( -
- {assertions.length === 0 && ( -

No assertion steps recorded.

+
+ {/* Overall score */} +
+
+ Overall score +
+ +
+
+
+ + {/* Per-evaluator scores */} + {result.scores && result.scores.length > 0 && ( +
+

Evaluator Scores

+
+ {result.scores.map((s, i) => ( +
+ + {s.name ?? s.type ?? `Score ${i + 1}`} + +
+ +
+
+ ))} +
+
)} - {assertions.length > 0 && ( -
- {assertions.map((a) => ( -
- - {a.passed ? '\u2713' : '\u2717'} - -
-

- {a.text} - {a.durationMs != null && ( - - ({(a.durationMs / 1000).toFixed(1)}s) - - )} -

- {a.evidence &&

{a.evidence}

} + {/* Assertions */} + {useGrouped ? ( +
+ {scoresWithAssertions.map((s, si) => { + const graderLabel = s.name ?? s.type ?? `Evaluator ${si + 1}`; + return ( +
+

+ {graderLabel} +

+ {s.assertions.map((a, ai) => ( + + ))}
-
+ ); + })} +
+ ) : topLevelAssertions.length > 0 ? ( +
+ {topLevelAssertions.map((a, i) => ( + ))}
+ ) : ( +

No assertion steps recorded.

)} - {/* Failure reason section */} + {/* Failure reasons */} {hasFailed && failureReasons.length > 0 && (

Failure Reason

From 2a8548e713e49f8f9686308ecfc60397b4dad2c6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 9 Apr 2026 08:23:06 +0000 Subject: [PATCH 2/2] style(studio): fix biome formatting in EvalDetail Co-Authored-By: Claude Sonnet 4.6 (1M context) --- apps/studio/src/components/EvalDetail.tsx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx index 380ca4ecd..4b3414751 100644 --- a/apps/studio/src/components/EvalDetail.tsx +++ b/apps/studio/src/components/EvalDetail.tsx @@ -131,9 +131,7 @@ function AssertionCard({ : 'border-red-900/50 bg-red-950/20' }`} > - + {assertion.passed ? '\u2713' : '\u2717'}
@@ -148,9 +146,7 @@ function AssertionCard({

{graderLabel && }
- {assertion.evidence && ( -

{assertion.evidence}

- )} + {assertion.evidence &&

{assertion.evidence}

}
);