diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx
index ee4f9f485..4b3414751 100644
--- a/apps/studio/src/components/EvalDetail.tsx
+++ b/apps/studio/src/components/EvalDetail.tsx
@@ -1,9 +1,10 @@
/**
- * Three-tab eval detail view: Checks (assertions), Files (artifact browser),
+ * Three-tab eval detail view: Checks (assertions + scores), Files (artifact browser),
* and Feedback (review comments).
*
- * Shows the full evaluation result with score breakdown, assertions list,
- * and a file tree browser for artifact files (input, output, grading, timing).
+ * Layout: compact header → tabs → full-height content area.
+ * Scores and assertions are only visible in the Checks tab.
+ * Each assertion card shows a grader-name pill identifying its evaluator.
*/
import { useState } from 'react';
@@ -11,7 +12,7 @@ import { useState } from 'react';
import { useQuery } from '@tanstack/react-query';
import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api';
import { projectEvalFileContentOptions, projectEvalFilesOptions } from '~/lib/api';
-import type { EvalResult } from '~/lib/types';
+import type { AssertionEntry, EvalResult, ScoreEntry } from '~/lib/types';
import { FeedbackPanel } from './FeedbackPanel';
import type { FileNode } from './FileTree';
@@ -51,48 +52,24 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
];
return (
-
- {/* Score summary */}
-
-
-
-
{result.testId}
-
- {result.target && Target: {result.target}}
- {result.durationMs != null && (
- {(result.durationMs / 1000).toFixed(1)}s
- )}
- {result.costUsd != null && ${result.costUsd.toFixed(4)}}
-
-
-
-
-
+
+ {/* Compact header: test ID + metadata (no scores — scores live in Checks tab) */}
+
+
+
{result.testId}
+
+ {result.target && Target: {result.target}}
+ {result.durationMs != null && (
+ {(result.durationMs / 1000).toFixed(1)}s
+ )}
+ {result.costUsd != null && ${result.costUsd.toFixed(4)}}
+
- {/* Per-evaluator scores */}
- {result.scores && result.scores.length > 0 && (
-
-
Evaluator Scores
-
- {result.scores.map((s, i) => (
-
-
- {s.name ?? s.type ?? `Score ${i + 1}`}
-
-
-
-
-
- ))}
-
-
- )}
-
- {/* Tab navigation */}
+ {/* Tab navigation — at the top so Files tab editor fills maximum height */}
-
+
{tabs.map((tab) => (
+
+ );
+}
+
+/**
+ * Pill showing the grader/evaluator name on an assertion card.
+ *
+ * @param label - Text displayed inside the pill.
+ */
+function GraderPill({ label }: { label: string }) {
+ return (
+
+ {label}
+
+ );
+}
+
+/**
+ * A single assertion row, optionally annotated with its grader name.
+ *
+ * Shows a pass/fail mark (U+2713 when `assertion.passed` is truthy, U+2717
+ * otherwise) and the assertion text. When `assertion.durationMs` is neither
+ * null nor undefined, the duration is appended in seconds with one decimal
+ * place. `assertion.evidence` is rendered only when it is truthy.
+ *
+ * @param assertion - The assertion entry to display.
+ * @param graderLabel - Optional grader/evaluator name. The content gated on it
+ *   is rendered only when the value is truthy, so an empty string hides it.
+ *   NOTE(review): the gated markup is not visible in this view; presumably it
+ *   is the grader pill. Confirm against the full file.
+ */
+function AssertionCard({
+ assertion,
+ graderLabel,
+}: { assertion: AssertionEntry; graderLabel?: string }) {
+ return (
+
+
+ {assertion.passed ? '\u2713' : '\u2717'}
+
+
+
+
+ {assertion.text}
+ {assertion.durationMs != null && (
+
+ ({(assertion.durationMs / 1000).toFixed(1)}s)
+
+ )}
+
+ {graderLabel &&
}
+
+ {assertion.evidence &&
{assertion.evidence}
}
);
}
-function StepsTab({ result }: { result: EvalResult }) {
+/**
+ * Checks tab: overall score → per-evaluator scores → assertions (with grader pills) → failure reasons.
+ * Assertions are grouped by evaluator when per-score assertion data is available.
+ */
+function ChecksTab({ result }: { result: EvalResult }) {
const { data: config } = useStudioConfig();
const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8;
- const assertions = result.assertions ?? [];
+
const hasFailed =
!isPassing(result.score, passThreshold) ||
result.executionStatus === 'error' ||
result.executionStatus === 'failed';
- // Collect failure reasons from multiple sources
+ // Determine how to render assertions:
+ // If any score entry has nested assertions, use per-score grouping (enables grader pills).
+ // Otherwise fall back to the top-level assertions array.
+ const scoresWithAssertions = (result.scores ?? []).filter(
+ (s): s is ScoreEntry & { assertions: AssertionEntry[] } =>
+ Array.isArray(s.assertions) && s.assertions.length > 0,
+ );
+ const useGrouped = scoresWithAssertions.length > 0;
+ const topLevelAssertions = result.assertions ?? [];
+
+ // Collect failure reasons
const failureReasons: string[] = [];
if (result.error) failureReasons.push(result.error);
if (result.executionStatus === 'error' || result.executionStatus === 'failed') {
failureReasons.push(`Execution status: ${result.executionStatus}`);
}
- // Add failed assertion details
- const failedAssertions = assertions.filter((a) => !a.passed);
- for (const a of failedAssertions) {
+ const assertionsForFailures = useGrouped
+ ? scoresWithAssertions.flatMap((s) => s.assertions)
+ : topLevelAssertions;
+ for (const a of assertionsForFailures.filter((a) => !a.passed)) {
const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
failureReasons.push(msg);
}
- // Also check per-evaluator scores for failure details
if (result.scores) {
for (const s of result.scores) {
if (!isPassing(s.score, passThreshold) && s.details) {
@@ -149,54 +195,72 @@ function StepsTab({ result }: { result: EvalResult }) {
typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2);
failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`);
}
- if (s.assertions) {
- for (const a of s.assertions) {
- if (!a.passed) {
- const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
- if (!failureReasons.includes(msg)) failureReasons.push(msg);
- }
- }
- }
}
}
return (
-
- {assertions.length === 0 && (
-
No assertion steps recorded.
+
+ {/* Overall score */}
+
+
+ {/* Per-evaluator scores */}
+ {result.scores && result.scores.length > 0 && (
+
+
Evaluator Scores
+
+ {result.scores.map((s, i) => (
+
+
+ {s.name ?? s.type ?? `Score ${i + 1}`}
+
+
+
+
+
+ ))}
+
+
)}
- {assertions.length > 0 && (
-
- {assertions.map((a) => (
-
-
- {a.passed ? '\u2713' : '\u2717'}
-
-
-
- {a.text}
- {a.durationMs != null && (
-
- ({(a.durationMs / 1000).toFixed(1)}s)
-
- )}
-
- {a.evidence &&
{a.evidence}
}
+ {/* Assertions */}
+ {useGrouped ? (
+
+ {scoresWithAssertions.map((s, si) => {
+ const graderLabel = s.name ?? s.type ?? `Evaluator ${si + 1}`;
+ return (
+
+
+ {graderLabel}
+
+ {s.assertions.map((a, ai) => (
+
+ ))}
-
+ );
+ })}
+
+ ) : topLevelAssertions.length > 0 ? (
+
+ {topLevelAssertions.map((a, i) => (
+
))}
+ ) : (
+
No assertion steps recorded.
)}
- {/* Failure reason section */}
+ {/* Failure reasons */}
{hasFailed && failureReasons.length > 0 && (
Failure Reason