Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 153 additions & 89 deletions apps/studio/src/components/EvalDetail.tsx
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
/**
* Three-tab eval detail view: Checks (assertions), Files (artifact browser),
* Three-tab eval detail view: Checks (assertions + scores), Files (artifact browser),
* and Feedback (review comments).
*
* Shows the full evaluation result with score breakdown, assertions list,
* and a file tree browser for artifact files (input, output, grading, timing).
* Layout: compact header → tabs → full-height content area.
* Scores and assertions are only visible in the Checks tab.
* Each assertion card shows a grader-name pill identifying its evaluator.
*/

import { useState } from 'react';

import { useQuery } from '@tanstack/react-query';
import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api';
import { projectEvalFileContentOptions, projectEvalFilesOptions } from '~/lib/api';
import type { EvalResult } from '~/lib/types';
import type { AssertionEntry, EvalResult, ScoreEntry } from '~/lib/types';

import { FeedbackPanel } from './FeedbackPanel';
import type { FileNode } from './FileTree';
Expand Down Expand Up @@ -51,48 +52,24 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
];

return (
<div className="flex min-h-full flex-col gap-6">
{/* Score summary */}
<div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
<div className="flex items-center justify-between">
<div>
<h3 className="text-lg font-medium">{result.testId}</h3>
<p className="mt-1 text-sm text-gray-400">
{result.target && <span>Target: {result.target}</span>}
{result.durationMs != null && (
<span className="ml-4">{(result.durationMs / 1000).toFixed(1)}s</span>
)}
{result.costUsd != null && <span className="ml-4">${result.costUsd.toFixed(4)}</span>}
</p>
</div>
<div className="w-48">
<ScoreBar score={result.score} />
</div>
<div className="flex min-h-full flex-col">
{/* Compact header: test ID + metadata (no scores — scores live in Checks tab) */}
<div className="flex items-start justify-between border-b border-gray-800 px-4 py-3">
<div>
<h3 className="text-lg font-medium">{result.testId}</h3>
<p className="mt-0.5 text-sm text-gray-400">
{result.target && <span>Target: {result.target}</span>}
{result.durationMs != null && (
<span className="ml-4">{(result.durationMs / 1000).toFixed(1)}s</span>
)}
{result.costUsd != null && <span className="ml-4">${result.costUsd.toFixed(4)}</span>}
</p>
</div>
</div>

{/* Per-evaluator scores */}
{result.scores && result.scores.length > 0 && (
<div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
<h4 className="mb-3 text-sm font-medium text-gray-400">Evaluator Scores</h4>
<div className="space-y-3">
{result.scores.map((s, i) => (
<div key={`${s.name ?? s.type ?? i}`} className="flex items-center gap-4">
<span className="w-40 truncate text-sm text-gray-300">
{s.name ?? s.type ?? `Score ${i + 1}`}
</span>
<div className="flex-1">
<ScoreBar score={s.score} />
</div>
</div>
))}
</div>
</div>
)}

{/* Tab navigation */}
{/* Tab navigation — at the top so Files tab editor fills maximum height */}
<div className="border-b border-gray-800">
<div className="flex gap-1">
<div className="flex gap-1 px-4">
{tabs.map((tab) => (
<button
type="button"
Expand All @@ -112,91 +89,178 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)

{/* Tab content */}
<div className="min-h-0 flex-1">
{activeTab === 'checks' && <StepsTab result={result} />}
{activeTab === 'files' && <FilesTab result={result} runId={runId} projectId={projectId} />}
{!isReadOnly && activeTab === 'feedback' && <FeedbackPanel testId={result.testId} />}
{activeTab === 'checks' && (
<div className="overflow-auto p-4">
<ChecksTab result={result} />
</div>
)}
{activeTab === 'files' && (
<div className="h-full p-4">
<FilesTab result={result} runId={runId} projectId={projectId} />
</div>
)}
{!isReadOnly && activeTab === 'feedback' && (
<div className="p-4">
<FeedbackPanel testId={result.testId} />
</div>
)}
</div>
</div>
);
}

/**
 * Small rounded badge that renders the supplied label text.
 * Rendered by assertion cards to show the grader/evaluator label passed to them.
 *
 * @param label - Text displayed inside the pill.
 */
function GraderPill({ label }: { label: string }) {
  const pillClasses =
    'inline-flex items-center rounded-full border border-gray-700 bg-gray-800 px-2 py-0.5 text-xs font-medium text-gray-400';

  return <span className={pillClasses}>{label}</span>;
}

/**
 * Renders one assertion as a bordered card.
 *
 * The card shows a pass/fail glyph, the assertion text (followed by its duration
 * in seconds when `durationMs` is present), an optional grader pill, and the
 * evidence text when the assertion carries any.
 *
 * @param assertion - The assertion entry to display.
 * @param graderLabel - Optional label; when truthy it is shown in a pill next to the text.
 */
function AssertionCard({
  assertion,
  graderLabel,
}: { assertion: AssertionEntry; graderLabel?: string }) {
  const { passed, text, durationMs, evidence } = assertion;

  // Colour scheme and glyph are driven solely by the pass/fail flag.
  const cardTone = passed
    ? 'border-emerald-900/50 bg-emerald-950/20'
    : 'border-red-900/50 bg-red-950/20';
  const glyphTone = passed ? 'text-emerald-400' : 'text-red-400';
  const glyph = passed ? '\u2713' : '\u2717';

  // Duration is stored in milliseconds; display it as seconds with one decimal.
  const seconds = durationMs != null ? (durationMs / 1000).toFixed(1) : null;

  return (
    <div className={`flex items-start gap-3 rounded-lg border p-3 ${cardTone}`}>
      <span className={`mt-0.5 text-lg ${glyphTone}`}>{glyph}</span>
      <div className="min-w-0 flex-1">
        <div className="flex flex-wrap items-center gap-2">
          <p className="text-sm text-gray-200">
            {text}
            {seconds != null && (
              <span className="ml-2 text-xs text-gray-500">
                ({seconds}s)
              </span>
            )}
          </p>
          {graderLabel && <GraderPill label={graderLabel} />}
        </div>
        {evidence && <p className="mt-1 text-xs text-gray-400">{evidence}</p>}
      </div>
    </div>
  );
}

function StepsTab({ result }: { result: EvalResult }) {
/**
* Checks tab: overall score → per-evaluator scores → assertions (with grader pills) → failure reasons.
* Assertions are grouped by evaluator when per-score assertion data is available.
*/
function ChecksTab({ result }: { result: EvalResult }) {
const { data: config } = useStudioConfig();
const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8;
const assertions = result.assertions ?? [];

const hasFailed =
!isPassing(result.score, passThreshold) ||
result.executionStatus === 'error' ||
result.executionStatus === 'failed';

// Collect failure reasons from multiple sources
// Determine how to render assertions:
// If any score entry has nested assertions, use per-score grouping (enables grader pills).
// Otherwise fall back to the top-level assertions array.
const scoresWithAssertions = (result.scores ?? []).filter(
(s): s is ScoreEntry & { assertions: AssertionEntry[] } =>
Array.isArray(s.assertions) && s.assertions.length > 0,
);
const useGrouped = scoresWithAssertions.length > 0;
const topLevelAssertions = result.assertions ?? [];

// Collect failure reasons
const failureReasons: string[] = [];
if (result.error) failureReasons.push(result.error);
if (result.executionStatus === 'error' || result.executionStatus === 'failed') {
failureReasons.push(`Execution status: ${result.executionStatus}`);
}
// Add failed assertion details
const failedAssertions = assertions.filter((a) => !a.passed);
for (const a of failedAssertions) {
const assertionsForFailures = useGrouped
? scoresWithAssertions.flatMap((s) => s.assertions)
: topLevelAssertions;
for (const a of assertionsForFailures.filter((a) => !a.passed)) {
const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
failureReasons.push(msg);
}
// Also check per-evaluator scores for failure details
if (result.scores) {
for (const s of result.scores) {
if (!isPassing(s.score, passThreshold) && s.details) {
const detailStr =
typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2);
failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`);
}
if (s.assertions) {
for (const a of s.assertions) {
if (!a.passed) {
const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
if (!failureReasons.includes(msg)) failureReasons.push(msg);
}
}
}
}
}

return (
<div className="space-y-4">
{assertions.length === 0 && (
<p className="text-sm text-gray-500">No assertion steps recorded.</p>
<div className="space-y-6">
{/* Overall score */}
<div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
<div className="flex items-center gap-4">
<span className="text-sm font-medium text-gray-400">Overall score</span>
<div className="flex-1">
<ScoreBar score={result.score} />
</div>
</div>
</div>

{/* Per-evaluator scores */}
{result.scores && result.scores.length > 0 && (
<div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
<h4 className="mb-3 text-sm font-medium text-gray-400">Evaluator Scores</h4>
<div className="space-y-3">
{result.scores.map((s, i) => (
<div key={`${s.name ?? s.type ?? i}`} className="flex items-center gap-4">
<span className="w-40 truncate text-sm text-gray-300">
{s.name ?? s.type ?? `Score ${i + 1}`}
</span>
<div className="flex-1">
<ScoreBar score={s.score} />
</div>
</div>
))}
</div>
</div>
)}

{assertions.length > 0 && (
<div className="space-y-2">
{assertions.map((a) => (
<div
key={`${a.text}-${a.passed}`}
className={`flex items-start gap-3 rounded-lg border p-3 ${
a.passed
? 'border-emerald-900/50 bg-emerald-950/20'
: 'border-red-900/50 bg-red-950/20'
}`}
>
<span className={`mt-0.5 text-lg ${a.passed ? 'text-emerald-400' : 'text-red-400'}`}>
{a.passed ? '\u2713' : '\u2717'}
</span>
<div className="min-w-0 flex-1">
<p className="text-sm text-gray-200">
{a.text}
{a.durationMs != null && (
<span className="ml-2 text-xs text-gray-500">
({(a.durationMs / 1000).toFixed(1)}s)
</span>
)}
</p>
{a.evidence && <p className="mt-1 text-xs text-gray-400">{a.evidence}</p>}
{/* Assertions */}
{useGrouped ? (
<div className="space-y-6">
{scoresWithAssertions.map((s, si) => {
const graderLabel = s.name ?? s.type ?? `Evaluator ${si + 1}`;
return (
<div key={`${s.name ?? s.type ?? si}`} className="space-y-2">
<h4 className="text-xs font-semibold uppercase tracking-wider text-gray-500">
{graderLabel}
</h4>
{s.assertions.map((a, ai) => (
<AssertionCard
key={`${a.text}-${ai}`}
assertion={a}
graderLabel={s.type ?? undefined}
/>
))}
</div>
</div>
);
})}
</div>
) : topLevelAssertions.length > 0 ? (
<div className="space-y-2">
{topLevelAssertions.map((a, i) => (
<AssertionCard key={`${a.text}-${i}`} assertion={a} />
))}
</div>
) : (
<p className="text-sm text-gray-500">No assertion steps recorded.</p>
)}

{/* Failure reason section */}
{/* Failure reasons */}
{hasFailed && failureReasons.length > 0 && (
<div className="rounded-lg border border-red-500/30 bg-red-500/10 p-4">
<h4 className="mb-2 text-sm font-medium text-red-400">Failure Reason</h4>
Expand Down
Loading