From d053ae5e6a5d62488d63bfe527ca8a443ce88124 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 9 Apr 2026 08:21:45 +0000
Subject: [PATCH 1/2] feat(studio): improve test details view layout (#1014)

- Move scores/evaluator score cards inside Checks tab (hidden from other tabs)
- Put tab navigation at the top so the Files tab editor fills the maximum height
- Add grader-name pill to assertion cards in per-evaluator groups (shows evaluator type)
- Group assertions by evaluator when per-score assertion data is available
- Compact header shows testId + metadata without score bars

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 apps/studio/src/components/EvalDetail.tsx | 246 ++++++++++++++--------
 1 file changed, 157 insertions(+), 89 deletions(-)
diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx
index ee4f9f485..380ca4ecd 100644
--- a/apps/studio/src/components/EvalDetail.tsx
+++ b/apps/studio/src/components/EvalDetail.tsx
@@ -1,9 +1,10 @@
 /**
- * Three-tab eval detail view: Checks (assertions), Files (artifact browser),
+ * Three-tab eval detail view: Checks (assertions + scores), Files (artifact browser),
  * and Feedback (review comments).
  *
- * Shows the full evaluation result with score breakdown, assertions list,
- * and a file tree browser for artifact files (input, output, grading, timing).
+ * Layout: compact header → tabs → full-height content area.
+ * Scores and assertions are only visible in the Checks tab.
+ * Assertion cards grouped by evaluator show a grader pill with the evaluator type, if set.
  */
 
 import { useState } from 'react';
@@ -11,7 +12,7 @@ import { useState } from 'react';
 import { useQuery } from '@tanstack/react-query';
 import { isPassing, useEvalFileContent, useEvalFiles, useStudioConfig } from '~/lib/api';
 import { projectEvalFileContentOptions, projectEvalFilesOptions } from '~/lib/api';
-import type { EvalResult } from '~/lib/types';
+import type { AssertionEntry, EvalResult, ScoreEntry } from '~/lib/types';
 
 import { FeedbackPanel } from './FeedbackPanel';
 import type { FileNode } from './FileTree';
@@ -51,48 +52,24 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
   ];
 
   return (
-    <div className="flex min-h-full flex-col gap-6">
-      {/* Score summary */}
-      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
-        <div className="flex items-center justify-between">
-          <div>
-            <h3 className="text-lg font-medium">{result.testId}</h3>
-            <p className="mt-1 text-sm text-gray-400">
-              {result.target && <span>Target: {result.target}</span>}
-              {result.durationMs != null && (
-                <span className="ml-4">{(result.durationMs / 1000).toFixed(1)}s</span>
-              )}
-              {result.costUsd != null && <span className="ml-4">${result.costUsd.toFixed(4)}</span>}
-            </p>
-          </div>
-          <div className="w-48">
-            <ScoreBar score={result.score} />
-          </div>
+    <div className="flex min-h-full flex-col">
+      {/* Compact header: test ID + metadata (no scores — scores live in Checks tab) */}
+      <div className="flex items-start justify-between border-b border-gray-800 px-4 py-3">
+        <div>
+          <h3 className="text-lg font-medium">{result.testId}</h3>
+          <p className="mt-0.5 text-sm text-gray-400">
+            {result.target && <span>Target: {result.target}</span>}
+            {result.durationMs != null && (
+              <span className="ml-4">{(result.durationMs / 1000).toFixed(1)}s</span>
+            )}
+            {result.costUsd != null && <span className="ml-4">${result.costUsd.toFixed(4)}</span>}
+          </p>
         </div>
       </div>
 
-      {/* Per-evaluator scores */}
-      {result.scores && result.scores.length > 0 && (
-        <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
-          <h4 className="mb-3 text-sm font-medium text-gray-400">Evaluator Scores</h4>
-          <div className="space-y-3">
-            {result.scores.map((s, i) => (
-              <div key={`${s.name ?? s.type ?? i}`} className="flex items-center gap-4">
-                <span className="w-40 truncate text-sm text-gray-300">
-                  {s.name ?? s.type ?? `Score ${i + 1}`}
-                </span>
-                <div className="flex-1">
-                  <ScoreBar score={s.score} />
-                </div>
-              </div>
-            ))}
-          </div>
-        </div>
-      )}
-
-      {/* Tab navigation */}
+      {/* Tab navigation — at the top so Files tab editor fills maximum height */}
       <div className="border-b border-gray-800">
-        <div className="flex gap-1">
+        <div className="flex gap-1 px-4">
           {tabs.map((tab) => (
             <button
               type="button"
@@ -112,36 +89,109 @@ export function EvalDetail({ eval: result, runId, projectId }: EvalDetailProps)
 
       {/* Tab content */}
       <div className="min-h-0 flex-1">
-        {activeTab === 'checks' && <StepsTab result={result} />}
-        {activeTab === 'files' && <FilesTab result={result} runId={runId} projectId={projectId} />}
-        {!isReadOnly && activeTab === 'feedback' && <FeedbackPanel testId={result.testId} />}
+        {activeTab === 'checks' && (
+          <div className="overflow-auto p-4">
+            <ChecksTab result={result} />
+          </div>
+        )}
+        {activeTab === 'files' && (
+          <div className="h-full p-4">
+            <FilesTab result={result} runId={runId} projectId={projectId} />
+          </div>
+        )}
+        {!isReadOnly && activeTab === 'feedback' && (
+          <div className="p-4">
+            <FeedbackPanel testId={result.testId} />
+          </div>
+        )}
+      </div>
+    </div>
+  );
+}
+
+/** Pill showing the grader/evaluator name on an assertion card. */
+function GraderPill({ label }: { label: string }) {
+  return (
+    <span className="inline-flex items-center rounded-full border border-gray-700 bg-gray-800 px-2 py-0.5 text-xs font-medium text-gray-400">
+      {label}
+    </span>
+  );
+}
+
+/** A single assertion row, optionally annotated with its grader name. */
+function AssertionCard({
+  assertion,
+  graderLabel,
+}: { assertion: AssertionEntry; graderLabel?: string }) {
+  return (
+    <div
+      className={`flex items-start gap-3 rounded-lg border p-3 ${
+        assertion.passed
+          ? 'border-emerald-900/50 bg-emerald-950/20'
+          : 'border-red-900/50 bg-red-950/20'
+      }`}
+    >
+      <span
+        className={`mt-0.5 text-lg ${assertion.passed ? 'text-emerald-400' : 'text-red-400'}`}
+      >
+        {assertion.passed ? '\u2713' : '\u2717'}
+      </span>
+      <div className="min-w-0 flex-1">
+        <div className="flex flex-wrap items-center gap-2">
+          <p className="text-sm text-gray-200">
+            {assertion.text}
+            {assertion.durationMs != null && (
+              <span className="ml-2 text-xs text-gray-500">
+                ({(assertion.durationMs / 1000).toFixed(1)}s)
+              </span>
+            )}
+          </p>
+          {graderLabel && <GraderPill label={graderLabel} />}
+        </div>
+        {assertion.evidence && (
+          <p className="mt-1 text-xs text-gray-400">{assertion.evidence}</p>
+        )}
       </div>
     </div>
   );
 }
 
-function StepsTab({ result }: { result: EvalResult }) {
+/**
+ * Checks tab: overall score → per-evaluator scores → assertions (with grader pills) → failure reasons.
+ * Assertions are grouped by evaluator when per-score assertion data is available.
+ */
+function ChecksTab({ result }: { result: EvalResult }) {
   const { data: config } = useStudioConfig();
   const passThreshold = config?.threshold ?? config?.pass_threshold ?? 0.8;
-  const assertions = result.assertions ?? [];
+
   const hasFailed =
     !isPassing(result.score, passThreshold) ||
     result.executionStatus === 'error' ||
     result.executionStatus === 'failed';
 
-  // Collect failure reasons from multiple sources
+  // Determine how to render assertions:
+  // If any score entry has nested assertions, use per-score grouping (enables grader pills).
+  // Otherwise fall back to the top-level assertions array.
+  const scoresWithAssertions = (result.scores ?? []).filter(
+    (s): s is ScoreEntry & { assertions: AssertionEntry[] } =>
+      Array.isArray(s.assertions) && s.assertions.length > 0,
+  );
+  const useGrouped = scoresWithAssertions.length > 0;
+  const topLevelAssertions = result.assertions ?? [];
+
+  // Collect failure reasons
   const failureReasons: string[] = [];
   if (result.error) failureReasons.push(result.error);
   if (result.executionStatus === 'error' || result.executionStatus === 'failed') {
     failureReasons.push(`Execution status: ${result.executionStatus}`);
   }
-  // Add failed assertion details
-  const failedAssertions = assertions.filter((a) => !a.passed);
-  for (const a of failedAssertions) {
+  const assertionsForFailures = useGrouped
+    ? scoresWithAssertions.flatMap((s) => s.assertions)
+    : topLevelAssertions;
+  for (const a of assertionsForFailures.filter((a) => !a.passed)) {
     const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
     failureReasons.push(msg);
   }
-  // Also check per-evaluator scores for failure details
   if (result.scores) {
     for (const s of result.scores) {
       if (!isPassing(s.score, passThreshold) && s.details) {
@@ -149,54 +199,72 @@ function StepsTab({ result }: { result: EvalResult }) {
           typeof s.details === 'string' ? s.details : JSON.stringify(s.details, null, 2);
         failureReasons.push(`[${s.name ?? s.type ?? 'evaluator'}] ${detailStr}`);
       }
-      if (s.assertions) {
-        for (const a of s.assertions) {
-          if (!a.passed) {
-            const msg = a.evidence ? `${a.text}: ${a.evidence}` : a.text;
-            if (!failureReasons.includes(msg)) failureReasons.push(msg);
-          }
-        }
-      }
     }
   }
 
   return (
-    <div className="space-y-4">
-      {assertions.length === 0 && (
-        <p className="text-sm text-gray-500">No assertion steps recorded.</p>
+    <div className="space-y-6">
+      {/* Overall score */}
+      <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+        <div className="flex items-center gap-4">
+          <span className="text-sm font-medium text-gray-400">Overall score</span>
+          <div className="flex-1">
+            <ScoreBar score={result.score} />
+          </div>
+        </div>
+      </div>
+
+      {/* Per-evaluator scores */}
+      {result.scores && result.scores.length > 0 && (
+        <div className="rounded-lg border border-gray-800 bg-gray-900 p-4">
+          <h4 className="mb-3 text-sm font-medium text-gray-400">Evaluator Scores</h4>
+          <div className="space-y-3">
+            {result.scores.map((s, i) => (
+              <div key={`${s.name ?? s.type ?? i}`} className="flex items-center gap-4">
+                <span className="w-40 truncate text-sm text-gray-300">
+                  {s.name ?? s.type ?? `Score ${i + 1}`}
+                </span>
+                <div className="flex-1">
+                  <ScoreBar score={s.score} />
+                </div>
+              </div>
+            ))}
+          </div>
+        </div>
       )}
 
-      {assertions.length > 0 && (
-        <div className="space-y-2">
-          {assertions.map((a) => (
-            <div
-              key={`${a.text}-${a.passed}`}
-              className={`flex items-start gap-3 rounded-lg border p-3 ${
-                a.passed
-                  ? 'border-emerald-900/50 bg-emerald-950/20'
-                  : 'border-red-900/50 bg-red-950/20'
-              }`}
-            >
-              <span className={`mt-0.5 text-lg ${a.passed ? 'text-emerald-400' : 'text-red-400'}`}>
-                {a.passed ? '\u2713' : '\u2717'}
-              </span>
-              <div className="min-w-0 flex-1">
-                <p className="text-sm text-gray-200">
-                  {a.text}
-                  {a.durationMs != null && (
-                    <span className="ml-2 text-xs text-gray-500">
-                      ({(a.durationMs / 1000).toFixed(1)}s)
-                    </span>
-                  )}
-                </p>
-                {a.evidence && <p className="mt-1 text-xs text-gray-400">{a.evidence}</p>}
+      {/* Assertions */}
+      {useGrouped ? (
+        <div className="space-y-6">
+          {scoresWithAssertions.map((s, si) => {
+            const graderLabel = s.name ?? s.type ?? `Evaluator ${si + 1}`;
+            return (
+              <div key={`${s.name ?? s.type ?? si}`} className="space-y-2">
+                <h4 className="text-xs font-semibold uppercase tracking-wider text-gray-500">
+                  {graderLabel}
+                </h4>
+                {s.assertions.map((a, ai) => (
+                  <AssertionCard
+                    key={`${a.text}-${ai}`}
+                    assertion={a}
+                    graderLabel={s.type ?? undefined}
+                  />
+                ))}
               </div>
-            </div>
+            );
+          })}
+        </div>
+      ) : topLevelAssertions.length > 0 ? (
+        <div className="space-y-2">
+          {topLevelAssertions.map((a, i) => (
+            <AssertionCard key={`${a.text}-${i}`} assertion={a} />
           ))}
         </div>
+      ) : (
+        <p className="text-sm text-gray-500">No assertion steps recorded.</p>
       )}
 
-      {/* Failure reason section */}
+      {/* Failure reasons */}
       {hasFailed && failureReasons.length > 0 && (
         <div className="rounded-lg border border-red-500/30 bg-red-500/10 p-4">
           <h4 className="mb-2 text-sm font-medium text-red-400">Failure Reason</h4>

From 2a8548e713e49f8f9686308ecfc60397b4dad2c6 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Thu, 9 Apr 2026 08:23:06 +0000
Subject: [PATCH 2/2] style(studio): fix biome formatting in EvalDetail

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
---
 apps/studio/src/components/EvalDetail.tsx | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/apps/studio/src/components/EvalDetail.tsx b/apps/studio/src/components/EvalDetail.tsx
index 380ca4ecd..4b3414751 100644
--- a/apps/studio/src/components/EvalDetail.tsx
+++ b/apps/studio/src/components/EvalDetail.tsx
@@ -131,9 +131,7 @@ function AssertionCard({
           : 'border-red-900/50 bg-red-950/20'
       }`}
     >
-      <span
-        className={`mt-0.5 text-lg ${assertion.passed ? 'text-emerald-400' : 'text-red-400'}`}
-      >
+      <span className={`mt-0.5 text-lg ${assertion.passed ? 'text-emerald-400' : 'text-red-400'}`}>
         {assertion.passed ? '\u2713' : '\u2717'}
       </span>
       <div className="min-w-0 flex-1">
@@ -148,9 +146,7 @@ function AssertionCard({
           </p>
           {graderLabel && <GraderPill label={graderLabel} />}
         </div>
-        {assertion.evidence && (
-          <p className="mt-1 text-xs text-gray-400">{assertion.evidence}</p>
-        )}
+        {assertion.evidence && <p className="mt-1 text-xs text-gray-400">{assertion.evidence}</p>}
       </div>
     </div>
   );