Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions apps/cli/src/commands/results/run-tags.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* Per-run tag sidecar file helpers.
*
* Tags are stored as a `tags.json` sidecar next to the run's `index.jsonl`
* manifest. The sidecar is optional, mutable, and non-breaking — absence
* means the run has no user-assigned tags.
*
* Wire format (stored on disk):
* ```json
* { "tags": ["baseline", "v2-prompt"], "updated_at": "2026-04-10T00:00:00.000Z" }
* ```
*
* Used by the Studio compare API so users can retroactively tag runs
* without changing the eval YAML or the run manifest itself. This mirrors
* the Langfuse / W&B / GitHub `tags` pattern — a mutable multi-valued
* list of free-form labels that lives alongside the immutable run_id.
*
* Validation rules:
* - Each tag is 1–60 characters after trimming
* - No control characters (\n, \t, DEL, etc.)
* - Tags are deduplicated case-sensitively
* - A run can have at most 20 tags
* - Writing an empty array removes the sidecar file
*
* To extend (e.g. add colored labels or descriptions): add optional fields
* to `RunTagsFile` and keep the schema additive so older files still parse.
*/

import { existsSync, readFileSync, unlinkSync, writeFileSync } from 'node:fs';
import path from 'node:path';

/**
 * File name of the per-run tags sidecar. `runTagsPath` joins it onto the
 * directory that contains the run manifest.
 */
export const RUN_TAGS_FILENAME = 'tags.json';

/**
 * Maximum number of tags per run. `normalizeTags` applies the limit after
 * trimming, dropping blank entries, and deduplicating.
 */
export const MAX_TAGS_PER_RUN = 20;

/**
 * Maximum length of a single tag after trimming, compared against
 * `String.prototype.length` (i.e. UTF-16 code units).
 */
export const MAX_TAG_LENGTH = 60;

/**
 * Shape of the `tags.json` sidecar, both as serialized by `writeRunTags`
 * and as returned by `readRunTags`.
 */
export interface RunTagsFile {
/**
 * Ordered list of user-assigned tags. `writeRunTags` stores them trimmed and
 * deduplicated; `readRunTags` only drops non-string and blank entries and
 * otherwise returns the stored strings as-is.
 */
tags: string[];
/**
 * ISO-8601 timestamp of last update. Set from `new Date().toISOString()` on
 * write; `readRunTags` yields an empty string when the stored value is not
 * a string.
 */
updated_at: string;
}

/**
 * Locate the tags sidecar that belongs to a run.
 *
 * @param manifestPath - Path of the run manifest (`index.jsonl`).
 * @returns `RUN_TAGS_FILENAME` joined onto the manifest's directory.
 */
export function runTagsPath(manifestPath: string): string {
  const runDir = path.dirname(manifestPath);
  return path.join(runDir, RUN_TAGS_FILENAME);
}

/**
 * Load the tags sidecar for a run.
 *
 * The reader is deliberately lenient: entries that are not strings, or that
 * are blank after trimming, are dropped, and the remaining strings are
 * returned exactly as stored (no trimming or deduplication happens here).
 *
 * @param manifestPath - Path of the run manifest (`index.jsonl`).
 * @returns The stored tags plus `updated_at` (empty string when the stored
 *   value is not a string), or `undefined` when the sidecar is missing,
 *   cannot be read or parsed, is not an object with a `tags` array, or holds
 *   no usable tag.
 */
export function readRunTags(manifestPath: string): RunTagsFile | undefined {
  const sidecarPath = runTagsPath(manifestPath);
  if (!existsSync(sidecarPath)) {
    return undefined;
  }

  try {
    const payload: unknown = JSON.parse(readFileSync(sidecarPath, 'utf8'));
    if (typeof payload !== 'object' || payload === null) {
      return undefined;
    }

    const { tags: storedTags, updated_at: storedUpdatedAt } = payload as Record<string, unknown>;
    if (!Array.isArray(storedTags)) {
      return undefined;
    }

    const tags: string[] = [];
    for (const candidate of storedTags) {
      if (typeof candidate === 'string' && candidate.trim().length > 0) {
        tags.push(candidate);
      }
    }
    if (tags.length === 0) {
      return undefined;
    }

    const updated_at = typeof storedUpdatedAt === 'string' ? storedUpdatedAt : '';
    return { tags, updated_at };
  } catch {
    // Unreadable or malformed sidecar: treat the run as untagged.
    return undefined;
  }
}

/**
 * Persist the tags of a run, replacing whatever was stored before.
 *
 * The input is passed through `normalizeTags` first. When nothing is left
 * after normalization the sidecar is removed instead of being written.
 *
 * @param manifestPath - Path of the run manifest (`index.jsonl`).
 * @param tags - Desired tags; an empty (or all-blank) array clears the run's tags.
 * @returns The entry that was written, or `null` when the sidecar was removed.
 * @throws Whatever `normalizeTags` throws for invalid input, and any error
 *   raised by the underlying filesystem calls.
 */
export function writeRunTags(manifestPath: string, tags: readonly string[]): RunTagsFile | null {
  const normalized = normalizeTags(tags);

  if (normalized.length > 0) {
    const sidecar: RunTagsFile = {
      tags: normalized,
      updated_at: new Date().toISOString(),
    };
    const sidecarPath = runTagsPath(manifestPath);
    // Pretty-printed with a trailing newline.
    const serialized = `${JSON.stringify(sidecar, null, 2)}\n`;
    writeFileSync(sidecarPath, serialized, 'utf8');
    return sidecar;
  }

  deleteRunTags(manifestPath);
  return null;
}

/**
 * Delete the tags sidecar of a run.
 *
 * @param manifestPath - Path of the run manifest (`index.jsonl`).
 * Does nothing when no sidecar exists next to the manifest.
 */
export function deleteRunTags(manifestPath: string): void {
  const sidecarPath = runTagsPath(manifestPath);
  if (!existsSync(sidecarPath)) {
    return;
  }
  unlinkSync(sidecarPath);
}

/**
 * Trim, validate, and deduplicate an incoming tag array. Throws on any
 * invalid entry so the caller can surface a user-friendly error.
 *
 * Rules, applied to each entry in order:
 * - it must be a string (checked at runtime because callers may pass
 *   unvalidated request data);
 * - it is trimmed, and entries that are blank after trimming are skipped;
 * - the trimmed value may be at most `MAX_TAG_LENGTH` long;
 * - it must not contain any Unicode control character (general category
 *   `Cc`): C0 controls, DEL, and the C1 controls U+0080–U+009F. C1 includes
 *   NEL (U+0085), which `String.prototype.trim` does not strip;
 * - duplicates (case-sensitive) are dropped, keeping the first occurrence.
 *
 * @param tags - Raw tags as supplied by the caller.
 * @returns The surviving tags in first-seen order.
 * @throws Error when an entry is not a string, is too long, contains a
 *   control character, or when more than `MAX_TAGS_PER_RUN` distinct tags remain.
 */
function normalizeTags(tags: readonly string[]): string[] {
  // Control characters (newlines, tabs, DEL, NEL, etc.) break column headers
  // in compare views and confuse test assertions. No `g` flag, so `test` is
  // stateless and the regex can be shared across iterations.
  const controlChar = /\p{Cc}/u;

  // A Set iterates in insertion order, so it doubles as the ordered output.
  const unique = new Set<string>();

  for (const raw of tags) {
    if (typeof raw !== 'string') {
      throw new Error('Tags must be strings');
    }
    const trimmed = raw.trim();
    if (trimmed === '') continue;
    if (trimmed.length > MAX_TAG_LENGTH) {
      throw new Error(`Tag "${trimmed.slice(0, 20)}…" exceeds ${MAX_TAG_LENGTH} characters`);
    }
    if (controlChar.test(trimmed)) {
      throw new Error('Tag must not contain control characters');
    }
    unique.add(trimmed);
  }

  // The limit applies to distinct tags, i.e. after blanks and duplicates are gone.
  if (unique.size > MAX_TAGS_PER_RUN) {
    throw new Error(`Too many tags (max ${MAX_TAGS_PER_RUN})`);
  }
  return [...unique];
}
147 changes: 144 additions & 3 deletions apps/cli/src/commands/results/serve.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ import {
listMergedResultFiles,
syncRemoteResults,
} from './remote.js';
import { deleteRunTags, readRunTags, writeRunTags } from './run-tags.js';
import { type StudioConfig, loadStudioConfig, saveStudioConfig } from './studio-config.js';

// ── Source resolution ────────────────────────────────────────────────────
Expand Down Expand Up @@ -273,6 +274,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
} catch {
// ignore enrichment errors
}
const tagsEntry = readRunTags(m.path);
return {
filename: m.filename,
display_name: m.displayName,
Expand All @@ -285,6 +287,7 @@ async function handleRuns(c: C, { searchDir, agentvDir }: DataContext) {
source: m.source,
...(target && { target }),
...(experiment && { experiment }),
...(tagsEntry && { tags: tagsEntry.tags }),
};
}),
});
Expand Down Expand Up @@ -551,7 +554,7 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
const { runs: metas } = await listMergedResultFiles(searchDir);
const { threshold: pass_threshold } = loadStudioConfig(agentvDir);

// Collect per-test-case results keyed by experiment × target
// Collect per-test-case results keyed by experiment × target (aggregated view)
const cellMap = new Map<
string,
{
Expand All @@ -569,17 +572,54 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
}
>();

// Per-run entries (per-run view). Each run workspace contributes exactly
// one entry, independent of the aggregated matrix.
const runEntries: Array<{
run_id: string;
started_at: string;
experiment: string;
target: string;
tags?: string[];
source: 'local' | 'remote';
eval_count: number;
passed_count: number;
pass_rate: number;
avg_score: number;
tests: Array<{
test_id: string;
score: number;
passed: boolean;
execution_status?: string;
}>;
}> = [];

const experimentsSet = new Set<string>();
const targetsSet = new Set<string>();
const MAX_TESTS_PER_CELL = 100;

for (const m of metas) {
try {
const records = loadLightweightResults(m.path);
const runTestMap = new Map<
string,
{ test_id: string; score: number; passed: boolean; execution_status?: string }
>();
let runEvalCount = 0;
let runPassedCount = 0;
let runScoreSum = 0;
let runExperiment = 'default';
let runTarget = 'default';
let runStartedAt = m.timestamp;

for (const r of records) {
const experiment = r.experiment ?? 'default';
const target = r.target ?? 'default';
experimentsSet.add(experiment);
targetsSet.add(target);
runExperiment = experiment;
runTarget = target;
if (r.timestamp && r.timestamp < runStartedAt) runStartedAt = r.timestamp;

const key = JSON.stringify([experiment, target]);
const entry = cellMap.get(key) ?? {
experiment,
Expand All @@ -600,14 +640,41 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
execution_status: r.executionStatus,
});
cellMap.set(key, entry);

// Per-run accumulation. Dedupe tests within the run by last-wins.
runTestMap.set(r.testId, {
test_id: r.testId,
score: r.score,
passed,
execution_status: r.executionStatus,
});
runEvalCount++;
if (passed) runPassedCount++;
runScoreSum += r.score;
}

if (runEvalCount === 0) continue;

const runTests = [...runTestMap.values()].slice(-MAX_TESTS_PER_CELL);
const tagsEntry = readRunTags(m.path);
runEntries.push({
run_id: m.filename,
started_at: runStartedAt,
experiment: runExperiment,
target: runTarget,
...(tagsEntry && { tags: tagsEntry.tags }),
source: m.source,
eval_count: runEvalCount,
passed_count: runPassedCount,
pass_rate: runPassedCount / runEvalCount,
avg_score: runScoreSum / runEvalCount,
tests: runTests,
});
} catch {
// skip runs that fail to load
}
}

const MAX_TESTS_PER_CELL = 100;

const cells = [...cellMap.values()].map((entry) => {
// Deduplicate tests: keep only the latest entry per test_id (last wins by insertion order)
const dedupMap = new Map<string, (typeof entry.tests)[number]>();
Expand All @@ -630,10 +697,14 @@ async function handleCompare(c: C, { searchDir, agentvDir }: DataContext) {
};
});

// Per-run entries sorted by timestamp descending (newest first).
runEntries.sort((a, b) => b.started_at.localeCompare(a.started_at));

return c.json({
experiments: [...experimentsSet].sort(),
targets: [...targetsSet].sort(),
cells,
runs: runEntries,
});
}

Expand Down Expand Up @@ -702,6 +773,52 @@ function handleFeedbackRead(c: C, { searchDir }: DataContext) {
return c.json(readFeedback(existsSync(resultsDir) ? resultsDir : searchDir));
}

/**
 * PUT handler that replaces the tags of a run with the `tags` array from the
 * JSON request body.
 *
 * Responses:
 * - 404 when `findRunById` yields no run for the `filename` route parameter;
 * - 400 when the run's `source` is `'remote'`, the body is not valid JSON,
 *   is not an object, lacks a `tags` array, or the tags are rejected by
 *   `writeRunTags`;
 * - 500 when `writeRunTags` fails with a filesystem error;
 * - otherwise `{ tags, updated_at }`. When the tags were cleared
 *   (`writeRunTags` returned `null`), `tags` is `[]` and `updated_at` is the
 *   current time.
 */
async function handleRunTagsPut(c: C, { searchDir }: DataContext) {
  const filename = c.req.param('filename') ?? '';
  const meta = await findRunById(searchDir, filename);
  if (!meta) return c.json({ error: 'Run not found' }, 404);
  if (meta.source === 'remote') {
    return c.json({ error: 'Tags can only be set on local runs' }, 400);
  }

  let body: unknown;
  try {
    body = await c.req.json();
  } catch {
    return c.json({ error: 'Invalid JSON' }, 400);
  }
  if (!body || typeof body !== 'object') {
    return c.json({ error: 'Invalid payload' }, 400);
  }
  const tags = (body as Record<string, unknown>).tags;
  if (!Array.isArray(tags)) {
    return c.json({ error: 'Missing tags array' }, 400);
  }

  try {
    // Element types are not checked here; writeRunTags validates each entry
    // at runtime and throws on anything that is not a valid tag.
    const entry = writeRunTags(meta.path, tags as string[]);
    return c.json({
      tags: entry?.tags ?? [],
      updated_at: entry?.updated_at ?? new Date().toISOString(),
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Node filesystem errors carry a string `code` (EACCES, ENOSPC, ...),
    // while tag-validation failures are plain Errors without one. Only the
    // latter are the client's fault, so everything with a code is a 500,
    // matching how the DELETE handler reports filesystem failures.
    const isFilesystemError =
      err instanceof Error && typeof (err as Error & { code?: unknown }).code === 'string';
    if (isFilesystemError) {
      return c.json({ error: message }, 500);
    }
    return c.json({ error: message }, 400);
  }
}

/**
 * DELETE handler that removes the tags of a run.
 *
 * Responds 404 when `findRunById` yields no run for the `filename` route
 * parameter, 400 when the run's `source` is `'remote'`, 500 with the error
 * message when `deleteRunTags` throws, and `{ ok: true }` otherwise.
 */
async function handleRunTagsDelete(c: C, { searchDir }: DataContext) {
  const runId = c.req.param('filename') ?? '';
  const run = await findRunById(searchDir, runId);

  if (!run) {
    return c.json({ error: 'Run not found' }, 404);
  }

  const isRemote = run.source === 'remote';
  if (isRemote) {
    return c.json({ error: 'Tags can only be removed on local runs' }, 400);
  }

  try {
    deleteRunTags(run.path);
    return c.json({ ok: true });
  } catch (failure) {
    const { message } = failure as Error;
    return c.json({ error: message }, 500);
  }
}

// ── Hono app factory ─────────────────────────────────────────────────────

/**
Expand Down Expand Up @@ -934,6 +1051,18 @@ export function createApp(
app.get('/api/remote/status', async (c) => c.json(await getRemoteResultsStatus(searchDir)));
app.post('/api/remote/sync', async (c) => c.json(await syncRemoteResults(searchDir)));
app.get('/api/runs', (c) => handleRuns(c, defaultCtx));
app.put('/api/runs/:filename/tags', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
return handleRunTagsPut(c, defaultCtx);
});
app.delete('/api/runs/:filename/tags', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
return handleRunTagsDelete(c, defaultCtx);
});
app.get('/api/runs/:filename', (c) => handleRunDetail(c, defaultCtx));
app.get('/api/runs/:filename/suites', (c) => handleRunSuites(c, defaultCtx));
app.get('/api/runs/:filename/categories', (c) => handleRunCategories(c, defaultCtx));
Expand Down Expand Up @@ -1046,6 +1175,18 @@ export function createApp(
withBenchmark(c, async (ctx, dataCtx) => ctx.json(await syncRemoteResults(dataCtx.searchDir))),
);
app.get('/api/benchmarks/:benchmarkId/runs', (c) => withBenchmark(c, handleRuns));
app.put('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
return withBenchmark(c, handleRunTagsPut);
});
app.delete('/api/benchmarks/:benchmarkId/runs/:filename/tags', (c) => {
if (readOnly) {
return c.json({ error: 'Studio is running in read-only mode' }, 403);
}
return withBenchmark(c, handleRunTagsDelete);
});
app.get('/api/benchmarks/:benchmarkId/runs/:filename', (c) => withBenchmark(c, handleRunDetail));
app.get('/api/benchmarks/:benchmarkId/runs/:filename/suites', (c) =>
withBenchmark(c, handleRunSuites),
Expand Down
Loading
Loading