Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ INTERNAL_API_SECRET=changeme
# Worker URLs (defaults shown, workers are optional)
CLOUD_AGENT_API_URL=http://localhost:8788
WEBHOOK_AGENT_URL=http://localhost:8793
MODEL_EVAL_INGEST_URL=http://localhost:8798
SESSION_INGEST_WORKER_URL=
CODE_REVIEW_WORKER_URL=mock-url
CODE_REVIEW_WORKER_AUTH_TOKEN=mock-token
Expand Down
2 changes: 1 addition & 1 deletion DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ Copy `.env.local.example` to `.env.local`, then update the following variables i
- `INTERNAL_API_SECRET`: Generate a random secret with `openssl rand -base64 32`
- `STRIPE_SECRET_KEY` and `NEXT_PUBLIC_STRIPE_PUBLISHABLE_KEY`: These must be set to create a fake account. You can use an existing Stripe account or create a new one, and use the keys from Sandbox Mode (formerly Test Mode) here.

Then copy `.env.development.local.example` to `.env.development.local`.
Then run `pnpm dev:env`. It derives `apps/web/.env.development.local` and Worker `.dev.vars` files from `.env.local` plus each `.example` template. Re-run it after pulling changes that add local service URLs or Worker env vars.

These changes will allow you to do local testing with a fake account.

Expand Down
3 changes: 3 additions & 0 deletions apps/web/.env.development.local.example
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ USER_DEPLOYMENTS_DISPATCHER_URL=http://localhost:8799
# @url cloudflare-webhook-agent-ingest
WEBHOOK_AGENT_URL=http://localhost:8793

# @url cloudflare-model-eval-ingest
MODEL_EVAL_INGEST_URL=http://localhost:8798

# @url cloudflare-o11y
O11Y_SERVICE_URL=http://localhost:8801

Expand Down
5 changes: 5 additions & 0 deletions apps/web/src/app/admin/components/AppSidebar.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,11 @@ const analyticsObservabilityItems: MenuItem[] = [
url: '/admin/model-stats',
icon: () => <BarChart />,
},
{
title: () => 'Model Benchmarks',
url: '/admin/model-eval-ingest',
icon: () => <FileSearch />,
},
{
title: () => 'Session Traces',
url: '/admin/session-traces',
Expand Down
217 changes: 217 additions & 0 deletions apps/web/src/app/admin/model-eval-ingest/ModelEvalIngestContent.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
'use client';

import { useState } from 'react';
import { useMutation, useQuery } from '@tanstack/react-query';
import { ExternalLink, RefreshCw } from 'lucide-react';
import { toast } from 'sonner';
import { useTRPC } from '@/lib/trpc/utils';
import { Button } from '@/components/ui/button';
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card';
import {
Table,
TableBody,
TableCell,
TableHead,
TableHeader,
TableRow,
} from '@/components/ui/table';

// Number of ingest-history rows requested per page (passed as `limit` to the list query).
const PAGE_SIZE = 50;

/**
 * Admin view of the model eval ingest history.
 *
 * Renders a paginated table of ingested bench promotions, a "Sync now" button
 * that triggers a sync, and a per-row "Repull" button that re-pulls a single
 * promotion by its bench eval name. Both mutations report their outcome through
 * toasts and refetch the history query on success.
 *
 * When the history query fails, the table placeholder and the footer say so
 * instead of presenting the failure as an empty history or a never-ending
 * "loading" state.
 */
export function ModelEvalIngestContent() {
  const trpc = useTRPC();
  const [page, setPage] = useState(1);
  const historyQuery = useQuery(
    trpc.admin.modelEvalIngest.list.queryOptions({ page, limit: PAGE_SIZE })
  );
  const syncMutation = useMutation(
    trpc.admin.modelEvalIngest.syncNow.mutationOptions({
      onSuccess: result => {
        toast.success(formatSyncToast(result));
        void historyQuery.refetch();
      },
      onError: error => toast.error(error.message || 'Model eval sync failed'),
    })
  );
  const repullMutation = useMutation(
    trpc.admin.modelEvalIngest.repullPromotion.mutationOptions({
      onSuccess: result => {
        toast.success(
          `Promotion re-pull fetched ${result.fetched} records and refreshed ${result.cacheRecomputes} caches`
        );
        void historyQuery.refetch();
      },
      onError: error => toast.error(error.message || 'Promotion re-pull failed'),
    })
  );

  const rows = historyQuery.data?.rows ?? [];
  const pagination = historyQuery.data?.pagination;

  // With no rows to render, distinguish "still loading", "request failed" and
  // "genuinely empty" so a failed fetch is not reported as an empty history.
  let emptyStateMessage = 'No ingested promotions yet.';
  if (historyQuery.isLoading) {
    emptyStateMessage = 'Loading ingest history...';
  } else if (historyQuery.isError) {
    emptyStateMessage = `Failed to load ingest history: ${
      historyQuery.error?.message || 'unknown error'
    }`;
  }

  // Same idea for the footer: without pagination data, only claim "loading"
  // when the query has not failed.
  let rowCountLabel = 'Loading row count...';
  if (pagination) {
    rowCountLabel = `${pagination.total} ingested promotion rows`;
  } else if (historyQuery.isError) {
    rowCountLabel = 'Row count unavailable';
  }

  return (
    <div className="flex w-full flex-col gap-6">
      <div className="flex flex-col justify-between gap-4 lg:flex-row lg:items-start">
        <div className="space-y-2">
          <h2 className="text-2xl font-bold">Model Benchmarks</h2>
          <p className="text-muted-foreground max-w-4xl">
            Audit promoted kilo-bench evals that cloud has pulled, then refresh the public Kilo
            Bench cache on demand. Bench remains the aggregate source; this table is the cloud-side
            ingest history.
          </p>
        </div>
        <Button onClick={() => syncMutation.mutate()} disabled={syncMutation.isPending}>
          <RefreshCw className={`size-4 ${syncMutation.isPending ? 'animate-spin' : ''}`} />
          {syncMutation.isPending ? 'Syncing...' : 'Sync now'}
        </Button>
      </div>

      <Card>
        <CardHeader>
          <CardTitle>Promotion history</CardTitle>
          <CardDescription>
            Rows are append-only and deduplicated by bench eval name. Promoter email and bench links
            stay admin-only here, never in the public model-stats cache.
          </CardDescription>
        </CardHeader>
        <CardContent className="flex flex-col gap-4">
          <div className="overflow-x-auto rounded-lg border">
            <Table>
              <TableHeader>
                <TableRow>
                  <TableHead>Bench eval</TableHead>
                  <TableHead>Model</TableHead>
                  <TableHead>Task</TableHead>
                  <TableHead className="text-right">Score</TableHead>
                  <TableHead className="text-right">Trials</TableHead>
                  <TableHead>Promoted</TableHead>
                  <TableHead>Promoter</TableHead>
                  <TableHead>Ingested</TableHead>
                  <TableHead className="text-right">Action</TableHead>
                </TableRow>
              </TableHeader>
              <TableBody>
                {rows.length === 0 ? (
                  <TableRow>
                    <TableCell colSpan={9} className="text-muted-foreground h-24 text-center">
                      {emptyStateMessage}
                    </TableCell>
                  </TableRow>
                ) : (
                  rows.map(row => (
                    <TableRow key={row.id}>
                      <TableCell className="min-w-64">
                        <a
                          href={row.benchEvalUrl}
                          target="_blank"
                          rel="noreferrer"
                          className="inline-flex max-w-80 items-center gap-1 text-sm text-blue-400 hover:text-blue-300"
                        >
                          <span className="truncate">{row.benchEvalName}</span>
                          <ExternalLink className="size-3 shrink-0" />
                        </a>
                      </TableCell>
                      <TableCell className="min-w-56 font-mono text-xs">
                        <div>{row.model}</div>
                        <div className="text-muted-foreground">
                          {row.provider}
                          {row.variant ? ` / ${row.variant}` : ''}
                        </div>
                      </TableCell>
                      <TableCell>{row.taskSource}</TableCell>
                      <TableCell className="text-right font-mono tabular-nums">
                        {formatScore(row.overallScore)}
                        <div className="text-muted-foreground text-xs">
                          total {formatScore(row.totalScore)}
                        </div>
                      </TableCell>
                      <TableCell className="text-right font-mono tabular-nums">
                        {row.nTotalTrials}
                        <div className="text-muted-foreground text-xs">{row.nErrored} errored</div>
                      </TableCell>
                      <TableCell className="whitespace-nowrap">
                        {formatTimestamp(row.promotedAt)}
                      </TableCell>
                      <TableCell className="whitespace-nowrap text-sm">
                        {row.promotedByEmail}
                      </TableCell>
                      <TableCell className="whitespace-nowrap">
                        {formatTimestamp(row.createdAt)}
                      </TableCell>
                      <TableCell className="text-right">
                        <Button
                          variant="secondary"
                          size="sm"
                          onClick={() =>
                            repullMutation.mutate({ promotionName: row.benchEvalName })
                          }
                          disabled={repullMutation.isPending}
                        >
                          Repull
                        </Button>
                      </TableCell>
                    </TableRow>
                  ))
                )}
              </TableBody>
            </Table>
          </div>

          <div className="flex items-center justify-between gap-3 text-sm">
            <p className="text-muted-foreground">{rowCountLabel}</p>
            <div className="flex gap-2">
              <Button
                variant="secondary"
                onClick={() => setPage(current => Math.max(1, current - 1))}
                disabled={page <= 1 || historyQuery.isFetching}
              >
                Previous
              </Button>
              <Button
                variant="secondary"
                onClick={() => setPage(current => current + 1)}
                disabled={!pagination || page >= pagination.totalPages || historyQuery.isFetching}
              >
                Next
              </Button>
            </div>
          </div>
        </CardContent>
      </Card>
    </div>
  );
}

/**
 * Renders a date string for display.
 *
 * The value is handed to the `Date` constructor and formatted with
 * `toLocaleString()` using no explicit locale or options.
 */
function formatTimestamp(value: string): string {
  const parsed = new Date(value);
  return parsed.toLocaleString();
}

/** Formats a score as a fixed-point string with four digits after the decimal point. */
function formatScore(value: number): string {
  const fractionDigits = 4;
  return value.toFixed(fractionDigits);
}

/**
 * Builds the success toast text for a sync result.
 *
 * - New rows inserted: reports the inserted count, plus the rechecked count
 *   when some promotions already existed.
 * - Nothing inserted but some already present: reports "up to date" with the
 *   rechecked count.
 * - Otherwise: reports "up to date" with the number of promotions returned.
 */
function formatSyncToast(result: {
  inserted: number;
  alreadyHad: number;
  fetched: number;
}): string {
  const { inserted, alreadyHad, fetched } = result;
  const hasInserted = inserted > 0;
  const hasExisting = alreadyHad > 0;

  if (!hasInserted) {
    const detail = hasExisting
      ? `${formatCount(alreadyHad, 'existing promotion')} rechecked.`
      : `${formatCount(fetched, 'promotion')} returned.`;
    return `Bench sync is up to date; ${detail}`;
  }

  const insertedSentence = `Bench sync inserted ${formatCount(inserted, 'new promotion')}.`;
  if (!hasExisting) {
    return insertedSentence;
  }
  return `${insertedSentence} ${formatCount(alreadyHad, 'existing promotion')} rechecked.`;
}

/**
 * Joins a count with its label, appending "s" to the label unless the count
 * is exactly 1 (e.g. `1 promotion`, `0 promotions`).
 */
function formatCount(count: number, label: string): string {
  const noun = count === 1 ? label : label + 's';
  return `${count} ${noun}`;
}
17 changes: 17 additions & 0 deletions apps/web/src/app/admin/model-eval-ingest/page.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { BreadcrumbItem, BreadcrumbPage } from '@/components/ui/breadcrumb';
import AdminPage from '@/app/admin/components/AdminPage';
import { ModelEvalIngestContent } from './ModelEvalIngestContent';

// Static breadcrumb trail for this page: a single "Model Benchmarks" entry.
const breadcrumbs = (
<BreadcrumbItem>
<BreadcrumbPage>Model Benchmarks</BreadcrumbPage>
</BreadcrumbItem>
);

/**
 * Page component for the model eval ingest admin route.
 * Wraps ModelEvalIngestContent in the AdminPage layout with the page breadcrumbs.
 */
export default function ModelEvalIngestPage() {
return (
<AdminPage breadcrumbs={breadcrumbs}>
<ModelEvalIngestContent />
</AdminPage>
);
}
3 changes: 3 additions & 0 deletions apps/web/src/lib/config.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,9 @@ export const STRIPE_KILOCLAW_EARLYBIRD_COUPON_ID = getEnvVariable(
export const WEBHOOK_AGENT_URL =
getEnvVariable('WEBHOOK_AGENT_URL') || 'https://hooks.kilosessions.ai';

// Model eval ingest Worker base URL; falls back to an empty string when getEnvVariable returns a falsy value
export const MODEL_EVAL_INGEST_URL = getEnvVariable('MODEL_EVAL_INGEST_URL') || '';

// Session ingest worker (public share proxy)
export const SESSION_INGEST_WORKER_URL = getEnvVariable('SESSION_INGEST_WORKER_URL') || '';

Expand Down
45 changes: 45 additions & 0 deletions apps/web/src/lib/model-eval-ingest-client.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import 'server-only';
import * as z from 'zod';
import { INTERNAL_API_SECRET, MODEL_EVAL_INGEST_URL } from '@/lib/config.server';

// Shape of a successful sync response: a literal `success: true` plus
// non-negative integer counters. OK responses are parsed against this schema.
const ModelEvalSyncResultSchema = z.object({
success: z.literal(true),
inserted: z.number().int().nonnegative(),
alreadyHad: z.number().int().nonnegative(),
cacheRecomputes: z.number().int().nonnegative(),
fetched: z.number().int().nonnegative(),
});
// Shape used to pull an optional `error` message out of a non-OK response body.
const ModelEvalSyncErrorSchema = z.object({ error: z.string().optional() });

// Validated sync result, inferred from ModelEvalSyncResultSchema.
export type ModelEvalSyncResult = z.infer<typeof ModelEvalSyncResultSchema>;

// JSON request body sent to the sync endpoint; `promotionName` is optional.
type ModelEvalSyncRequest = {
promotionName?: string;
};

/**
 * Calls the model eval ingest Worker's `POST /internal/sync` endpoint and
 * returns its validated result.
 *
 * @param request - Serialized as the JSON request body. Defaults to `{}`.
 * @returns The response body validated against `ModelEvalSyncResultSchema`.
 * @throws Error when `MODEL_EVAL_INGEST_URL` is not configured.
 * @throws Error when the Worker responds with a non-OK status. The message is
 *   the body's `error` field when present, otherwise `HTTP <status>` — also
 *   when the error body is not JSON (e.g. an HTML gateway error page).
 * @throws Error when an OK response does not carry a JSON body.
 * @throws ZodError when an OK JSON body does not match the result schema.
 */
export async function syncModelEvalPromotions(
  request: ModelEvalSyncRequest = {}
): Promise<ModelEvalSyncResult> {
  if (!MODEL_EVAL_INGEST_URL) {
    throw new Error('MODEL_EVAL_INGEST_URL is not configured');
  }

  // Tolerate a trailing slash in the configured base URL so the request path
  // never becomes `//internal/sync`.
  const baseUrl = MODEL_EVAL_INGEST_URL.replace(/\/+$/, '');

  const response = await fetch(`${baseUrl}/internal/sync`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'x-internal-api-key': INTERNAL_API_SECRET,
    },
    body: JSON.stringify(request),
  });

  // The body may not be JSON (proxy/gateway error pages, empty bodies). Parse
  // defensively so the HTTP status is reported instead of a JSON syntax error.
  // `undefined` can never come out of a successful JSON parse, so it doubles
  // as the "could not parse" marker.
  let body: unknown;
  try {
    body = await response.json();
  } catch {
    body = undefined;
  }

  if (!response.ok) {
    const errorBody = ModelEvalSyncErrorSchema.safeParse(body);
    throw new Error(
      errorBody.success && errorBody.data.error ? errorBody.data.error : `HTTP ${response.status}`
    );
  }

  if (body === undefined) {
    throw new Error(
      `Model eval ingest Worker returned a non-JSON response (HTTP ${response.status})`
    );
  }

  return ModelEvalSyncResultSchema.parse(body);
}
Loading