From bd97765940bb367123f3fb65bf3eb08c66982b89 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 14 Apr 2026 10:33:30 -0700 Subject: [PATCH] Update judge to gpt 5.4 --- evals/buffbench/judge.ts | 4 ++-- evals/buffbench/main-nightly.ts | 2 +- evals/buffbench/main.ts | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evals/buffbench/judge.ts b/evals/buffbench/judge.ts index 0abe70a86..eea09deba 100644 --- a/evals/buffbench/judge.ts +++ b/evals/buffbench/judge.ts @@ -123,7 +123,7 @@ Provide detailed analysis, strengths, weaknesses, and numerical scores.`, const judgeAgents: Record = { 'judge-gpt': { id: 'judge-gpt', - model: 'openai/gpt-5.1', + model: 'openai/gpt-5.4', ...judgeAgentBase, }, 'judge-gemini': { @@ -133,7 +133,7 @@ const judgeAgents: Record = { }, 'judge-sonnet': { id: 'judge-claude', - model: 'anthropic/claude-sonnet-4.5', + model: 'anthropic/claude-sonnet-4.6', ...judgeAgentBase, }, } diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts index c96685c13..35998fbc2 100644 --- a/evals/buffbench/main-nightly.ts +++ b/evals/buffbench/main-nightly.ts @@ -17,7 +17,7 @@ async function main() { const results = await runBuffBench({ evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free'], - taskConcurrency: 6, + taskConcurrency: 5, saveTraces, }) diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts index 471f6e6db..5508dccbe 100644 --- a/evals/buffbench/main.ts +++ b/evals/buffbench/main.ts @@ -11,7 +11,7 @@ async function main() { await runBuffBench({ evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')], agents: ['base2-free-evals'], - taskConcurrency: 10, + taskConcurrency: 6, saveTraces, })