Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions packages/core/data/arena-code.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"meta": {
"lastUpdated": "2026-06-01",
"source": "https://lmarena.ai/",
"description": "Code benchmark Elo scores from Chatbot Arena"
},
"models": [
{ "model": "claude-sonnet-4-20250514", "rawName": "Claude Sonnet 4", "provider": "anthropic", "elo": 1480, "95thPercentile": 1420, "votes": 95000, "organization": "Anthropic", "license": "Proprietary", "ranking": 1, "lastUpdated": "2026-06-01" },
{ "model": "claude-opus-4-6", "rawName": "Claude Opus 4", "provider": "anthropic", "elo": 1475, "95thPercentile": 1408, "votes": 78000, "organization": "Anthropic", "license": "Proprietary", "ranking": 2, "lastUpdated": "2026-06-01" },
{ "model": "gpt-5.4", "rawName": "GPT-5.4", "provider": "openai", "elo": 1468, "95thPercentile": 1400, "votes": 88000, "organization": "OpenAI", "license": "Proprietary", "ranking": 3, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-pro", "rawName": "DeepSeek V4 Pro", "provider": "deepseek", "elo": 1445, "95thPercentile": 1375, "votes": 72000, "organization": "DeepSeek", "license": "MIT", "ranking": 6, "lastUpdated": "2026-06-01" },
{ "model": "o4-mini", "rawName": "o4-mini", "provider": "openai", "elo": 1440, "95thPercentile": 1378, "votes": 65000, "organization": "OpenAI", "license": "Proprietary", "ranking": 4, "lastUpdated": "2026-06-01" },
{ "model": "gemini-3-1-pro", "rawName": "Gemini 3.1 Pro", "provider": "google", "elo": 1430, "95thPercentile": 1360, "votes": 74000, "organization": "Google", "license": "Proprietary", "ranking": 8, "lastUpdated": "2026-06-01" },
{ "model": "gpt-4o", "rawName": "GPT-4o", "provider": "openai", "elo": 1425, "95thPercentile": 1355, "votes": 85000, "organization": "OpenAI", "license": "Proprietary", "ranking": 5, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-flash", "rawName": "DeepSeek V4 Flash", "provider": "deepseek", "elo": 1415, "95thPercentile": 1345, "votes": 58000, "organization": "DeepSeek", "license": "MIT", "ranking": 7, "lastUpdated": "2026-06-01" },
{ "model": "xiaomi/mimo-v2-pro", "rawName": "Xiaomi MiMo V2 Pro", "provider": "openrouter", "elo": 1398, "95thPercentile": 1325, "votes": 10000, "organization": "Xiaomi", "license": "MIT", "ranking": 10, "lastUpdated": "2026-06-01" },
{ "model": "gemini-2.5-flash", "rawName": "Gemini 2.5 Flash", "provider": "google", "elo": 1395, "95thPercentile": 1320, "votes": 75000, "organization": "Google", "license": "Proprietary", "ranking": 9, "lastUpdated": "2026-06-01" },
{ "model": "moonshotai/Kimi-K2.5", "rawName": "Kimi K2.5", "provider": "siliconflow", "elo": 1385, "95thPercentile": 1310, "votes": 22000, "organization": "Moonshot AI", "license": "Proprietary", "ranking": 15, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-35B-A3B", "rawName": "Qwen 3.5 35B-A3B", "provider": "siliconflow", "elo": 1380, "95thPercentile": 1305, "votes": 28000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 12, "lastUpdated": "2026-06-01" },
{ "model": "MiniMax-M2.7", "rawName": "MiniMax M2.7", "provider": "minimax", "elo": 1375, "95thPercentile": 1298, "votes": 12000, "organization": "MiniMax", "license": "MIT", "ranking": 16, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-32B-A3B", "rawName": "Qwen 3.5 32B-A3B", "provider": "siliconflow", "elo": 1372, "95thPercentile": 1295, "votes": 18000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 13, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-ai/DeepSeek-Coder-V2", "rawName": "DeepSeek Coder V2", "provider": "siliconflow", "elo": 1370, "95thPercentile": 1290, "votes": 38000, "organization": "DeepSeek", "license": "MIT", "ranking": 11, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-ai/DeepSeek-V3", "rawName": "DeepSeek V3", "provider": "siliconflow", "elo": 1355, "95thPercentile": 1280, "votes": 42000, "organization": "DeepSeek", "license": "MIT", "ranking": 14, "lastUpdated": "2026-06-01" },
{ "model": "glm-5.1", "rawName": "GLM-5.1", "provider": "zai", "elo": 1350, "95thPercentile": 1275, "votes": 14000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 17, "lastUpdated": "2026-06-01" },
{ "model": "MiniMax-M2.5", "rawName": "MiniMax M2.5", "provider": "minimax", "elo": 1348, "95thPercentile": 1272, "votes": 10000, "organization": "MiniMax", "license": "MIT", "ranking": 19, "lastUpdated": "2026-06-01" },
{ "model": "glm-5", "rawName": "GLM-5", "provider": "zai", "elo": 1335, "95thPercentile": 1258, "votes": 11000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 18, "lastUpdated": "2026-06-01" },
{ "model": "glm-4-turbo", "rawName": "GLM-4 Turbo", "provider": "zai", "elo": 1310, "95thPercentile": 1235, "votes": 13000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 20, "lastUpdated": "2026-06-01" }
]
}
29 changes: 29 additions & 0 deletions packages/core/data/arena-text.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"meta": {
"lastUpdated": "2026-06-01",
"source": "https://lmarena.ai/",
"description": "Text benchmark Elo scores from Chatbot Arena"
},
"models": [
{ "model": "claude-sonnet-4-20250514", "rawName": "Claude Sonnet 4", "provider": "anthropic", "elo": 1452, "95thPercentile": 1384, "votes": 125000, "organization": "Anthropic", "license": "Proprietary", "ranking": 1, "lastUpdated": "2026-06-01" },
{ "model": "claude-opus-4-6", "rawName": "Claude Opus 4", "provider": "anthropic", "elo": 1448, "95thPercentile": 1365, "votes": 93000, "organization": "Anthropic", "license": "Proprietary", "ranking": 2, "lastUpdated": "2026-06-01" },
{ "model": "gpt-5.4", "rawName": "GPT-5.4", "provider": "openai", "elo": 1440, "95thPercentile": 1370, "votes": 110000, "organization": "OpenAI", "license": "Proprietary", "ranking": 3, "lastUpdated": "2026-06-01" },
{ "model": "gemini-3-1-pro", "rawName": "Gemini 3.1 Pro", "provider": "google", "elo": 1420, "95thPercentile": 1350, "votes": 87000, "organization": "Google", "license": "Proprietary", "ranking": 6, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-pro", "rawName": "DeepSeek V4 Pro", "provider": "deepseek", "elo": 1410, "95thPercentile": 1338, "votes": 82000, "organization": "DeepSeek", "license": "MIT", "ranking": 8, "lastUpdated": "2026-06-01" },
{ "model": "o4-mini", "rawName": "o4-mini", "provider": "openai", "elo": 1405, "95thPercentile": 1340, "votes": 78000, "organization": "OpenAI", "license": "Proprietary", "ranking": 5, "lastUpdated": "2026-06-01" },
{ "model": "gpt-4o", "rawName": "GPT-4o", "provider": "openai", "elo": 1395, "95thPercentile": 1322, "votes": 98000, "organization": "OpenAI", "license": "Proprietary", "ranking": 4, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-flash", "rawName": "DeepSeek V4 Flash", "provider": "deepseek", "elo": 1390, "95thPercentile": 1320, "votes": 65000, "organization": "DeepSeek", "license": "MIT", "ranking": 9, "lastUpdated": "2026-06-01" },
{ "model": "xiaomi/mimo-v2-pro", "rawName": "Xiaomi MiMo V2 Pro", "provider": "openrouter", "elo": 1385, "95thPercentile": 1310, "votes": 12000, "organization": "Xiaomi", "license": "MIT", "ranking": 12, "lastUpdated": "2026-06-01" },
{ "model": "gemini-2.5-flash", "rawName": "Gemini 2.5 Flash", "provider": "google", "elo": 1378, "95thPercentile": 1305, "votes": 88000, "organization": "Google", "license": "Proprietary", "ranking": 7, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-35B-A3B", "rawName": "Qwen 3.5 35B-A3B", "provider": "siliconflow", "elo": 1370, "95thPercentile": 1295, "votes": 35000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 13, "lastUpdated": "2026-06-01" },
{ "model": "moonshotai/Kimi-K2.5", "rawName": "Kimi K2.5", "provider": "siliconflow", "elo": 1368, "95thPercentile": 1288, "votes": 28000, "organization": "Moonshot AI", "license": "Proprietary", "ranking": 17, "lastUpdated": "2026-06-01" },
{ "model": "MiniMax-M2.7", "rawName": "MiniMax M2.7", "provider": "minimax", "elo": 1365, "95thPercentile": 1280, "votes": 15000, "organization": "MiniMax", "license": "MIT", "ranking": 10, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-32B-A3B", "rawName": "Qwen 3.5 32B-A3B", "provider": "siliconflow", "elo": 1362, "95thPercentile": 1285, "votes": 22000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 14, "lastUpdated": "2026-06-01" },
{ "model": "glm-5.1", "rawName": "GLM-5.1", "provider": "zai", "elo": 1345, "95thPercentile": 1270, "votes": 18000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 18, "lastUpdated": "2026-06-01" },
{ "model": "MiniMax-M2.5", "rawName": "MiniMax M2.5", "provider": "minimax", "elo": 1342, "95thPercentile": 1265, "votes": 12000, "organization": "MiniMax", "license": "MIT", "ranking": 11, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-ai/DeepSeek-Coder-V2", "rawName": "DeepSeek Coder V2", "provider": "siliconflow", "elo": 1340, "95thPercentile": 1260, "votes": 45000, "organization": "DeepSeek", "license": "MIT", "ranking": 16, "lastUpdated": "2026-06-01" },
{ "model": "glm-5", "rawName": "GLM-5", "provider": "zai", "elo": 1330, "95thPercentile": 1255, "votes": 14000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 19, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-ai/DeepSeek-V3", "rawName": "DeepSeek V3", "provider": "siliconflow", "elo": 1328, "95thPercentile": 1250, "votes": 55000, "organization": "DeepSeek", "license": "MIT", "ranking": 15, "lastUpdated": "2026-06-01" },
{ "model": "glm-4-turbo", "rawName": "GLM-4 Turbo", "provider": "zai", "elo": 1305, "95thPercentile": 1230, "votes": 16000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 20, "lastUpdated": "2026-06-01" }
]
}
24 changes: 24 additions & 0 deletions packages/core/data/arena-vision.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"meta": {
"lastUpdated": "2026-06-01",
"source": "https://lmarena.ai/",
"description": "Vision benchmark Elo scores from Chatbot Arena"
},
"models": [
{ "model": "claude-sonnet-4-20250514", "rawName": "Claude Sonnet 4", "provider": "anthropic", "elo": 1465, "95thPercentile": 1398, "votes": 82000, "organization": "Anthropic", "license": "Proprietary", "ranking": 1, "lastUpdated": "2026-06-01" },
{ "model": "claude-opus-4-6", "rawName": "Claude Opus 4", "provider": "anthropic", "elo": 1460, "95thPercentile": 1390, "votes": 68000, "organization": "Anthropic", "license": "Proprietary", "ranking": 2, "lastUpdated": "2026-06-01" },
{ "model": "gpt-5.4", "rawName": "GPT-5.4", "provider": "openai", "elo": 1455, "95thPercentile": 1388, "votes": 75000, "organization": "OpenAI", "license": "Proprietary", "ranking": 3, "lastUpdated": "2026-06-01" },
{ "model": "gemini-3-1-pro", "rawName": "Gemini 3.1 Pro", "provider": "google", "elo": 1438, "95thPercentile": 1368, "votes": 63000, "organization": "Google", "license": "Proprietary", "ranking": 5, "lastUpdated": "2026-06-01" },
{ "model": "o4-mini", "rawName": "o4-mini", "provider": "openai", "elo": 1418, "95thPercentile": 1352, "votes": 55000, "organization": "OpenAI", "license": "Proprietary", "ranking": 15, "lastUpdated": "2026-06-01" },
{ "model": "gpt-4o", "rawName": "GPT-4o", "provider": "openai", "elo": 1410, "95thPercentile": 1345, "votes": 72000, "organization": "OpenAI", "license": "Proprietary", "ranking": 4, "lastUpdated": "2026-06-01" },
{ "model": "xiaomi/mimo-v2-pro", "rawName": "Xiaomi MiMo V2 Pro", "provider": "openrouter", "elo": 1395, "95thPercentile": 1320, "votes": 8000, "organization": "Xiaomi", "license": "MIT", "ranking": 9, "lastUpdated": "2026-06-01" },
{ "model": "gemini-2.5-flash", "rawName": "Gemini 2.5 Flash", "provider": "google", "elo": 1392, "95thPercentile": 1315, "votes": 65000, "organization": "Google", "license": "Proprietary", "ranking": 6, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-flash", "rawName": "DeepSeek V4 Flash", "provider": "deepseek", "elo": 1365, "95thPercentile": 1290, "votes": 48000, "organization": "DeepSeek", "license": "MIT", "ranking": 8, "lastUpdated": "2026-06-01" },
{ "model": "moonshotai/Kimi-K2.5", "rawName": "Kimi K2.5", "provider": "siliconflow", "elo": 1365, "95thPercentile": 1285, "votes": 18000, "organization": "Moonshot AI", "license": "Proprietary", "ranking": 14, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-35B-A3B", "rawName": "Qwen 3.5 35B-A3B", "provider": "siliconflow", "elo": 1358, "95thPercentile": 1282, "votes": 22000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 10, "lastUpdated": "2026-06-01" },
{ "model": "Qwen/Qwen3.5-32B-A3B", "rawName": "Qwen 3.5 32B-A3B", "provider": "siliconflow", "elo": 1350, "95thPercentile": 1272, "votes": 15000, "organization": "Alibaba", "license": "Apache-2.0", "ranking": 11, "lastUpdated": "2026-06-01" },
{ "model": "glm-5.1", "rawName": "GLM-5.1", "provider": "zai", "elo": 1338, "95thPercentile": 1260, "votes": 12000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 12, "lastUpdated": "2026-06-01" },
{ "model": "glm-5", "rawName": "GLM-5", "provider": "zai", "elo": 1325, "95thPercentile": 1248, "votes": 9000, "organization": "Zhipu AI", "license": "Proprietary", "ranking": 13, "lastUpdated": "2026-06-01" },
{ "model": "deepseek-v4-pro", "rawName": "DeepSeek V4 Pro", "provider": "deepseek", "elo": 1380, "95thPercentile": 1310, "votes": 58000, "organization": "DeepSeek", "license": "MIT", "ranking": 7, "lastUpdated": "2026-06-01" }
]
}
101 changes: 101 additions & 0 deletions packages/core/scripts/update-arena-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# update-arena-data.sh — Update Arena benchmark data from lmarena.ai
#
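# Downloads the latest Chatbot Arena Elo leaderboards and, for each one whose
# response parses as JSON, copies it over the corresponding data file used by
# ModelScoreService for model quality estimation. A leaderboard that cannot be
# fetched or parsed leaves its existing data file untouched.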
#
# Usage: ./scripts/update-arena-data.sh
#
# Files updated:
# data/arena-text.json — Text benchmark Elo scores
# data/arena-code.json — Code benchmark Elo scores
# data/arena-vision.json — Vision benchmark Elo scores
# ---------------------------------------------------------------------------
set -euo pipefail

# Absolute directory containing this script. DATA_DIR is derived from it so
# the script writes to the same place regardless of the caller's working
# directory. `--` keeps a path beginning with '-' from being read as an option.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly DATA_DIR="$SCRIPT_DIR/../data"

# --- Configuration ---
# Upstream leaderboard endpoints, one per benchmark category.
readonly ARENA_TEXT_URL="https://lmarena.ai/data/leaderboard_text.json"
readonly ARENA_CODE_URL="https://lmarena.ai/data/leaderboard_code.json"
readonly ARENA_VISION_URL="https://lmarena.ai/data/leaderboard_vision.json"
# Per-attempt time limit in seconds, handed to curl's --max-time.
readonly TIMEOUT_SEC=30
# Number of download attempts made per URL before giving up.
readonly MAX_RETRIES=3

# Write a message to stdout, prefixed with the local time as
# [YYYY-MM-DD HH:MM:SS]. All arguments are joined into one line.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Write a message to stderr, prefixed with [ERROR]. All arguments are joined
# into one line.
error() {
  printf '[ERROR] %s\n' "$*" >&2
}

# --- Helper: download with retry ---
# Download a URL to a local file, retrying with a linearly growing pause
# (2s, 4s, ...) between attempts.
#
# Globals:   MAX_RETRIES (read) - total number of attempts
#            TIMEOUT_SEC (read) - per-attempt limit passed to curl --max-time
# Arguments: $1 - URL to fetch
#            $2 - path of the file to write
# Returns:   0 as soon as an attempt succeeds and leaves a non-empty file,
#            1 when every attempt failed
download_file() {
  local url="$1"
  local output="$2"
  local attempt=1

  while (( attempt <= MAX_RETRIES )); do
    # --fail makes HTTP 4xx/5xx a failure instead of saving the error page as
    # if it were data; --location follows redirects. curl's own error text
    # (-S) is left on stderr so the cause of a failed attempt is visible.
    if curl -sS --fail --location --max-time "$TIMEOUT_SEC" "$url" -o "$output" \
      && [[ -s "$output" ]]; then
      return 0
    fi

    # Only announce a retry and wait when another attempt will follow.
    if (( attempt < MAX_RETRIES )); then
      log "Retry $attempt/$MAX_RETRIES for $url"
      sleep $((attempt * 2))
    fi
    attempt=$((attempt + 1))
  done

  return 1
}

# --- Main ---
# Best-effort refresh: each leaderboard is fetched and checked on its own. A
# failure for one is reported on stderr and leaves that data file untouched
# without stopping the others; the script still finishes with status 0.

# Succeed when the file at $1 parses as JSON.
# The path is passed as an argument rather than pasted into the Python source,
# so unusual characters in it cannot break or alter the check. The traceback
# is discarded on purpose: callers report the failure themselves.
is_valid_json() {
  python3 -c 'import json, sys; json.load(open(sys.argv[1], encoding="utf-8"))' \
    "$1" 2>/dev/null
}

# Fetch one leaderboard and install it as $DATA_DIR/arena-<label>.json.
# Globals:   DATA_DIR (read), ARENA_TMP_DIR (read)
# Arguments: $1 - category label (text, code or vision)
#            $2 - URL to download from
# Returns:   0 whether or not the file was updated (best effort)
update_leaderboard() {
  local label="$1"
  local url="$2"
  local target="arena-${label}.json"
  local scratch="$ARENA_TMP_DIR/$target"

  log "Fetching ${label} benchmark data..."

  if ! download_file "$url" "$scratch"; then
    error "Failed to fetch ${label} leaderboard, keeping existing file"
    return 0
  fi

  # Validate JSON before copying so a bad response never replaces good data.
  if ! is_valid_json "$scratch"; then
    error "Invalid JSON from ${label} leaderboard"
    return 0
  fi

  cp -- "$scratch" "$DATA_DIR/$target"
  log "Updated $target"
}

log "Starting Arena data update..."

mkdir -p "$DATA_DIR"

# One scratch directory for all downloads, removed on every exit path
# (including an interrupted run or an abort under `set -e`).
ARENA_TMP_DIR="$(mktemp -d)"
trap 'rm -rf -- "${ARENA_TMP_DIR:?}"' EXIT

update_leaderboard "text" "$ARENA_TEXT_URL"
update_leaderboard "code" "$ARENA_CODE_URL"
update_leaderboard "vision" "$ARENA_VISION_URL"

log "Arena data update complete."
103 changes: 103 additions & 0 deletions packages/core/scripts/update-model-catalog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
# ---------------------------------------------------------------------------
# update-model-catalog.sh — Update model catalog from LiteLLM prices
#
# Fetches the latest LiteLLM model_prices_and_context_window.json and saves
# it as the baseline catalog file used by ModelCatalogService.
#
# Usage: ./scripts/update-model-catalog.sh [--mirror URL]
#
# Options:
# --mirror URL Use a mirror URL instead of the default GitHub raw URL
#
# Files updated:
# data/model-catalog-baseline.json
# ---------------------------------------------------------------------------
set -euo pipefail

# Absolute directory containing this script. DATA_DIR is derived from it so
# the script writes to the same place regardless of the caller's working
# directory. `--` keeps a path beginning with '-' from being read as an option.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly DATA_DIR="$SCRIPT_DIR/../data"

# --- Configuration ---
# Catalog source used when no --mirror URL is given.
readonly DEFAULT_URL="https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
# Per-attempt time limit in seconds, handed to curl's --max-time.
readonly TIMEOUT_SEC=60
# Number of download attempts made before giving up.
readonly MAX_RETRIES=3

# Write a message to stdout, prefixed with the local time as
# [YYYY-MM-DD HH:MM:SS]. All arguments are joined into one line.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Write a message to stderr, prefixed with [ERROR]. All arguments are joined
# into one line.
error() {
  printf '[ERROR] %s\n' "$*" >&2
}

# Parse args
# Recognised options:
#   --mirror URL   fetch the catalog from URL instead of DEFAULT_URL
MIRROR_URL=""

# Parse command-line options into MIRROR_URL.
# Globals:   MIRROR_URL (written)
# Arguments: the script's own arguments ("$@")
# Exits:     status 1, with a message on stderr, for an unknown option or
#            when --mirror is given without a URL
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --mirror)
        # Checked explicitly: under `set -u` a missing value would otherwise
        # abort with a bare "unbound variable" message.
        if (( $# < 2 )); then
          printf 'Option --mirror requires a URL argument\n' >&2
          exit 1
        fi
        MIRROR_URL="$2"
        shift 2
        ;;
      *)
        printf 'Unknown option: %s\n' "$1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# An empty or absent mirror falls back to the default source.
CATALOG_URL="${MIRROR_URL:-$DEFAULT_URL}"

# --- Helper: download with retry + backoff ---
# Download a URL to a local file, retrying with a linearly growing pause
# (5s, 10s, ...) between attempts.
#
# Globals:   MAX_RETRIES (read) - total number of attempts
#            TIMEOUT_SEC (read) - per-attempt limit passed to curl --max-time
# Arguments: $1 - URL to fetch
#            $2 - path of the file to write
# Returns:   0 as soon as an attempt succeeds and leaves a non-empty file,
#            1 when every attempt failed
download_with_retry() {
  local url="$1"
  local output="$2"
  local attempt=1
  local pause

  while (( attempt <= MAX_RETRIES )); do
    log "Downloading (attempt $attempt/$MAX_RETRIES)..."
    # --fail makes HTTP 4xx/5xx a failure instead of saving the error page as
    # if it were the catalog. curl's own error text (-S) is left on stderr so
    # the cause of a failed attempt is visible.
    if curl -sS --fail --max-time "$TIMEOUT_SEC" -L "$url" -o "$output" \
      && [[ -s "$output" ]]; then
      log "Downloaded $(wc -c < "$output") bytes"
      return 0
    fi

    # Only wait when another attempt will follow.
    if (( attempt < MAX_RETRIES )); then
      pause=$((attempt * 5))
      error "Attempt $attempt failed, waiting ${pause}s before retry..."
      sleep "$pause"
    else
      error "Attempt $attempt failed"
    fi
    attempt=$((attempt + 1))
  done

  return 1
}

# --- Main ---
# Download the catalog to a temp file, validate it, and only then copy it over
# data/model-catalog-baseline.json. Any failure is reported on stderr and
# leaves the existing baseline untouched; the script still finishes with
# status 0.

# Validate a downloaded catalog file.
# The path is passed as an argument rather than pasted into the Python source,
# and the file is parsed once.
# Arguments: $1 - path of the file to check
# Outputs:   a short summary on stdout when the file is accepted
# Returns:   0 accepted; 2 not parseable as JSON; 3 parseable but not a
#            non-empty JSON object
validate_catalog() {
  python3 - "$1" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], encoding="utf-8") as fh:
        data = json.load(fh)
except (OSError, ValueError):
    sys.exit(2)

# A bare list, string or empty object is valid JSON but cannot be a catalog.
if not isinstance(data, dict) or not data:
    sys.exit(3)

# A missing sample_spec key is only a warning, not a rejection.
if "sample_spec" in data:
    print("Catalog validated: contains sample_spec")
else:
    print("Warning: no sample_spec found, might be unexpected format")
print(f"Total entries: {len(data)}")
PY
}

log "Starting model catalog update..."

mkdir -p "$DATA_DIR"

TEMP_FILE="$(mktemp)"
# Remove the temp file on every exit path (including an interrupted run or an
# abort under `set -e`).
trap 'rm -f -- "${TEMP_FILE:?}"' EXIT

if download_with_retry "$CATALOG_URL" "$TEMP_FILE"; then
  # Capture the status without tripping `set -e`.
  validation_status=0
  validate_catalog "$TEMP_FILE" || validation_status=$?

  case "$validation_status" in
    0)
      cp -- "$TEMP_FILE" "$DATA_DIR/model-catalog-baseline.json"
      log "Updated model-catalog-baseline.json ($(wc -c < "$DATA_DIR/model-catalog-baseline.json") bytes)"
      ;;
    2)
      error "Invalid JSON from catalog URL, keeping existing file"
      ;;
    *)
      error "Validation failed, keeping existing file"
      ;;
  esac
else
  error "Failed to download catalog after $MAX_RETRIES attempts, keeping existing file"
fi

log "Model catalog update complete."
Loading