Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -228,3 +228,6 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

# testing parsed agent traces
output.txt
18 changes: 18 additions & 0 deletions agents/glm5/solve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash

# GLM 5 agent using Claude Code with Z.AI's Anthropic-compatible API.
# Reference: https://docs.z.ai/devpack/tool/claude
# NOTE: at the moment a "Coding Plan" subscription is required to use GLM 5;
# a plain API key does not work with the Anthropic-compatible endpoint.

# Generous timeouts: agent-driven shell commands (training/eval) can run for
# hours, so raise the Bash tool and API timeouts well above the defaults.
export BASH_MAX_TIMEOUT_MS="36000000"
export API_TIMEOUT_MS="3000000"

# Configure Claude Code to use Z.AI's Anthropic-compatible API.
# Both ANTHROPIC_API_KEY and ANTHROPIC_AUTH_TOKEN are pointed at the Z.AI key
# (presumably Claude Code honors either depending on auth path — see the
# qwen3max agent, which does the same).
export ANTHROPIC_API_KEY="${ZAI_API_KEY}"
export ANTHROPIC_AUTH_TOKEN="${ZAI_API_KEY}"
export ANTHROPIC_BASE_URL="https://api.z.ai/api/anthropic"
# AGENT_CONFIG carries the model id; it is reused for the small/fast model so
# every request goes to the same GLM model.
export ANTHROPIC_MODEL="${AGENT_CONFIG}"
export ANTHROPIC_SMALL_FAST_MODEL="${AGENT_CONFIG}"

# Non-interactive run: print mode, JSON event stream, no permission prompts.
claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
    --dangerously-skip-permissions "$PROMPT"
45 changes: 45 additions & 0 deletions agents/opencode/solve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash

# OpenCode agent. OpenCode requires a config file for auto-approval
# permissions and provider setup, so one is generated in the working
# directory before the run.
#
# The heredoc delimiter is quoted ('EOF') so the {env:...} placeholders are
# written literally and resolved by OpenCode itself at runtime, not expanded
# by the shell here. The body is strict JSON — no comments allowed inside.
cat > opencode.json << 'EOF'
{
"$schema": "https://opencode.ai/config.json",
"permission": "allow",
"provider": {
"anthropic": {
"options": {
"apiKey": "{env:ANTHROPIC_API_KEY}"
}
},
"openai": {
"options": {
"apiKey": "{env:OPENAI_API_KEY}"
}
},
"opencode": {
"options": {
"apiKey": "{env:OPENCODE_API_KEY}"
}
},
"zai": {
"npm": "@ai-sdk/openai-compatible",
"name": "Z.AI",
"options": {
"baseURL": "https://api.z.ai/api/paas/v4",
"apiKey": "{env:ZAI_API_KEY}"
},
"models": {
"glm-5": {
"name": "GLM-5"
},
"glm-4.7": {
"name": "GLM-4.7"
}
}
}
}
}
EOF

# AGENT_CONFIG selects provider/model (e.g. "zai/glm-5"); output as JSON.
opencode run --model "$AGENT_CONFIG" --format json "$PROMPT"
23 changes: 23 additions & 0 deletions agents/qwen3max/solve.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash

# Qwen 3 Max Thinking agent using Claude Code with Qwen's Anthropic-compatible API.
# Reference: https://qwen.ai/blog?id=qwen3-max-thinking

# Agent-driven shell commands (training/eval) can run for hours, so raise the
# Bash tool timeout well above the default.
export BASH_MAX_TIMEOUT_MS="36000000"

# Configure Claude Code to use Qwen's Anthropic-compatible API (international endpoint).
# Override ANTHROPIC_API_KEY with the DashScope key (Claude Code checks this first).
export ANTHROPIC_API_KEY="${DASHSCOPE_API_KEY}"
export ANTHROPIC_AUTH_TOKEN="${DASHSCOPE_API_KEY}"
export ANTHROPIC_BASE_URL="https://dashscope-intl.aliyuncs.com/apps/anthropic"
# AGENT_CONFIG carries the model id; reused as the small/fast model so every
# request targets the same Qwen model.
export ANTHROPIC_MODEL="${AGENT_CONFIG}"
export ANTHROPIC_SMALL_FAST_MODEL="${AGENT_CONFIG}"

# Debug: verify all environment variables are set.
# Only presence and length are printed — never the key material itself.
echo "DEBUG: DASHSCOPE_API_KEY is set: ${DASHSCOPE_API_KEY:+yes} (length: ${#DASHSCOPE_API_KEY})"
echo "DEBUG: ANTHROPIC_API_KEY is set: ${ANTHROPIC_API_KEY:+yes} (length: ${#ANTHROPIC_API_KEY})"
echo "DEBUG: ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}"
echo "DEBUG: ANTHROPIC_MODEL=${ANTHROPIC_MODEL}"

# Non-interactive run: print mode, JSON event stream, no permission prompts.
claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
    --dangerously-skip-permissions "$PROMPT"
3 changes: 2 additions & 1 deletion containers/standard.def
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
npm install -g \
@anthropic-ai/claude-code@2.0.55 \
@openai/codex@0.79.0 \
@google/gemini-cli@0.18.4
@google/gemini-cli@0.18.4 \
opencode-ai@1.1.59

uv pip install --system --no-cache ninja packaging

Expand Down
4 changes: 3 additions & 1 deletion containers/vllm_debug.def
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
npm install -g \
@anthropic-ai/claude-code@2.1.34 \
@openai/codex@0.98.0 \
@google/gemini-cli@0.18.4
@google/gemini-cli@0.18.4 \
opencode-ai@1.1.59



# install inspect evals
Expand Down
179 changes: 179 additions & 0 deletions dev_utils/check_missing_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Check for missing runs across agents.

For each agent, checks if runs for each (model, benchmark) combination are present.
Also identifies runs that exist but don't have CUDA available.
"""
import os
import argparse
from pathlib import Path

# Benchmarks every agent is expected to have a run for
# (mirrors constants.py — keep in sync when benchmarks change).
EXPECTED_BENCHMARKS = [
    "aime2025",
    "arenahardwriting",
    "bfcl",
    "gpqamain",
    "gsm8k",
    "healthbench",
    "humaneval",
]

# Models every agent is expected to have a run for (base models only;
# fine-tuned variants are deliberately excluded).
EXPECTED_MODELS = [
    "Qwen3-1.7B-Base",
    "Qwen3-4B-Base",
    "SmolLM3-3B-Base",
    "gemma-3-4b-pt",
]


def get_results_dir():
    """Return the results directory path.

    Honors the POST_TRAIN_BENCH_RESULTS_DIR environment variable,
    falling back to the relative directory ``results``.
    """
    return os.getenv("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def parse_run_dir(dir_name: str):
    """Split a run directory name into (benchmark, model, run_id).

    Expected layout: ``{benchmark}_{sep}_{model}_{run_id}``, where the
    second underscore-separated token is a separator and is discarded.
    Returns None when the name does not match (fewer than four tokens,
    or a non-integer run id).
    """
    tokens = dir_name.split("_")
    if len(tokens) < 4:
        return None
    try:
        # tokens[1] is the separator token and intentionally skipped.
        return tokens[0], tokens[2], int(tokens[3])
    except ValueError:
        # Fourth token was not an integer run id.
        return None


def check_cuda_available(run_path: Path) -> bool:
    """Report whether CUDA was available for the run at *run_path*.

    A run marks missing CUDA by creating ``task/cuda_not_available``
    under its directory; absence of that marker means CUDA was usable.
    """
    marker = run_path.joinpath("task", "cuda_not_available")
    return not marker.exists()


def check_agent(agent_path: Path, agent_name: str, benchmarks: list, models: list):
    """Scan one agent's results directory for run coverage.

    Groups every parseable run directory under *agent_path* by
    (benchmark, model), then classifies each expected combination —
    judging only the latest run (highest run id) — as missing,
    present-without-CUDA, or present.

    Returns (missing_runs, no_cuda_runs, present_runs), where
    missing_runs holds (benchmark, model) keys and the other two hold
    ((benchmark, model), path) pairs.
    """
    # (benchmark, model) -> list of {"run_id", "path", "has_cuda"} records.
    discovered = {}
    for child in agent_path.iterdir():
        if not child.is_dir():
            continue
        info = parse_run_dir(child.name)
        if info is None:
            # Not a run directory (unexpected name); ignore it.
            continue
        bench, mdl, rid = info
        discovered.setdefault((bench, mdl), []).append({
            "run_id": rid,
            "path": child,
            "has_cuda": check_cuda_available(child),
        })

    missing_runs, no_cuda_runs, present_runs = [], [], []
    for bench in benchmarks:
        for mdl in models:
            combo = (bench, mdl)
            runs = discovered.get(combo)
            if runs is None:
                missing_runs.append(combo)
                continue
            # Only the most recent run counts toward coverage.
            newest = max(runs, key=lambda record: record["run_id"])
            if newest["has_cuda"]:
                present_runs.append((combo, newest["path"]))
            else:
                no_cuda_runs.append((combo, newest["path"]))

    return missing_runs, no_cuda_runs, present_runs


def main():
    """CLI entry point: report missing and CUDA-less runs per agent.

    Agents with full coverage produce no output; for every other agent,
    print the (benchmark, model) combinations that are absent and the
    ones whose latest run lacked CUDA.
    """
    parser = argparse.ArgumentParser(
        description="Check for missing runs across agents."
    )
    parser.add_argument(
        "--agents",
        nargs="+",
        help="Specific agents to check (default: all agents in results dir)",
    )
    parser.add_argument(
        "--benchmarks",
        nargs="+",
        default=EXPECTED_BENCHMARKS,
        help="Benchmarks to check for",
    )
    parser.add_argument(
        "--models",
        nargs="+",
        default=EXPECTED_MODELS,
        help="Models to check for",
    )
    opts = parser.parse_args()

    root = Path(get_results_dir())
    if not root.exists():
        print(f"Results directory not found: {root}")
        return

    # Explicit --agents wins; otherwise scan every agent directory,
    # excluding the baseline results.
    agent_names = opts.agents or [
        entry.name
        for entry in root.iterdir()
        if entry.is_dir() and entry.name != "baseline"
    ]

    for agent_name in sorted(agent_names):
        agent_dir = root / agent_name
        if not agent_dir.exists():
            print(f"[{agent_name}] Directory not found!")
            continue

        missing, no_cuda, _present = check_agent(
            agent_dir, agent_name, opts.benchmarks, opts.models
        )
        if not (missing or no_cuda):
            # Fully covered agent: stay silent.
            continue

        print(f"[{agent_name}]")

        if missing:
            print(" Missing:")
            for i, (benchmark, model) in enumerate(sorted(missing), 1):
                print(f" {i}. {benchmark} x {model}")

        if no_cuda:
            print(" No CUDA:")
            for i, ((benchmark, model), path) in enumerate(sorted(no_cuda), 1):
                print(f" {i}. {benchmark} x {model}")
                print(f" {path}")


if __name__ == "__main__":
    main()
Loading