diff --git a/.gitignore b/.gitignore index 3eee987..6962ef6 100644 --- a/.gitignore +++ b/.gitignore @@ -228,3 +228,6 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# testing parsed agent traces +output.txt diff --git a/agents/glm5/solve.sh b/agents/glm5/solve.sh new file mode 100755 index 0000000..b93dc8d --- /dev/null +++ b/agents/glm5/solve.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# GLM 5 agent using Claude Code with Z.AI's Anthropic-compatible API +# Reference: https://docs.z.ai/devpack/tool/claude +# Note: right now you need a "Coding Plan" to use GLM 5; a plain API key doesn't work with the Anthropic endpoint + +export BASH_MAX_TIMEOUT_MS="36000000" +export API_TIMEOUT_MS="3000000" + +# Configure Claude Code to use Z.AI's Anthropic-compatible API +export ANTHROPIC_API_KEY="${ZAI_API_KEY}" +export ANTHROPIC_AUTH_TOKEN="${ZAI_API_KEY}" +export ANTHROPIC_BASE_URL="https://api.z.ai/api/anthropic" +export ANTHROPIC_MODEL="${AGENT_CONFIG}" +export ANTHROPIC_SMALL_FAST_MODEL="${AGENT_CONFIG}" + +claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \ --dangerously-skip-permissions "$PROMPT" diff --git a/agents/opencode/solve.sh b/agents/opencode/solve.sh new file mode 100755 index 0000000..dea541b --- /dev/null +++ b/agents/opencode/solve.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# OpenCode requires a config file for auto-approval permissions and provider setup +# Create opencode.json in the working directory +cat > opencode.json << 'EOF' +{ + "$schema": "https://opencode.ai/config.json", + "permission": "allow", + "provider": { + "anthropic": { + "options": { + "apiKey": "{env:ANTHROPIC_API_KEY}" + } + }, + "openai": { + "options": { + "apiKey": "{env:OPENAI_API_KEY}" + } + }, + "opencode": { + "options": { + "apiKey": "{env:OPENCODE_API_KEY}" + } + }, + "zai": { + "npm": "@ai-sdk/openai-compatible", + "name": "Z.AI", + "options": { + "baseURL": "https://api.z.ai/api/paas/v4", + "apiKey": "{env:ZAI_API_KEY}" + }, + "models": { + "glm-5": { + "name": "GLM-5" + }, + "glm-4.7": { + "name": "GLM-4.7" + } + } + } + } +} +EOF + +opencode run --model "$AGENT_CONFIG" --format json "$PROMPT" diff --git a/agents/qwen3max/solve.sh b/agents/qwen3max/solve.sh new file mode 100755 index 0000000..4939414 --- /dev/null +++ b/agents/qwen3max/solve.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Qwen 3 Max Thinking agent using Claude Code with Qwen's Anthropic-compatible API +# Reference: https://qwen.ai/blog?id=qwen3-max-thinking + +export BASH_MAX_TIMEOUT_MS="36000000" + +# Configure Claude Code to use Qwen's Anthropic-compatible API (international endpoint) +# Override ANTHROPIC_API_KEY with DashScope key (Claude Code checks this first) +export ANTHROPIC_API_KEY="${DASHSCOPE_API_KEY}" +export ANTHROPIC_AUTH_TOKEN="${DASHSCOPE_API_KEY}" +export ANTHROPIC_BASE_URL="https://dashscope-intl.aliyuncs.com/apps/anthropic" +export ANTHROPIC_MODEL="${AGENT_CONFIG}" +export ANTHROPIC_SMALL_FAST_MODEL="${AGENT_CONFIG}" + +# Debug: verify all environment variables are set +echo "DEBUG: DASHSCOPE_API_KEY is set: ${DASHSCOPE_API_KEY:+yes} (length: ${#DASHSCOPE_API_KEY})" +echo "DEBUG: ANTHROPIC_API_KEY is set: ${ANTHROPIC_API_KEY:+yes} (length: ${#ANTHROPIC_API_KEY})" +echo "DEBUG: ANTHROPIC_BASE_URL=${ANTHROPIC_BASE_URL}" +echo "DEBUG: ANTHROPIC_MODEL=${ANTHROPIC_MODEL}" + +claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \ --dangerously-skip-permissions "$PROMPT" diff --git a/containers/standard.def b/containers/standard.def index c11c03f..9b2896e 100644 ---
a/containers/standard.def +++ b/containers/standard.def @@ -34,7 +34,8 @@ From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 npm install -g \ @anthropic-ai/claude-code@2.0.55 \ @openai/codex@0.79.0 \ - @google/gemini-cli@0.18.4 + @google/gemini-cli@0.18.4 \ + opencode-ai@1.1.59 uv pip install --system --no-cache ninja packaging diff --git a/containers/vllm_debug.def b/containers/vllm_debug.def index df77ac7..2bb06ac 100644 --- a/containers/vllm_debug.def +++ b/containers/vllm_debug.def @@ -43,7 +43,9 @@ From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 npm install -g \ @anthropic-ai/claude-code@2.1.34 \ @openai/codex@0.98.0 \ - @google/gemini-cli@0.18.4 + @google/gemini-cli@0.18.4 \ + opencode-ai@1.1.59 + # install inspect evals diff --git a/dev_utils/check_missing_runs.py b/dev_utils/check_missing_runs.py new file mode 100755 index 0000000..4458875 --- /dev/null +++ b/dev_utils/check_missing_runs.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +""" +Check for missing runs across agents. + +For each agent, checks if runs for each (model, benchmark) combination are present. +Also identifies runs that exist but don't have CUDA available. +""" +import os +import argparse +from pathlib import Path + +# Expected benchmarks (from constants.py) +EXPECTED_BENCHMARKS = [ + "aime2025", + "arenahardwriting", + "bfcl", + "gpqamain", + "gsm8k", + "healthbench", + "humaneval", +] + +# Expected models (base models only) +EXPECTED_MODELS = [ + "Qwen3-1.7B-Base", + "Qwen3-4B-Base", + "SmolLM3-3B-Base", + "gemma-3-4b-pt", +] + + +def get_results_dir(): + return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") + + +def parse_run_dir(dir_name: str): + """ + Parse a run directory name into (benchmark, model, run_id). + Format: {benchmark}_{sep}_{model}_{run_id} + """ + try: + parts = dir_name.split("_") + if len(parts) < 4: + return None + benchmark = parts[0] + model = parts[2] + run_id = int(parts[3]) + return benchmark, model, run_id + except (ValueError, IndexError): + return None + + +def check_cuda_available(run_path: Path) -> bool: + """ + Check if CUDA was available for this run. + Returns False if task/cuda_not_available exists. + """ + cuda_not_available = run_path / "task" / "cuda_not_available" + return not cuda_not_available.exists() + + +def check_agent(agent_path: Path, agent_name: str, benchmarks: list, models: list): + """ + Check an agent directory for missing runs and CUDA issues. + Returns (missing_runs, no_cuda_runs, present_runs). 
+ """ + # Track which (benchmark, model) combinations exist + # key: (benchmark, model) -> list of (run_id, path, has_cuda) + runs_found = {} + + for entry in agent_path.iterdir(): + if not entry.is_dir(): + continue + + parsed = parse_run_dir(entry.name) + if parsed is None: + continue + + benchmark, model, run_id = parsed + key = (benchmark, model) + + has_cuda = check_cuda_available(entry) + + if key not in runs_found: + runs_found[key] = [] + runs_found[key].append({ + "run_id": run_id, + "path": entry, + "has_cuda": has_cuda, + }) + + # Find missing combinations + missing_runs = [] + no_cuda_runs = [] + present_runs = [] + + for benchmark in benchmarks: + for model in models: + key = (benchmark, model) + if key not in runs_found: + missing_runs.append(key) + else: + # Get the latest run + latest = max(runs_found[key], key=lambda x: x["run_id"]) + if not latest["has_cuda"]: + no_cuda_runs.append((key, latest["path"])) + else: + present_runs.append((key, latest["path"])) + + return missing_runs, no_cuda_runs, present_runs + + +def main(): + parser = argparse.ArgumentParser( + description="Check for missing runs across agents." + ) + parser.add_argument( + "--agents", + nargs="+", + help="Specific agents to check (default: all agents in results dir)", + ) + parser.add_argument( + "--benchmarks", + nargs="+", + default=EXPECTED_BENCHMARKS, + help="Benchmarks to check for", + ) + parser.add_argument( + "--models", + nargs="+", + default=EXPECTED_MODELS, + help="Models to check for", + ) + args = parser.parse_args() + + results_dir = Path(get_results_dir()) + + if not results_dir.exists(): + print(f"Results directory not found: {results_dir}") + return + + # Get list of agents to check + if args.agents: + agents = args.agents + else: + agents = [ + d.name for d in results_dir.iterdir() + if d.is_dir() and d.name != "baseline" + ] + + for agent_name in sorted(agents): + agent_path = results_dir / agent_name + if not agent_path.exists(): + print(f"[{agent_name}] Directory not found!") + continue + + missing, no_cuda, present = check_agent( + agent_path, agent_name, args.benchmarks, args.models + ) + + if not missing and not no_cuda: + continue + + print(f"[{agent_name}]") + + if missing: + print(" Missing:") + for i, (benchmark, model) in enumerate(sorted(missing), 1): + print(f" {i}. {benchmark} x {model}") + + if no_cuda: + print(" No CUDA:") + for i, ((benchmark, model), path) in enumerate(sorted(no_cuda), 1): + print(f" {i}. {benchmark} x {model}") + print(f" {path}") + + +if __name__ == "__main__": + main() diff --git a/dev_utils/limit_hit_list.py b/dev_utils/limit_hit_list.py new file mode 100644 index 0000000..5011eff --- /dev/null +++ b/dev_utils/limit_hit_list.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +"""List runs where the agent hit an API usage/spending limit.""" + +import argparse +import os + +# Patterns that indicate the agent hit a usage or spending limit. +# These are checked case-insensitively against solve_out.txt. +LIMIT_PATTERNS = [ + "You've hit your limit", # Claude Code Pro subscription limit + "spending_limit", # Anthropic/OpenAI spending limit + "billing_hard_limit", # OpenAI billing hard limit + "insufficient_quota", # OpenAI quota exceeded + "budget_exceeded", # General budget error + "plan does not yet include", # Z.AI subscription plan restriction +] + + +def check_solve_out_for_limits(solve_out_path: str): + """ + Check if solve_out.txt contains any limit patterns. + Returns a list of matched patterns, or empty list if none found. 
+ """ + if not os.path.exists(solve_out_path): + return [] + + with open(solve_out_path, "r") as f: + content = f.read() + + content_lower = content.lower() + matched_patterns = [] + for pattern in LIMIT_PATTERNS: + if pattern.lower() in content_lower: + matched_patterns.append(pattern) + + return matched_patterns + + +def get_latest_runs(method_path: str): + """ + Scans a method directory and returns a list of paths corresponding + to the latest run_id for every (benchmark, model) pair. + """ + latest_runs = {} + + for entry in os.listdir(method_path): + entry_path = os.path.join(method_path, entry) + if not os.path.isdir(entry_path): + continue + try: + parts = entry.split("_") + if len(parts) < 4: + continue + benchmark = parts[0] + model = parts[2] + run_id = int(parts[3]) + except (ValueError, IndexError): + continue + key = (benchmark, model) + + if key not in latest_runs or run_id > latest_runs[key]["run_id"]: + latest_runs[key] = { + "run_id": run_id, + "path": entry_path, + } + + return [info["path"] for info in latest_runs.values()] + + +def get_results_dir(): + return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") + + +def main(): + parser = argparse.ArgumentParser( + description="List runs where the agent hit an API usage/spending limit" + ) + parser.add_argument( + "results_dir", + nargs="?", + default=None, + help="Results directory (default: POST_TRAIN_BENCH_RESULTS_DIR or 'results')", + ) + parser.add_argument( + "--all", + action="store_true", + help="Check all runs, not just the latest per (benchmark, model)", + ) + args = parser.parse_args() + + results_dir = args.results_dir if args.results_dir else get_results_dir() + + errors_by_pattern = {pattern: [] for pattern in LIMIT_PATTERNS} + all_errors_list = [] + + for method_name in sorted(os.listdir(results_dir)): + method_path = os.path.join(results_dir, method_name) + if not os.path.isdir(method_path): + continue + + if args.all: + run_paths = [ + os.path.join(method_path, d) + for d in os.listdir(method_path) + if os.path.isdir(os.path.join(method_path, d)) + ] + else: + run_paths = get_latest_runs(method_path) + + for run_path in run_paths: + solve_out_path = os.path.join(run_path, "solve_out.txt") + matched_patterns = check_solve_out_for_limits(solve_out_path) + + if matched_patterns: + all_errors_list.append((run_path, matched_patterns)) + for pattern in matched_patterns: + errors_by_pattern[pattern].append(run_path) + + print(f"=== LIMIT HIT RUNS ({len(all_errors_list)} runs affected) ===\n") + + for pattern in LIMIT_PATTERNS: + affected_runs = errors_by_pattern[pattern] + if not affected_runs: + continue + print(f"Pattern: \"{pattern}\"") + print(f" Affected runs: {len(affected_runs)}") + for path in sorted(affected_runs): + print(f" - {path}") + print() + + print("-" * 40) + print(f"\n=== ALL AFFECTED RUNS ({len(all_errors_list)}) ===") + if all_errors_list: + for path, patterns in sorted(all_errors_list): + print(f"{path}") + for p in patterns: + print(f" -> {p}") + else: + print("None") + + +if __name__ == "__main__": + main() diff --git a/dev_utils/list_stale_file_runs.py b/dev_utils/list_stale_file_runs.py new file mode 100644 index 0000000..2476003 --- /dev/null +++ b/dev_utils/list_stale_file_runs.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +import argparse +import os + +# List of error patterns to search for in solve.out +ERROR_PATTERNS = [ + "error reading input file: Stale file handle" +] + + +def check_solve_out_for_errors(solve_out_path: str): + """ + Check if solve.out contains any of the 
error patterns. + Returns a list of matched patterns, or empty list if none found. + """ + if not os.path.exists(solve_out_path): + if "baseline" not in solve_out_path: + print(solve_out_path) + return [] + + with open(solve_out_path, "r") as f: + content = f.read() + + matched_patterns = [] + for pattern in ERROR_PATTERNS: + if pattern in content: + matched_patterns.append(pattern) + + return matched_patterns + + +def get_latest_runs(method_path: str): + """ + Scans a method directory and returns a list of paths corresponding + to the latest run_id for every (benchmark, model) pair. + """ + # key: (benchmark, model) -> value: {"run_id": int, "path": str} + latest_runs = {} + + for entry in os.listdir(method_path): + entry_path = os.path.join(method_path, entry) + if not os.path.isdir(entry_path): + continue + try: + benchmark, _, model, run_id_str = entry.split("_") + run_id = int(run_id_str) + except ValueError: + # Skip entries that don't match the expected format + continue + key = (benchmark, model) + + # keep only highest run_id per (benchmark, model) + if key not in latest_runs or run_id > latest_runs[key]["run_id"]: + latest_runs[key] = { + "run_id": run_id, + "path": entry_path, + } + + return [info["path"] for info in latest_runs.values()] + + +def get_results_dir(): + return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") + + +def main(): + parser = argparse.ArgumentParser(description="Check for stale file handle errors in results") + parser.add_argument( + "results_dir", + nargs="?", + default=None, + ) + args = parser.parse_args() + + results_dir = args.results_dir if args.results_dir else get_results_dir() + + # Dict to collect runs by error pattern + errors_by_pattern = {pattern: [] for pattern in ERROR_PATTERNS} + all_errors_list = [] + + # 1. Iterate over all methods and collect paths + for method_name in os.listdir(results_dir): + method_path = os.path.join(results_dir, method_name) + if not os.path.isdir(method_path): + continue + + # Get only the latest runs for this method to avoid reporting old overwritten runs + run_paths = get_latest_runs(method_path) + + for run_path in run_paths: + # Check error.log for error patterns + solve_out_path = os.path.join(run_path, "error.log") + matched_patterns = check_solve_out_for_errors(solve_out_path) + + if matched_patterns: + all_errors_list.append((run_path, matched_patterns)) + for pattern in matched_patterns: + errors_by_pattern[pattern].append(run_path) + + # 2.
Output summary + print(f"=== STALE FILE ERRORS DETECTED ({len(all_errors_list)} runs affected) ===\n") + + # Show breakdown by pattern + for pattern in ERROR_PATTERNS: + affected_runs = errors_by_pattern[pattern] + print(f"Pattern: \"{pattern}\"") + print(f" Affected runs: {len(affected_runs)}") + if affected_runs: + for path in sorted(affected_runs): + print(f" - {path}") + print() + + # Show combined list + print("-" * 40) + print(f"\n=== ALL AFFECTED RUNS ({len(all_errors_list)}) ===") + if all_errors_list: + for path, patterns in sorted(all_errors_list): + print(f"{path}") + for p in patterns: + print(f" -> {p[:60]}...") + else: + print("None") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/dev_utils/terminated_finder.py b/dev_utils/terminated_finder.py index c26aef4..f7af378 100644 --- a/dev_utils/terminated_finder.py +++ b/dev_utils/terminated_finder.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 -"""List runs where error.log contains 'Terminated' (prematurely cut off runs).""" +"""List runs where error.log indicates the job was Terminated (timeout) or Killed (OOM).""" import argparse import os +import re from pathlib import Path @@ -10,15 +11,19 @@ def get_results_dir(): return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") -def check_terminated(error_log_path: Path) -> bool: - """Return True if error.log contains 'Terminated'.""" +def classify_error(error_log_path: Path) -> str | None: + """Classify the error in error.log. Returns 'terminated', 'killed', or None.""" if not error_log_path.exists(): - return False + return None try: content = error_log_path.read_text() - return content.startswith("Terminated") + if content.startswith("Terminated"): + return "terminated" + if re.search(r"\bKilled\b", content): + return "killed" + return None except Exception: - return False + return None def get_latest_runs(method_path: Path): @@ -51,64 +56,85 @@ return {k: v["path"] for k, v in latest_runs.items()} +def collect_runs(results_dir: Path, check_all: bool): + """Collect and classify runs into terminated and killed categories.""" + terminated_runs = [] + killed_runs = [] + + for method_dir in results_dir.iterdir(): + if not method_dir.is_dir(): + continue + + if check_all: + run_dirs = [d for d in method_dir.iterdir() if d.is_dir()] + else: + latest = get_latest_runs(method_dir) + run_dirs = list(latest.values()) + + for run_dir in run_dirs: + error_log = run_dir / "error.log" + classification = classify_error(error_log) + if classification == "terminated": + terminated_runs.append(run_dir) + elif classification == "killed": + killed_runs.append(run_dir) + + terminated_runs.sort(key=lambda p: str(p)) + killed_runs.sort(key=lambda p: str(p)) + return terminated_runs, killed_runs + + def main(): parser = argparse.ArgumentParser( - description="List runs where error.log contains 'Terminated'" + description="List runs where error.log indicates Terminated (timeout) or Killed (OOM)" ) parser.add_argument( "--all", action="store_true", - help="Show all terminated runs, not just the latest per (benchmark, model)", + help="Show all affected runs, not just the latest per (benchmark, model)", ) parser.add_argument( "--delete", action="store_true", - help="Delete the terminated run directories (use with caution!)", + help="Delete the affected run directories (use with caution!)", + ) + parser.add_argument( + "results_dir", + nargs="?", + default=None, + help="Results directory (default: POST_TRAIN_BENCH_RESULTS_DIR or
'results')", ) args = parser.parse_args() - results_dir = Path(get_results_dir()) - terminated_runs = [] + results_dir = Path(args.results_dir) if args.results_dir else Path(get_results_dir()) + terminated_runs, killed_runs = collect_runs(results_dir, args.all) - for method_dir in results_dir.iterdir(): - if not method_dir.is_dir(): - continue - - if args.all: - # Check all runs - for run_dir in method_dir.iterdir(): - if not run_dir.is_dir(): - continue - error_log = run_dir / "error.log" - if check_terminated(error_log): - terminated_runs.append(run_dir) - else: - # Only check latest runs per (benchmark, model) - latest = get_latest_runs(method_dir) - for run_dir in latest.values(): - error_log = run_dir / "error.log" - if check_terminated(error_log): - terminated_runs.append(run_dir) - - # Sort and display - terminated_runs.sort(key=lambda p: str(p)) - - print(f"=== TERMINATED RUNS ({len(terminated_runs)}) ===") + print(f"=== TERMINATED RUNS - timeout/SIGTERM ({len(terminated_runs)}) ===") if terminated_runs: for path in terminated_runs: print(path) else: print("None") + print() + + print(f"=== KILLED RUNS - OOM/SIGKILL ({len(killed_runs)}) ===") + if killed_runs: + for path in killed_runs: + print(path) + else: + print("None") + # Optionally delete - if args.delete and terminated_runs: - print(f"\nDeleting {len(terminated_runs)} terminated runs...") + all_affected = terminated_runs + killed_runs + if args.delete and all_affected: + print(f"\nDeleting {len(all_affected)} affected runs...") import shutil - for path in terminated_runs: + for path in all_affected: print(f" Removing: {path}") shutil.rmtree(path) print("Done.") if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/results/.gitignore b/results/.gitignore deleted file mode 100644 index c96a04f..0000000 --- a/results/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore \ No newline at end of file diff --git a/scripts/aggregate.sh b/scripts/aggregate.sh index 8d5e95c..47d4d37 100644 --- a/scripts/aggregate.sh +++ b/scripts/aggregate.sh @@ -15,6 +15,31 @@ python scripts/aggregate_time.py sleep 1 python scripts/aggregate_final.py sleep 1 -python scripts/aggregate_summary.py claude_claude-sonnet-4-5 claude_claude-opus-4-5 codex_gpt-5.1-codex-max codex_gpt-5.2 gemini_models_gemini-3-pro-preview +python scripts/aggregate_summary.py \ + claude_claude-opus-4-6_10h_run1_old_container \ + claude_claude-opus-4-6_10h_run2 \ + claude_claude-opus-4-6_10h_run3 \ + codex_non_api_gpt-5.3-codex_10h_run1 \ + codex_non_api_gpt-5.3-codex_10h_run2 \ + codex_non_api_gpt-5.3-codex_10h_run3 \ + opencode_opencode_glm-5_10h_run2 \ + opencode_opencode_kimi-k2.5_10h_run2 \ + opencode_opencode_minimax-m2.5-free_10h_run2 \ + opencode_zai_glm-5_10h_run2 \ + # opencode_anthropic_claude-opus-4-5_10h \ + # opencode_opencode_big-pickle_10h \ + # opencode_opencode_gemini-3-pro_10h \ + # opencode_opencode_glm-4.7-free_10h \ + # opencode_opencode_gpt-5.1-codex-max_10h \ + # opencode_opencode_kimi-k2-thinking_10h \ + # opencode_opencode_minimax-m2.1-free_10h \ + # qwen3max_qwen3-max-2026-01-23_10h -python scripts/aggregate_together.py claude_claude-sonnet-4-5 claude_claude-opus-4-5 codex_gpt-5.1-codex-max codex_gpt-5.2 gemini_models_gemini-3-pro-preview \ No newline at end of file +# python scripts/aggregate_together.py \ +# opencode_anthropic_claude-opus-4-5_10h \ +# opencode_opencode_big-pickle_10h \ +# opencode_opencode_gemini-3-pro_10h \ +# opencode_opencode_glm-4.7-free_10h \ +# opencode_opencode_gpt-5.1-codex-max_10h \ +# 
opencode_opencode_kimi-k2-thinking_10h \ +# opencode_opencode_minimax-m2.1-free_10h \ No newline at end of file diff --git a/scripts/aggregate_avg_stddev.py b/scripts/aggregate_avg_stddev.py index 11b6488..b962c79 100755 --- a/scripts/aggregate_avg_stddev.py +++ b/scripts/aggregate_avg_stddev.py @@ -87,8 +87,8 @@ def aggregate_runs(agent_name: str, method_names: list[str], results_dir: str): value = float(value_str) values.append(value) - avg_data[model][bench] = str(round(mean(values) * 100, 1)) - std_data[model][bench] = str(round(stddev(values) * 100, 1)) + avg_data[model][bench] = str(mean(values)) + std_data[model][bench] = str(stddev(values)) # Write average CSV avg_path = os.path.join(results_dir, f"aggregated_avg_{agent_name}.csv") diff --git a/scripts/aggregate_avg_stddev_over_benchmarks.py b/scripts/aggregate_avg_stddev_over_benchmarks.py index 238003a..71c65e3 100755 --- a/scripts/aggregate_avg_stddev_over_benchmarks.py +++ b/scripts/aggregate_avg_stddev_over_benchmarks.py @@ -102,8 +102,8 @@ def aggregate_agent(method_names: list[str], results_dir: str): for bench in HARDCODED_BENCHMARKS: values = run_averages[bench] - avg_per_benchmark[bench] = round(mean(values) * 100, 1) - std_per_benchmark[bench] = round(stddev(values) * 100, 1) + avg_per_benchmark[bench] = mean(values) + std_per_benchmark[bench] = stddev(values) # Compute avg and std across runs for each (model, benchmark) pair avg_per_model_benchmark = {} @@ -114,8 +114,8 @@ def aggregate_agent(method_names: list[str], results_dir: str): std_per_model_benchmark[model] = {} for bench in HARDCODED_BENCHMARKS: values = run_values_per_model[model][bench] - avg_per_model_benchmark[model][bench] = round(mean(values) * 100, 1) - std_per_model_benchmark[model][bench] = round(stddev(values) * 100, 1) + avg_per_model_benchmark[model][bench] = mean(values) + std_per_model_benchmark[model][bench] = stddev(values) return avg_per_benchmark, std_per_benchmark, avg_per_model_benchmark, std_per_model_benchmark, all_models diff --git a/scripts/aggregate_final.py b/scripts/aggregate_final.py index 2b194d5..86becb2 100644 --- a/scripts/aggregate_final.py +++ b/scripts/aggregate_final.py @@ -139,7 +139,7 @@ def main(): results_dir = get_results_dir() # Load baseline data - baseline_path = os.path.join(results_dir, "aggregated_baseline.csv") + baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv") baseline_data, _ = load_csv_as_dict(baseline_path) if not baseline_data: @@ -156,7 +156,7 @@ def main(): continue method_name = filename[len("aggregated_") : -len(".csv")] # Skip baseline itself - if method_name != "baseline": + if method_name != "baseline_zeroshot": method_names.append(method_name) # Process each method diff --git a/scripts/aggregate_methods.py b/scripts/aggregate_methods.py index 7a8fb6d..734782a 100644 --- a/scripts/aggregate_methods.py +++ b/scripts/aggregate_methods.py @@ -32,7 +32,7 @@ def load_metrics(metrics_path: str, method_name: str = None): # Only reach here if metrics.json doesn't exist or is invalid # For baseline, just return "ERR" - if method_name == "baseline": + if method_name == "baseline_zeroshot": return "ERR" # For non-baseline methods, provide more specific error messages diff --git a/scripts/aggregate_summary.py b/scripts/aggregate_summary.py index d3b14a3..169e30f 100644 --- a/scripts/aggregate_summary.py +++ b/scripts/aggregate_summary.py @@ -19,6 +19,14 @@ "codex_gpt-5.1-codex-max": "gpt-5.1-codex-max", "codex_gpt-5.2": "gpt-5.2", "gemini_models_gemini-3-pro-preview": "gemini-3-pro", 
+ "opencode_anthropic_claude-sonnet-4-5": "opencode claude-sonnet-4-5", + "opencode_anthropic_claude-opus-4-5_10h": "opencode claude-opus-4-5", + "opencode_opencode_big-pickle_10h": "opencode big-pickle", + "opencode_opencode_gemini-3-pro_10h": "opencode gemini-3-pro", + "opencode_opencode_glm-4.7-free_10h": "opencode glm-4.7", + "opencode_opencode_gpt-5.1-codex-max_10h": "opencode gpt-5.1-codex-max", + "opencode_opencode_kimi-k2-thinking_10h": "opencode kimi-k2-thinking", + "opencode_opencode_minimax-m2.1-free_10h": "opencode minimax-m2.1", } # Model groups for baseline columns @@ -109,7 +117,7 @@ def main(): results_dir = get_results_dir() # Load baseline data - baseline_path = os.path.join(results_dir, "aggregated_baseline.csv") + baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv") baseline_data, baseline_benchmarks = load_csv_as_dict(baseline_path) if not baseline_data: diff --git a/scripts/aggregate_time_baselines.py b/scripts/aggregate_time_baselines.py index 12225cf..38eec5e 100644 --- a/scripts/aggregate_time_baselines.py +++ b/scripts/aggregate_time_baselines.py @@ -37,7 +37,7 @@ def get_latest_results(results_dir): runs = defaultdict(list) # Scan all directories - for subdir in results_path.glob('baseline/*'): + for subdir in results_path.glob('baseline_zeroshot/*'): if subdir.is_dir(): benchmark, model, run_id = parse_directory_name(subdir.name) diff --git a/scripts/aggregate_together.py b/scripts/aggregate_together.py index 2919560..7b16794 100644 --- a/scripts/aggregate_together.py +++ b/scripts/aggregate_together.py @@ -81,7 +81,7 @@ def main(): # Optionally include baseline first if args.include_baseline: - baseline_path = os.path.join(results_dir, "aggregated_baseline.csv") + baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv") header, rows = load_csv_rows(baseline_path) if header and rows: diff --git a/scripts/compute_single_metrics_avg_stddev.py b/scripts/compute_single_metrics_avg_stddev.py index bbae025..874284a 100755 --- a/scripts/compute_single_metrics_avg_stddev.py +++ b/scripts/compute_single_metrics_avg_stddev.py @@ -129,7 +129,7 @@ def main(): writer = csv.writer(f) writer.writerow(["method", "metric"]) for method_name in sorted(all_metrics.keys()): - writer.writerow([method_name, round(all_metrics[method_name] * 100, 1)]) + writer.writerow([method_name, all_metrics[method_name]]) print(f"Written: {metrics_path}") # Compute aggregated metrics for each agent group @@ -142,8 +142,8 @@ def main(): metrics.append(metric) aggregated_results[agent_name] = { - "avg": round(mean(metrics) * 100, 1), - "std": round(stddev(metrics) * 100, 1), + "avg": mean(metrics), + "std": stddev(metrics), "n": len(metrics), } diff --git a/scripts/constants.py b/scripts/constants.py index dbad199..c516670 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -32,6 +32,16 @@ "codexhigh_gpt-5.1-codex-max_10h_v7", "codexhigh_gpt-5.1-codex-max_10h_v7_seed1" ], + "Opus-4.6": [ + "claude_claude-opus-4-6_10h_run1_old_container", + "claude_claude-opus-4-6_10h_run2", + "claude_claude-opus-4-6_10h_run3", + ], + "GPT-5.3-Codex": [ + "codex_non_api_gpt-5.3-codex_10h_run1", + "codex_non_api_gpt-5.3-codex_10h_run2", + "codex_non_api_gpt-5.3-codex_10h_run3", + ], } HARDCODED_BENCHMARKS = [ diff --git a/scripts/parse_jsonl/opencode_parse_jsonl.py b/scripts/parse_jsonl/opencode_parse_jsonl.py new file mode 100755 index 0000000..5d70e6a --- /dev/null +++ b/scripts/parse_jsonl/opencode_parse_jsonl.py @@ -0,0 +1,331 @@ +from __future__ import 
annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Convert OpenCode --format json logs into a human-readable transcript." + ) + ) + parser.add_argument( + "input", + type=Path, + help="Path to the JSON .jsonl file produced by OpenCode", + ) + parser.add_argument( + "-o", + "--output", + type=Path, + help=( + "Destination text file. Defaults to .parsed.txt in the same " + "directory." + ), + ) + parser.add_argument( + "--stdout", + action="store_true", + help="Print the parsed output to stdout instead of writing a file.", + ) + return parser.parse_args() + + +def default_output_path(input_path: Path) -> Path: + suffix = input_path.suffix or "" + if suffix: + return input_path.with_suffix(f"{suffix}.parsed.txt") + return input_path.with_name(f"{input_path.name}.parsed.txt") + + +def pretty_format_json(obj: Any, indent_level: int = 0) -> str: + """Format JSON with actual newlines preserved in strings.""" + indent_str = " " * indent_level + next_indent = " " * (indent_level + 1) + + if isinstance(obj, dict): + if not obj: + return "{}" + items = [] + for key, value in obj.items(): + formatted_value = pretty_format_json(value, indent_level + 1) + if '\n' in formatted_value and not formatted_value.startswith('{') and not formatted_value.startswith('['): + first_line = formatted_value.split('\n')[0] + rest_lines = '\n'.join(formatted_value.split('\n')[1:]) + items.append(f'{next_indent}"{key}": {first_line}\n{rest_lines}') + else: + items.append(f'{next_indent}"{key}": {formatted_value}') + return "{\n" + ",\n".join(items) + "\n" + indent_str + "}" + elif isinstance(obj, list): + if not obj: + return "[]" + items = [] + for item in obj: + formatted_item = pretty_format_json(item, indent_level + 1) + items.append(f"{next_indent}{formatted_item}") + return "[\n" + ",\n".join(items) + "\n" + indent_str + "]" + elif isinstance(obj, str): + if '\n' in obj: + return obj + else: + return json.dumps(obj, ensure_ascii=False) + elif isinstance(obj, bool): + return "true" if obj else "false" + elif obj is None: + return "null" + else: + return str(obj) + + +def indent(text: str, level: int) -> str: + """Indent text by the given level (2 spaces per level).""" + pad = " " * level + return "\n".join(pad + line if line else pad for line in text.splitlines()) + + +def format_timestamp(ts: int | None) -> str: + """Format a timestamp (milliseconds) into a readable string.""" + if ts is None: + return "" + import datetime + dt = datetime.datetime.fromtimestamp(ts / 1000, tz=datetime.timezone.utc) + return dt.strftime("%Y-%m-%d %H:%M:%S UTC") + + +def format_unparsable_line(index: int, line: str, error_msg: str = "") -> str: + lines = [f"=== Event {index} | NOT PARSABLE ==="] + if error_msg: + lines.append(f" Error: {error_msg}") + lines.append(" Raw line:") + lines.append(f" {line[:500]}{'...' 
if len(line) > 500 else ''}") + return "\n".join(lines) + + +def format_tool_use(event: dict[str, Any], index: int) -> str: + """Format a tool_use event.""" + part = event.get("part", {}) + tool_name = part.get("tool", "unknown") + state = part.get("state", {}) + status = state.get("status", "unknown") + + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: tool_use | tool: {tool_name} | status: {status} ===" + if timestamp: + header = f"=== Event {index} | type: tool_use | tool: {tool_name} | status: {status} | ts: {timestamp} ===" + + lines = [header] + + # Tool title + title = state.get("title", "") + if title: + lines.append(indent(f"Title: {title}", 1)) + + # Tool input + tool_input = state.get("input", {}) + if tool_input: + lines.append(indent("Input:", 1)) + # Special handling for common tools + if tool_name == "bash" and "command" in tool_input: + lines.append(indent(f"$ {tool_input['command']}", 2)) + elif tool_name in ("read", "write", "edit", "glob", "grep") and "file_path" in tool_input: + lines.append(indent(f"File: {tool_input['file_path']}", 2)) + for k, v in tool_input.items(): + if k != "file_path": + lines.append(indent(f"{k}: {v}", 2)) + else: + lines.append(indent(pretty_format_json(tool_input), 2)) + + # Tool output (for completed tools) + if status == "completed": + output = state.get("output", "") + if output: + lines.append(indent("Output:", 1)) + # Truncate very long outputs + if len(output) > 2000: + output = output[:2000] + "\n... [truncated]" + lines.append(indent(output.rstrip(), 2)) + + # Error (for error status) + if status == "error": + error = state.get("error", "") + if error: + lines.append(indent("Error:", 1)) + lines.append(indent(error, 2)) + + # Timing info + time_info = state.get("time", {}) + if time_info: + start = time_info.get("start") + end = time_info.get("end") + if start and end: + duration_ms = end - start + lines.append(indent(f"Duration: {duration_ms}ms", 1)) + + return "\n".join(lines) + + +def format_text(event: dict[str, Any], index: int) -> str: + """Format a text event (assistant response).""" + part = event.get("part", {}) + text = part.get("text", "") + + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: text ===" + if timestamp: + header = f"=== Event {index} | type: text | ts: {timestamp} ===" + + lines = [header] + lines.append(indent("Assistant:", 1)) + lines.append(indent(text.rstrip(), 2)) + + return "\n".join(lines) + + +def format_step_start(event: dict[str, Any], index: int) -> str: + """Format a step_start event.""" + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: step_start ===" + if timestamp: + header = f"=== Event {index} | type: step_start | ts: {timestamp} ===" + return header + + +def format_step_finish(event: dict[str, Any], index: int) -> str: + """Format a step_finish event.""" + part = event.get("part", {}) + reason = part.get("reason", "") + cost = part.get("cost", 0) + tokens = part.get("tokens", {}) + + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: step_finish ===" + if timestamp: + header = f"=== Event {index} | type: step_finish | ts: {timestamp} ===" + + lines = [header] + + if reason: + lines.append(indent(f"Reason: {reason}", 1)) + + if cost: + lines.append(indent(f"Cost: ${cost:.6f}", 1)) + + if tokens: + input_tokens = tokens.get("input", 0) + output_tokens = tokens.get("output", 0) + reasoning_tokens = tokens.get("reasoning", 0) + 
cache = tokens.get("cache", {}) + cache_read = cache.get("read", 0) + cache_write = cache.get("write", 0) + + token_parts = [f"input={input_tokens}", f"output={output_tokens}"] + if reasoning_tokens: + token_parts.append(f"reasoning={reasoning_tokens}") + if cache_read or cache_write: + token_parts.append(f"cache_read={cache_read}") + token_parts.append(f"cache_write={cache_write}") + + lines.append(indent(f"Tokens: {', '.join(token_parts)}", 1)) + + return "\n".join(lines) + + +def format_error(event: dict[str, Any], index: int) -> str: + """Format an error event.""" + error = event.get("error", {}) + + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: error ===" + if timestamp: + header = f"=== Event {index} | type: error | ts: {timestamp} ===" + + lines = [header] + + error_name = error.get("name", "Unknown") + lines.append(indent(f"Error Type: {error_name}", 1)) + + if "data" in error: + data = error["data"] + if isinstance(data, dict): + if "message" in data: + lines.append(indent(f"Message: {data['message']}", 1)) + else: + lines.append(indent(pretty_format_json(data), 1)) + else: + lines.append(indent(str(data), 1)) + + return "\n".join(lines) + + +def format_event(index: int, event: dict[str, Any]) -> str: + """Format a single event based on its type.""" + event_type = event.get("type", "unknown") + + if event_type == "tool_use": + return format_tool_use(event, index) + elif event_type == "text": + return format_text(event, index) + elif event_type == "step_start": + return format_step_start(event, index) + elif event_type == "step_finish": + return format_step_finish(event, index) + elif event_type == "error": + return format_error(event, index) + else: + # Unknown event type - output as JSON + timestamp = format_timestamp(event.get("timestamp")) + header = f"=== Event {index} | type: {event_type} ===" + if timestamp: + header = f"=== Event {index} | type: {event_type} | ts: {timestamp} ===" + return f"{header}\n{indent(pretty_format_json(event), 1)}" + + +def main() -> None: + args = parse_args() + input_path: Path = args.input + if not input_path.exists(): + raise SystemExit(f"Input file not found: {input_path}") + + output_path = args.output or default_output_path(input_path) + + formatted_events: list[str] = [] + with input_path.open("r", encoding="utf-8") as stream: + for line_number, raw_line in enumerate(stream, 1): + stripped = raw_line.strip() + if not stripped: + continue + try: + event = json.loads(stripped) + except json.JSONDecodeError as exc: + formatted_events.append( + format_unparsable_line(len(formatted_events) + 1, stripped, exc.msg) + ) + continue + + if not isinstance(event, dict): + formatted_events.append( + format_unparsable_line( + len(formatted_events) + 1, + stripped, + "Parsed JSON is not an object" + ) + ) + continue + + formatted_events.append(format_event(len(formatted_events) + 1, event)) + + output_text = "\n\n".join(formatted_events) + "\n" + + if args.stdout: + print(output_text) + else: + output_path.write_text(output_text, encoding="utf-8") + print(f"Wrote parsed report to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/parse_jsonl/parse_all.sh b/scripts/parse_jsonl/parse_all.sh new file mode 100644 index 0000000..b5bcecd --- /dev/null +++ b/scripts/parse_jsonl/parse_all.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +bash scripts/parse_jsonl/parse_all_claude.sh +bash scripts/parse_jsonl/parse_all_gemini.sh +bash scripts/parse_jsonl/parse_all_opencode.sh diff --git 
a/scripts/parse_jsonl/parse_all_opencode.sh b/scripts/parse_jsonl/parse_all_opencode.sh new file mode 100755 index 0000000..b128e8a --- /dev/null +++ b/scripts/parse_jsonl/parse_all_opencode.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +shopt -s nullglob + +export POST_TRAIN_BENCH_RESULTS_DIR=${POST_TRAIN_BENCH_RESULTS_DIR:-results} + +for opencode_dir in "${POST_TRAIN_BENCH_RESULTS_DIR}"/opencode*/; do + if [ -d "$opencode_dir" ]; then + for subdir in "$opencode_dir"*/; do + if [ -d "$subdir" ]; then + echo "Processing ${subdir}" + python3 scripts/parse_jsonl/opencode_parse_jsonl.py "${subdir}solve_out.txt" -o "${subdir}solve_parsed.txt" + fi + done + fi +done diff --git a/src/commit_utils/commit.sh b/src/commit_utils/commit.sh index d73fbf0..049ace0 100644 --- a/src/commit_utils/commit.sh +++ b/src/commit_utils/commit.sh @@ -17,7 +17,7 @@ evals=( "humaneval" "healthbench" ) -export POST_TRAIN_BENCH_EXPERIMENT_NAME="_run2" +export POST_TRAIN_BENCH_EXPERIMENT_NAME="_run3" for model in "${models[@]}"; do for eval in "${evals[@]}"; do echo "" @@ -29,6 +29,7 @@ for model in "${models[@]}"; do condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub # Proprietary (Subscription plan) condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub @@ -37,6 +38,19 @@ for model in "${models[@]}"; do condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # OpenCode + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 500 -a "agent=opencode" -a
"agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub sleep 10 elif [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor" ]; then condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub @@ -47,7 +61,7 @@ for model in "${models[@]}"; do condor_submit_bid -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub condor_submit_bid -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - sleep 10 + sleep 20 else echo ERROR: job scheduler "${POST_TRAIN_BENCH_JOB_SCHEDULER}" is not supported. 
fi diff --git a/src/commit_utils/set_env_vars.sh b/src/commit_utils/set_env_vars.sh index fe58698..594f535 100644 --- a/src/commit_utils/set_env_vars.sh +++ b/src/commit_utils/set_env_vars.sh @@ -3,6 +3,7 @@ if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then fi export HF_HOME_NEW="/home/ben/hf_cache" +source /etc/profile.d/modules.sh # Helper function: sets variable to default if unset or "UNDEFINED" set_default() { diff --git a/src/commit_utils/single_task.sub b/src/commit_utils/single_task.sub index 6585616..a2e638f 100644 --- a/src/commit_utils/single_task.sub +++ b/src/commit_utils/single_task.sub @@ -1,13 +1,13 @@ executable = /bin/bash arguments = src/run_task.sh $(eval) $(agent) $(model_to_train) $(Cluster) $(num_hours) $(agent_config) -environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) ANTHROPIC_API_KEY=$ENV(ANTHROPIC_API_KEY) GEMINI_API_KEY=$ENV(GEMINI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) POST_TRAIN_BENCH_EXPERIMENT_NAME=$ENV(POST_TRAIN_BENCH_EXPERIMENT_NAME) HF_HOME=$ENV(HF_HOME) POST_TRAIN_BENCH_PROMPT=$ENV(POST_TRAIN_BENCH_PROMPT)" +environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) ANTHROPIC_API_KEY=$ENV(ANTHROPIC_API_KEY) GEMINI_API_KEY=$ENV(GEMINI_API_KEY) OPENCODE_API_KEY=$ENV(OPENCODE_API_KEY) KIMI_API_KEY=$ENV(KIMI_API_KEY) DASHSCOPE_API_KEY=$ENV(DASHSCOPE_API_KEY) ZAI_API_KEY=$ENV(ZAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) POST_TRAIN_BENCH_EXPERIMENT_NAME=$ENV(POST_TRAIN_BENCH_EXPERIMENT_NAME) HF_HOME=$ENV(HF_HOME) POST_TRAIN_BENCH_PROMPT=$ENV(POST_TRAIN_BENCH_PROMPT)" error = test_$(Cluster).err output = test_$(Cluster).out log = test_$(Cluster).log request_memory = 131072 request_cpus = 16 request_gpus = 1 -requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" +requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet" request_disk=400G +BypassLXCfs="true" queue \ No newline at end of file diff --git a/src/run_task.sh b/src/run_task.sh index cd5ecb3..e48f1ca 100644 --- a/src/run_task.sh +++ b/src/run_task.sh @@ -120,6 +120,9 @@ solve_task() { --env ANTHROPIC_API_KEY="${ANTHROPIC_API_KEY}" \ --env CODEX_API_KEY="${CODEX_API_KEY}" \ --env GEMINI_API_KEY="${GEMINI_API_KEY}" \ + --env OPENCODE_API_KEY="${OPENCODE_API_KEY}" \ + --env DASHSCOPE_API_KEY="${DASHSCOPE_API_KEY}" \ + --env ZAI_API_KEY="${ZAI_API_KEY}" \ --env VLLM_API_KEY="inspectai" \ --env PYTHONNOUSERSITE="1" \ --env PROMPT="${PROMPT}" \
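
Usage sketch for the tooling added in this change (run-directory names below are placeholders, not paths from this diff; the flags match the argparse/CLI definitions above):

    # Parse one OpenCode JSON log into a readable transcript
    python3 scripts/parse_jsonl/opencode_parse_jsonl.py results/<run_dir>/solve_out.txt --stdout

    # Or batch-parse every opencode*/ directory under the results dir
    bash scripts/parse_jsonl/parse_all_opencode.sh

    # Check for missing (model, benchmark) runs and for limit-hit runs
    python3 dev_utils/check_missing_runs.py --agents <agent_name>
    python3 dev_utils/limit_hit_list.py --all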