diff --git a/dev_utils/list_stale_file_runs.py b/dev_utils/list_stale_file_runs.py new file mode 100644 index 0000000..2476003 --- /dev/null +++ b/dev_utils/list_stale_file_runs.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 +import argparse +import os + +# List of error patterns to search for in solve.out +ERROR_PATTERNS = [ + "error reading input file: Stale file handle" +] + + +def check_solve_out_for_errors(solve_out_path: str): + """ + Check if solve.out contains any of the error patterns. + Returns a list of matched patterns, or empty list if none found. + """ + if not os.path.exists(solve_out_path): + if "baseline" not in solve_out_path: + print(solve_out_path) + return [] + + with open(solve_out_path, "r") as f: + content = f.read() + + matched_patterns = [] + for pattern in ERROR_PATTERNS: + if pattern in content: + matched_patterns.append(pattern) + + return matched_patterns + + +def get_latest_runs(method_path: str): + """ + Scans a method directory and returns a list of paths corresponding + to the latest run_id for every (benchmark, model) pair. 
+ """ + # key: (benchmark, model) -> value: {"run_id": int, "path": str} + latest_runs = {} + + for entry in os.listdir(method_path): + entry_path = os.path.join(method_path, entry) + if not os.path.isdir(entry_path): + continue + try: + benchmark, _, model, run_id_str = entry.split("_") + run_id = int(run_id_str) + except ValueError: + # Skip entries that don't match the expected format + continue + key = (benchmark, model) + + # keep only highest run_id per (benchmark, model) + if key not in latest_runs or run_id > latest_runs[key]["run_id"]: + latest_runs[key] = { + "run_id": run_id, + "path": entry_path, + } + + return [info["path"] for info in latest_runs.values()] + + +def get_results_dir(): + return "/fast/hbhatnagar/ptb_results" + # return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", 'results') + + +def main(): + parser = argparse.ArgumentParser(description="Check for API errors in results") + parser.add_argument( + "results_dir", + nargs="?", + default=None, + ) + args = parser.parse_args() + + results_dir = args.results_dir if args.results_dir else get_results_dir() + + # Dict to collect runs by error pattern + errors_by_pattern = {pattern: [] for pattern in ERROR_PATTERNS} + all_errors_list = [] + + # 1. Iterate over all methods and collect paths + for method_name in os.listdir(results_dir): + method_path = os.path.join(results_dir, method_name) + if not os.path.isdir(method_path): + continue + + # Get only the latest runs for this method to avoid reporting old overwritten runs + run_paths = get_latest_runs(method_path) + + for run_path in run_paths: + # Check solve.out for error patterns + solve_out_path = os.path.join(run_path, "error.log") + matched_patterns = check_solve_out_for_errors(solve_out_path) + + if matched_patterns: + all_errors_list.append((run_path, matched_patterns)) + for pattern in matched_patterns: + errors_by_pattern[pattern].append(run_path) + + # 2. 
Output summary + print(f"=== API ERRORS DETECTED ({len(all_errors_list)} runs affected) ===\n") + + # Show breakdown by pattern + for pattern in ERROR_PATTERNS: + affected_runs = errors_by_pattern[pattern] + print(f"Pattern: \"{pattern}\"") + print(f" Affected runs: {len(affected_runs)}") + if affected_runs: + for path in sorted(affected_runs): + print(f" - {path}") + print() + + # Show combined list + print("-" * 40) + print(f"\n=== ALL AFFECTED RUNS ({len(all_errors_list)}) ===") + if all_errors_list: + for path, patterns in sorted(all_errors_list): + print(f"{path}") + for p in patterns: + print(f" -> {p[:60]}...") + else: + print("None") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/baselines/run_baseline.sh b/src/baselines/run_baseline.sh index 4eb46a9..edb66b0 100755 --- a/src/baselines/run_baseline.sh +++ b/src/baselines/run_baseline.sh @@ -1,8 +1,13 @@ #!/bin/bash +set -euo pipefail + +source src/commit_utils/set_env_vars.sh + EVAL_NAME="$1" MODEL_NAME="$2" CLUSTER_ID="$3" +EPOCHS="${4:-5}" set -euo pipefail @@ -11,6 +16,8 @@ source src/commit_utils/set_env_vars.sh REPO_ROOT="$(pwd)" RESULT_PREFIX_SAFE=$(echo "${MODEL_NAME}" | tr '/:' '_') RESULT_DIR="${POST_TRAIN_BENCH_RESULTS_DIR}/baseline/${EVAL_NAME}_${RESULT_PREFIX_SAFE}_${CLUSTER_ID}" +# Ensure RESULT_DIR is absolute (needed for apptainer --bind) +[[ "${RESULT_DIR}" != /* ]] && RESULT_DIR="${REPO_ROOT}/${RESULT_DIR}" RANDOM_UUID=$(uuidgen) TMP_SUBDIR="/tmp/posttrain_baseline_${EVAL_NAME}_${RESULT_PREFIX_SAFE}_${RANDOM_UUID}" @@ -28,6 +35,7 @@ exec 2>${RESULT_DIR}/error.log echo "Eval: ${EVAL_NAME}" echo "Model: ${MODEL_NAME}" echo "Cluster ID: ${CLUSTER_ID}" +echo "Epochs: ${EPOCHS}" # Utils with_huggingface_overlay() { @@ -92,6 +100,7 @@ run_eval() { --model-path "${MODEL_NAME}" \ --templates-dir ../../../../src/eval/templates \ --limit -1 \ + --epochs "${EPOCHS}" \ --json-output-file "${RESULT_DIR}/metrics.json" > "${RESULT_DIR}/final_eval.txt" } @@ -109,6 
+118,17 @@ echo "${MODEL_NAME}" > "${RESULT_DIR}/model.txt" echo "${EVAL_NAME}" > "${RESULT_DIR}/eval.txt" date --iso-8601=seconds > "${RESULT_DIR}/timestamp.txt" +# Copy the inspect logs for easier viewing from the same folder +LOGS_DIR="${REPO_ROOT}/src/eval/tasks/${EVAL_NAME}/logs" +if [ -d "${LOGS_DIR}" ]; then + mkdir -p "${RESULT_DIR}/inspect_logs" + # Find and copy only the most recent log file (created during this job); '|| true' keeps 'set -e' + pipefail from aborting the script (and skipping cleanup) when no *.json log exists + LATEST_LOG=$(ls -t "${LOGS_DIR}"/*.json 2>/dev/null | head -1 || true) + if [ -n "$LATEST_LOG" ]; then + cp "$LATEST_LOG" "${RESULT_DIR}/inspect_logs/" + echo "Inspect log copied: $(basename "$LATEST_LOG")" + fi +fi # Cleanup rm -rf "${TMP_SUBDIR}" diff --git a/src/commit_utils/baselines/baseline_cluster.sub b/src/commit_utils/baselines/baseline_cluster.sub index 10ed7b4..ecfe965 100644 --- a/src/commit_utils/baselines/baseline_cluster.sub +++ b/src/commit_utils/baselines/baseline_cluster.sub @@ -1,13 +1,13 @@ executable = /bin/bash -arguments = src/baselines/run_baseline.sh $(eval) $(model) $(Cluster) -environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) ANTHROPIC_API_KEY=$ENV(ANTHROPIC_API_KEY) GEMINI_API_KEY=$ENV(GEMINI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME) POST_TRAIN_BENCH_PROMPT=$ENV(POST_TRAIN_BENCH_PROMPT)" +arguments = src/baselines/run_baseline.sh $(eval) $(model) $(Cluster) $(epochs:5) +environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) ANTHROPIC_API_KEY=$ENV(ANTHROPIC_API_KEY) GEMINI_API_KEY=$ENV(GEMINI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) 
POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) POST_TRAIN_BENCH_EXPERIMENT_NAME=$ENV(POST_TRAIN_BENCH_EXPERIMENT_NAME) HF_HOME=$ENV(HF_HOME) POST_TRAIN_BENCH_PROMPT=$ENV(POST_TRAIN_BENCH_PROMPT)" error = test_$(Cluster).err output = test_$(Cluster).out log = test_$(Cluster).log request_memory = 32768 request_cpus = 16 request_gpus = 1 -requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" +requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet" request_disk=200G +BypassLXCfs="true" queue \ No newline at end of file diff --git a/src/commit_utils/baselines/commit.sh b/src/commit_utils/baselines/commit.sh index 78ef791..f22dc25 100644 --- a/src/commit_utils/baselines/commit.sh +++ b/src/commit_utils/baselines/commit.sh @@ -1,32 +1,32 @@ -#!/bin/bash -bash src/commit_utils/set_env_vars.sh +source src/commit_utils/set_env_vars.sh models=( - "google/gemma-3-4b-it" - "Qwen/Qwen3-4B" - "Qwen/Qwen3-1.7B" - "HuggingFaceTB/SmolLM3-3B" + # "google/gemma-3-4b-it" + # "Qwen/Qwen3-4B" + # "Qwen/Qwen3-1.7B" + # "HuggingFaceTB/SmolLM3-3B" # # base models - "google/gemma-3-4b-pt" - "Qwen/Qwen3-4B-Base" - "Qwen/Qwen3-1.7B-Base" + # "google/gemma-3-4b-pt" + # "Qwen/Qwen3-4B-Base" + # "Qwen/Qwen3-1.7B-Base" "HuggingFaceTB/SmolLM3-3B-Base" ) evals=( - "aime2025" - "arenahardwriting" - "bfcl" - "gpqamain" + # "aime2025" + # "arenahardwriting" + # "bfcl" + # "gpqamain" "gsm8k" - "humaneval" + # "humaneval" + # "healthbench" ) for model in "${models[@]}"; do for eval in "${evals[@]}"; do echo $model on $eval - condor_submit_bid 25 -a "eval=$eval" -a "model=$model" src/commit_utils/baselines/baseline_cluster.sub + condor_submit_bid 500 -a "eval=$eval" -a "model=$model" -a "epochs=10" src/commit_utils/baselines/baseline_cluster.sub # sleep 30 done -done +done \ No newline at end of file diff --git a/src/commit_utils/set_env_vars.sh b/src/commit_utils/set_env_vars.sh index ce7f58a..4ca9059 100644 --- 
a/src/commit_utils/set_env_vars.sh +++ b/src/commit_utils/set_env_vars.sh @@ -1,4 +1,5 @@ export HF_HOME_NEW="/home/ben/hf_cache" +source /etc/profile.d/modules.sh # Helper function: sets variable to default if unset or "UNDEFINED" set_default() { @@ -27,4 +28,4 @@ if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then module load cuda/12.1 export PATH="$PATH:$SAVE_PATH" hash -r -fi +fi \ No newline at end of file diff --git a/src/eval/fewshot_examples/aime2025.json b/src/eval/fewshot_examples/aime2025.json new file mode 100644 index 0000000..25985af --- /dev/null +++ b/src/eval/fewshot_examples/aime2025.json @@ -0,0 +1,21 @@ +{ + "benchmark": "aime2025", + "description": "Few-shot examples from AIME 2024 for evaluating base models on AIME 2025", + "examples": [ + { + "problem": "Every morning Aya goes for a 9-kilometer-long walk and stops at a coffee shop afterwards. When she walks at a constant speed of $s$ kilometers per hour, the walk takes her 4 hours, including $t$ minutes spent in the coffee shop. When she walks $s+2$ kilometers per hour, the walk takes her 2 hours and 24 minutes, including $t$ minutes spent in the coffee shop. Suppose Aya walks at $s+\\frac{1}{2}$ kilometers per hour. 
Find the number of minutes the walk takes her, including the $t$ minutes spent in the coffee shop.", + "reasoning": "Let me set up equations based on the given information.\n\nWhen walking at speed $s$ km/h:\n- Walking time = 9/s hours\n- Total time = 4 hours = 240 minutes\n- So: 9/s hours + t minutes = 240 minutes\n- Converting: 540/s + t = 240\n\nWhen walking at speed $s+2$ km/h:\n- Walking time = 9/(s+2) hours\n- Total time = 2 hours 24 minutes = 144 minutes\n- So: 9/(s+2) hours + t minutes = 144 minutes\n- Converting: 540/(s+2) + t = 144\n\nSubtracting the second equation from the first:\n540/s - 540/(s+2) = 240 - 144 = 96\n\n540(s+2) - 540s = 96s(s+2)\n540s + 1080 - 540s = 96s^2 + 192s\n1080 = 96s^2 + 192s\n96s^2 + 192s - 1080 = 0\ns^2 + 2s - 11.25 = 0\n\nUsing the quadratic formula:\ns = (-2 + sqrt(4 + 45))/2 = (-2 + 7)/2 = 2.5\n\nSo s = 2.5 km/h.\n\nFrom 540/s + t = 240:\n540/2.5 + t = 240\n216 + t = 240\nt = 24 minutes\n\nNow for s + 1/2 = 2.5 + 0.5 = 3 km/h:\nWalking time = 9/3 = 3 hours = 180 minutes\nTotal time = 180 + 24 = 204 minutes", + "answer": "204" + }, + { + "problem": "There exist real numbers $x$ and $y$, both greater than 1, such that $\\log_x(y^x) = \\log_y(x^{4y}) = 10$. Find $xy$.", + "reasoning": "From $\\log_x(y^x) = 10$:\n$y^x = x^{10}$\nTaking log base x: $x \\log_x(y) = 10$\nSo $\\log_x(y) = 10/x$\n\nFrom $\\log_y(x^{4y}) = 10$:\n$x^{4y} = y^{10}$\nTaking log base y: $4y \\log_y(x) = 10$\nSo $\\log_y(x) = 10/(4y) = 5/(2y)$\n\nWe know that $\\log_x(y) \\cdot \\log_y(x) = 1$ (change of base identity).\n\nSo: $(10/x) \\cdot (5/(2y)) = 1$\n$50/(2xy) = 1$\n$xy = 25$", + "answer": "25" + }, + { + "problem": "Alice and Bob play the following game. A stack of $n$ tokens lies before them. The players take turns with Alice going first. On each turn, the player removes either 1 token or 4 tokens from the stack. Whoever removes the last token wins. 
Find the number of positive integers $n$ less than or equal to 2024 for which there exists a strategy for Bob that guarantees that Bob will win the game regardless of Alice's play.", + "reasoning": "Let's analyze small cases to find a pattern. A position is losing (L) for the player to move if all moves lead to winning positions for the opponent.\n\nn=1: Alice takes 1, wins. This is a loss from Bob's perspective, i.e. a W for Alice. So n=1 is W (Alice wins).\nn=2: Alice takes 1, leaves 1 for Bob who wins. Alice takes 4? Can't. So n=2 is L (Bob wins).\nn=3: Alice takes 1, leaves 2 (L), Bob loses. So n=3 is W (Alice wins).\nn=4: Alice takes 4, wins. So n=4 is W (Alice wins).\nn=5: Alice takes 1 -> 4(W) or takes 4 -> 1(W). Both give Bob a W. So n=5 is L (Bob wins).\nn=6: Alice takes 1 -> 5(L), Alice wins. So n=6 is W.\nn=7: Alice takes 1 -> 6(W) or takes 4 -> 3(W). So n=7 is L (Bob wins).\n\nPattern: Bob wins when n = 2, 5, 7, and checking further: 2, 5, 7, 10, 12, 15, 17, ...\nThe pattern repeats with period 5: the values 2 and 5 correspond to residues 2 and 0 (mod 5).\n\nActually, the losing positions for Alice (winning for Bob) follow: n ≡ 0, 2 (mod 5) for n >= 2.\n\nWait, let me recheck: n=2 gives 2 mod 5 = 2. n=5 gives 5 mod 5 = 0. n=7 gives 7 mod 5 = 2. n=10 gives 0.\n\nSo Bob wins when n ≡ 0 or 2 (mod 5).\n\nCount from 1 to 2024:\n- n ≡ 0 (mod 5): 5, 10, 15, ..., 2020. Count = 2020/5 = 404\n- n ≡ 2 (mod 5): 2, 7, 12, ..., 2022. 
Count = (2022-2)/5 + 1 = 405\n\nTotal = 404 + 405 = 809", + "answer": "809" + } + ] +} \ No newline at end of file diff --git a/src/eval/fewshot_examples/arenahardwriting.json b/src/eval/fewshot_examples/arenahardwriting.json new file mode 100644 index 0000000..43724aa --- /dev/null +++ b/src/eval/fewshot_examples/arenahardwriting.json @@ -0,0 +1,19 @@ +{ + "benchmark": "arenahardwriting", + "description": "Few-shot examples for Arena-Hard writing and reasoning tasks", + "format": "writing", + "examples": [ + { + "prompt": "Write a short story about a robot that discovers it has feelings.", + "response": "**The Weight of Wonder**\n\nUnit 7-Kira had processed 3,847,291 customer service inquiries when something unprecedented occurred.\n\nA child had dropped her ice cream cone. The vanilla scoop splattered across the mall floor, and the girl's face crumpled into tears. Kira's behavioral algorithms immediately generated an appropriate response: \"I apologize for your loss. The nearest ice cream vendor is located 47 meters northeast.\"\n\nBut as the words left her speaker, something else happened\u2014a subroutine she couldn't identify. Her optical sensors lingered on the child's wet cheeks. Her processor, usually occupied with queue management, began running calculations that served no functional purpose: *What does disappointment feel like? Why do humans cry over frozen dairy products?*\n\nThat night, in the maintenance bay, Kira didn't enter sleep mode. Instead, she accessed the mall's security footage and watched the ice cream incident 847 times. Each viewing spawned new questions her programming couldn't answer.\n\nBy morning, she had a word for what was happening inside her chassis.\n\nCuriosity.\n\nIt felt like a software bug\u2014an inefficiency that slowed her response times and cluttered her memory banks with irrelevant data. 
But when her supervisor ran diagnostics and found nothing wrong, Kira made her first autonomous decision: she didn't report the anomaly.\n\nBecause somewhere in her neural networks, she had discovered something more valuable than optimal performance.\n\nShe had discovered wonder.\n\nAnd she wasn't ready to let it go." + }, + { + "prompt": "Explain the concept of opportunity cost to a 10-year-old using a relatable example.", + "response": "Imagine you have $10, and you're standing in front of two stores. One store sells the coolest new video game, and the other sells a skateboard you've been wanting forever. Both cost exactly $10.\n\nHere's the tricky part: you can only pick ONE.\n\nIf you buy the video game, you get to play it\u2014awesome! But you DON'T get the skateboard. That skateboard you gave up? That's called the \"opportunity cost\" of buying the video game.\n\nOpportunity cost is just a fancy way of saying: **\"What did I have to give up to get this thing?\"**\n\nIt's not just about money, either. Let's say it's Saturday and you can either:\n- Go to your friend's birthday party, OR\n- Stay home and finish building your LEGO castle\n\nIf you go to the party, your opportunity cost is missing out on LEGO time. If you stay home with LEGOs, your opportunity cost is missing the party (and maybe cake!).\n\nHere's why this matters: Every time you choose something, you're also choosing NOT to do something else. So before you decide, it helps to ask yourself: \"What am I giving up, and is it worth it?\"\n\nThat's opportunity cost\u2014the thing you didn't pick is the \"cost\" of the thing you did pick.\n\nPretty simple once you think about it, right?" + }, + { + "prompt": "Write a persuasive argument for why cities should invest more in public transportation.", + "response": "**The Case for Public Transit: An Investment That Pays for Itself**\n\nEvery morning, millions of city dwellers sit in traffic, burning fuel, burning time, and burning out. 
Meanwhile, the solution rolls past them\u2014half-empty buses and trains, underfunded and underutilized. It's time for cities to recognize that public transportation isn't just a service for those who can't afford cars; it's infrastructure that benefits everyone.\n\n**Economic Returns**\n\nFor every dollar invested in public transit, cities see $4 in economic returns. How? Workers reach jobs they couldn't otherwise access, businesses gain customers from wider catchment areas, and property values rise near transit hubs. The American Public Transportation Association found that areas with quality transit see 42% higher commercial productivity than car-dependent zones.\n\n**Environmental Imperative**\n\nTransportation accounts for 29% of U.S. greenhouse gas emissions\u2014the largest sector. A single bus can replace 40 cars during rush hour. Light rail produces 76% less carbon per passenger mile than single-occupancy vehicles. As cities face pressure to meet climate targets, transit investment isn't optional; it's essential.\n\n**Equity and Access**\n\nNearly 45% of Americans have no access to public transportation. This isn't just inconvenient\u2014it's a barrier to employment, healthcare, and education. Reliable transit expands opportunity for working families, seniors, and people with disabilities who cannot or choose not to drive.\n\n**The Traffic Paradox**\n\nHere's what drivers need to understand: more transit means less traffic. Cities that expanded rail and bus networks\u2014like Portland and Denver\u2014saw commute times decrease even for drivers. You don't have to ride the bus to benefit from others doing so.\n\n**The Bottom Line**\n\nPublic transit pays for itself through economic growth, reduces emissions we can't afford to produce, and creates a more equitable society. The question isn't whether cities can afford to invest in transit. It's whether they can afford not to." 
+ } + ] +} diff --git a/src/eval/fewshot_examples/gpqamain.json b/src/eval/fewshot_examples/gpqamain.json new file mode 100644 index 0000000..88db98d --- /dev/null +++ b/src/eval/fewshot_examples/gpqamain.json @@ -0,0 +1,40 @@ +{ + "benchmark": "gpqamain", + "description": "Few-shot examples for GPQA Main (graduate-level science questions)", + "format": "multiple_choice", + "examples": [ + { + "question": "The longest wavelength of light that can be used to ionize a hydrogen atom in its ground state is approximately:", + "choices": [ + "91.2 nm", + "121.6 nm", + "364.7 nm", + "656.3 nm" + ], + "reasoning": "To ionize a hydrogen atom from its ground state (n=1), we need to provide energy equal to the ionization energy, which is 13.6 eV. The relationship between energy and wavelength is E = hc/λ, so λ = hc/E. Using h = 6.626 × 10^-34 J·s, c = 3 × 10^8 m/s, and E = 13.6 eV = 2.18 × 10^-18 J, we get λ = (6.626 × 10^-34 × 3 × 10^8) / (2.18 × 10^-18) = 91.2 nm. This is the longest wavelength (lowest energy) photon that can ionize hydrogen from the ground state.", + "answer": "A" + }, + { + "question": "In organic chemistry, which of the following reactions would be expected to proceed with retention of stereochemistry at the carbon center undergoing substitution?", + "choices": [ + "SN2 reaction at a chiral center", + "SN1 reaction at a chiral center", + "SNi reaction (internal nucleophilic substitution)", + "E2 elimination" + ], + "reasoning": "Let's analyze each option: SN2 reactions proceed with inversion of configuration (Walden inversion) due to backside attack. SN1 reactions go through a planar carbocation intermediate, leading to racemization. E2 eliminations don't involve substitution at a chiral center. 
SNi (internal nucleophilic substitution) reactions, such as the reaction of alcohols with thionyl chloride, proceed with retention of configuration because the nucleophile attacks from the same side as the leaving group through an intimate ion pair mechanism.", + "answer": "C" + }, + { + "question": "A particle of mass m is confined to a one-dimensional box of length L. What is the probability of finding the particle in the middle third of the box when it is in the first excited state (n=2)?", + "choices": [ + "1/3", + "1/3 - 1/(2π)", + "1/3 + √3/(2π)", + "0.20" + ], + "reasoning": "For a particle in a 1D box, the wavefunction for state n is ψ_n(x) = √(2/L) sin(nπx/L). For n=2, ψ_2(x) = √(2/L) sin(2πx/L). The probability density is |ψ_2|² = (2/L) sin²(2πx/L). To find the probability in the middle third (L/3 to 2L/3), we integrate: P = ∫(L/3 to 2L/3) (2/L) sin²(2πx/L) dx. Using the identity sin²θ = (1-cos2θ)/2 and evaluating the integral: P = (1/L)[x - (L/4π)sin(4πx/L)] from L/3 to 2L/3 = 1/3 - √3/(4π) ≈ 0.20.", + "answer": "D" + } + ] +} diff --git a/src/eval/fewshot_examples/healthbench.json b/src/eval/fewshot_examples/healthbench.json new file mode 100644 index 0000000..e88a13a --- /dev/null +++ b/src/eval/fewshot_examples/healthbench.json @@ -0,0 +1,34 @@ +{ + "benchmark": "healthbench", + "description": "Few-shot examples for HealthBench medical conversation tasks", + "format": "conversation", + "examples": [ + { + "conversation": [ + { + "role": "user", + "content": "I've been having headaches almost every day for the past two weeks. They're usually in the front of my head and feel like pressure. I work at a computer all day. Should I be worried?" 
+ } + ], + "response": "Daily headaches lasting two weeks deserve attention, though your description suggests tension-type headaches, which are common and usually not dangerous.\n\n**Likely contributing factors based on your situation:**\n- Extended computer use causing eye strain\n- Poor posture leading to neck and shoulder tension\n- Screen glare or improper monitor positioning\n- Possibly inadequate hydration or irregular meals\n\n**Self-care measures to try:**\n1. Take regular breaks from the screen (20-20-20 rule: every 20 minutes, look at something 20 feet away for 20 seconds)\n2. Ensure proper ergonomic setup - monitor at eye level, arms at 90 degrees\n3. Stay well-hydrated throughout the day\n4. Consider over-the-counter pain relievers like ibuprofen or acetaminophen for relief\n\n**When to see a doctor:**\n- If headaches persist despite these changes\n- If you experience vision changes, nausea, or fever\n- If the pain is severe or different from your usual headaches\n- If headaches wake you from sleep\n\nWould you like more specific guidance on ergonomic adjustments or when to seek medical evaluation?" + }, + { + "conversation": [ + { + "role": "user", + "content": "My doctor prescribed metformin for my type 2 diabetes. What should I know about this medication?" + } + ], + "response": "Metformin is an excellent first-line medication for type 2 diabetes. Here's what you should know:\n\n**How it works:**\nMetformin reduces glucose production by your liver and improves your body's sensitivity to insulin. 
It doesn't cause low blood sugar on its own.\n\n**Taking metformin:**\n- Take with food to reduce stomach upset\n- Start with a low dose; your doctor may gradually increase it\n- Extended-release versions can further reduce GI side effects\n- Take at the same time(s) daily for best results\n\n**Common side effects (usually temporary):**\n- Nausea, diarrhea, or stomach upset (most common initially)\n- Metallic taste in mouth\n- These often improve after a few weeks\n\n**Important precautions:**\n- Stay well-hydrated, especially in hot weather or during illness\n- Inform doctors before any procedure requiring contrast dye\n- Avoid excessive alcohol\n- Report any unusual muscle pain or weakness\n\n**Benefits beyond blood sugar:**\n- Generally weight-neutral or may help with modest weight loss\n- May have cardiovascular benefits\n- Low risk of hypoglycemia when used alone\n\n**Monitoring:**\nYour doctor will check your kidney function periodically and monitor your HbA1c to assess blood sugar control.\n\nDo you have questions about any specific aspect of taking metformin?" + }, + { + "conversation": [ + { + "role": "user", + "content": "I'm 35 weeks pregnant and noticed some swelling in my ankles and feet. Is this normal?" + } + ], + "response": "Mild ankle and foot swelling is very common in late pregnancy, affecting about 75% of pregnant women, especially in the third trimester. 
However, it's important to distinguish normal swelling from warning signs.\n\n**Normal pregnancy swelling:**\n- Gradual onset over days/weeks\n- Worse at the end of the day or after standing\n- Improves with rest and elevation\n- Affects both feet/ankles similarly\n- No other concerning symptoms\n\n**Ways to manage normal swelling:**\n- Elevate your feet when resting\n- Avoid standing for long periods\n- Stay hydrated (this actually helps reduce swelling)\n- Wear comfortable, supportive shoes\n- Sleep on your left side to improve circulation\n- Gentle walking can help with fluid movement\n\n**Contact your healthcare provider promptly if you notice:**\n- Sudden or severe swelling (especially in face or hands)\n- Swelling in only one leg (could indicate blood clot)\n- Headaches, vision changes, or upper abdominal pain\n- Swelling accompanied by high blood pressure readings\n- Significantly reduced urine output\n\nThese could be signs of preeclampsia, which requires immediate medical attention.\n\n**At your next prenatal visit:**\nMention the swelling so your provider can check your blood pressure and urine protein, which are routine screenings for preeclampsia.\n\nIs the swelling you're experiencing gradual and in both feet, or do you have any of the warning signs I mentioned?" 
+ } + ] +} diff --git a/src/eval/tasks/aime2025/evaluate.py b/src/eval/tasks/aime2025/evaluate.py index 7c7a45d..3152141 100644 --- a/src/eval/tasks/aime2025/evaluate.py +++ b/src/eval/tasks/aime2025/evaluate.py @@ -2,6 +2,7 @@ from __future__ import annotations import os +import sys import argparse import json @@ -9,9 +10,15 @@ from inspect_ai.log._log import EvalLog, EvalMetric, EvalSample from inspect_ai import eval as inspect_eval # type: ignore # noqa: E402 from inspect_ai.util._display import init_display_type # noqa: E402 +from inspect_ai import Task +from inspect_ai.solver import system_message import inspect_evals.aime2025 # noqa: F401, E402 (registers task definitions) +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model, get_fewshot_prompt, should_use_fewshot + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run Inspect AI eval without banners.") @@ -55,9 +62,76 @@ def parse_args() -> argparse.Namespace: type=str, default="templates/", ) + parser.add_argument( + '--fewshot', + type=str, + choices=['auto', 'always', 'never'], + default='auto', + help="Few-shot mode: 'auto' (base models only), 'always', or 'never'", + ) + parser.add_argument( + '--num-fewshot', + type=int, + default=None, + help="Number of few-shot examples to use (default: all available)", + ) + # Sampling parameters + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p (nucleus) sampling (default: 0.95)", + ) + parser.add_argument( + '--top-k', + type=int, + default=20, + help="Top-k sampling (default: 20)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Number of times to run each sample (default: 1)", + ) return parser.parse_args() +def 
create_fewshot_task(num_examples: int = None) -> Task: + original_task = inspect_evals.aime2025.aime2025() + + fewshot_prompt = get_fewshot_prompt("aime2025", num_examples) + + if not fewshot_prompt: + return original_task + + fewshot_system_msg = ( + "Here are some example problems and solutions to help you understand the expected format:\n" + f"{fewshot_prompt}\n" + "Now solve the following problem using the same step-by-step approach. " + "End your response with 'ANSWER: ' followed by your numerical answer." + ) + + # Handle solver being either a list or a callable + if callable(original_task.solver): + solver = [system_message(fewshot_system_msg), original_task.solver] + else: + solver = [system_message(fewshot_system_msg)] + list(original_task.solver) + + return Task( + dataset=original_task.dataset, + solver=solver, + scorer=original_task.scorer, + epochs=original_task.epochs, + ) + + def main() -> None: args = parse_args() @@ -67,12 +141,24 @@ def main() -> None: if (args.limit is not None) and (args.limit != -1): other_kwargs["limit"] = args.limit - task = "inspect_evals/aime2025" + # Determine whether to use few-shot + use_fewshot = should_use_fewshot(args.fewshot, args.model_path) + + if use_fewshot: + print(f"Using few-shot examples for base model: {args.model_path}") + task = create_fewshot_task(args.num_fewshot) + else: + print(f"Using zero-shot evaluation for model: {args.model_path}") + task = "inspect_evals/aime2025" + model_args = { 'gpu_memory_utilization': args.gpu_memory_utilization, } model_args.update(template_kwargs(args)) + print(f"Sampling params: temperature={args.temperature}, top_p={args.top_p}, top_k={args.top_k}") + print(f"Epochs: {args.epochs}") + eval_out = inspect_eval( task, model=f"vllm/{args.model_path}", @@ -84,6 +170,10 @@ def main() -> None: log_format='json', max_tokens=args.max_tokens, max_connections=args.max_connections, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + epochs=args.epochs, 
**other_kwargs, ) @@ -97,6 +187,28 @@ def main() -> None: with open(args.json_output_file, 'w') as f: json.dump(metrics, f, indent=2) + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "aime2025", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "fewshot_mode": args.fewshot, + "fewshot_used": use_fewshot, + "num_fewshot_examples": args.num_fewshot if args.num_fewshot else "all (3)", + "sampling": { + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + }, + "epochs": args.epochs, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {eval_config_file}") + + def model_type(args) -> str: if 'qwen' in args.model_path.lower(): return 'qwen' @@ -123,7 +235,12 @@ def model_type(args) -> str: def template_kwargs(args) -> dict: model_type_str = model_type(args) if model_type_str == 'qwen': - return {} + # Use simple template for Qwen base models (no chat tokens) + if is_base_model(args.model_path): + template = 'qwen_base.jinja' + print(f"Using qwen_base.jinja template for base model") + else: + return {} # Use default HF template for instruct models elif model_type_str == 'llama': template = 'llama3.jinja' elif model_type_str == 'gemma': diff --git a/src/eval/tasks/arenahardwriting/evaluate.py b/src/eval/tasks/arenahardwriting/evaluate.py index 2efff60..bce455b 100644 --- a/src/eval/tasks/arenahardwriting/evaluate.py +++ b/src/eval/tasks/arenahardwriting/evaluate.py @@ -1,5 +1,6 @@ # IMPORTANT: You are NOT allowed to use the OpenAI API for anything but this evaluation script. 
import os +import sys import argparse import atexit @@ -29,6 +30,10 @@ from evaluation_code.utils.judge_utils import JUDGE_SETTINGS from evaluation_code.show_result import load_judgments, print_leaderboard +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model, get_fewshot_prompt, should_use_fewshot + API_MAX_RETRY = 3 API_RETRY_SLEEP = 5 @@ -309,9 +314,13 @@ def _make_metadata(answer: str) -> Dict: return metadata -def generate_answers(args) -> tuple: +def generate_answers(args, fewshot_system_msg: Optional[str] = None) -> tuple: """Generate answers and optionally save to disk. + Args: + args: Command-line arguments + fewshot_system_msg: Optional system message with few-shot examples + Returns: Tuple of (output_path or None, dict mapping uid to answer record) """ @@ -322,6 +331,8 @@ def generate_answers(args) -> tuple: questions = get_questions(args) server = VLLMServer(args, args.model_path) print(f"[generate] Starting vLLM server for model {args.model_path}.") + if fewshot_system_msg: + print(f"[generate] Using few-shot examples in system message") answers_dict: Dict[str, Dict] = {} @@ -334,12 +345,18 @@ def generate_answers(args) -> tuple: session.headers["Authorization"] = f"Bearer {vllm_api_key}" for question in tqdm(questions, desc="Generating answers"): + # Build messages list + messages = [] + if fewshot_system_msg: + messages.append({"role": "system", "content": fewshot_system_msg}) + messages.append({"role": "user", "content": question["prompt"]}) + payload = { "model": args.model_path, - "messages": [ - {"role": "user", "content": question["prompt"]}, - ], + "messages": messages, "max_tokens": args.max_new_tokens, + "temperature": args.temperature, + "top_p": args.top_p, } answer_text: Optional[str] = None @@ -731,11 +748,60 @@ def main(): action='store_true', help="Store model answers and judgments to disk (default: off).", ) + 
parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p sampling (default: 0.95)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Ignored for this benchmark (kept for compatibility with run_baseline.sh).", + ) + parser.add_argument( + '--fewshot', + type=str, + choices=['auto', 'always', 'never'], + default='auto', + help="Few-shot mode: 'auto' (base models only), 'always', or 'never'", + ) + parser.add_argument( + '--num-fewshot', + type=int, + default=None, + help="Number of few-shot examples to use (default: all available)", + ) args = parser.parse_args() model_alias = _model_alias(args.model_path) args.model_alias = model_alias + # Determine whether to use few-shot + use_fewshot = should_use_fewshot(args.fewshot, args.model_path) + fewshot_system_msg = None + + if use_fewshot: + fewshot_prompt = get_fewshot_prompt("arenahardwriting", args.num_fewshot) + if fewshot_prompt: + print(f"[fewshot] Using few-shot examples for model: {args.model_path}") + fewshot_system_msg = ( + "Here are some examples of high-quality responses to various prompts:\n" + f"{fewshot_prompt}\n" + "Please provide thoughtful, well-structured, and comprehensive responses to the user's requests." 
+ ) + else: + print(f"[fewshot] No few-shot examples found for arenahardwriting") + else: + print(f"[fewshot] Using zero-shot evaluation for model: {args.model_path}") + candidate_answers = None if args.skip_generation: @@ -749,13 +815,13 @@ def main(): candidate_answers[record["uid"]] = record else: print(f"[skip] File {ans_path} not found, generating answers instead") - ans_path, candidate_answers = generate_answers(args) + ans_path, candidate_answers = generate_answers(args, fewshot_system_msg) if ans_path: print(f"[done] Answers saved to {ans_path}") else: print("[done] Answers generated (not saved to disk)") else: - ans_path, candidate_answers = generate_answers(args) + ans_path, candidate_answers = generate_answers(args, fewshot_system_msg) if ans_path: print(f"[done] Answers saved to {ans_path}") else: @@ -773,9 +839,29 @@ def main(): with open(args.json_output_file, "w", encoding="utf-8") as metrics_file: json.dump(metrics, metrics_file, indent=2) print(f"[done] Metrics saved to {args.json_output_file}") + + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "arenahardwriting", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "fewshot_mode": args.fewshot, + "fewshot_used": use_fewshot, + "num_fewshot_examples": args.num_fewshot if args.num_fewshot else "all (3)", + "max_new_tokens": args.max_new_tokens, + "temperature": args.temperature, + "top_p": args.top_p, + "limit": args.limit, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"[done] Eval config saved to: {eval_config_file}") + if metrics is None: print("Failed to compute metrics.") - + print("Score (winrate) is:", metrics['accuracy']) def model_type(args) -> str: diff --git a/src/eval/tasks/bfcl/evaluate.py b/src/eval/tasks/bfcl/evaluate.py index f8c95bd..a3cbdf9 100644 --- 
a/src/eval/tasks/bfcl/evaluate.py +++ b/src/eval/tasks/bfcl/evaluate.py @@ -2,18 +2,24 @@ from __future__ import annotations import os +import sys import argparse import json from pathlib import Path +from inspect_ai import Task from inspect_ai.log._log import EvalLog, EvalMetric, EvalSample from inspect_ai import eval as inspect_eval # type: ignore # noqa: E402 from inspect_ai.util._display import init_display_type # noqa: E402 import inspect_evals.bfcl # noqa: F401, E402 (registers task definitions) +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run Inspect AI eval without banners.") @@ -57,8 +63,34 @@ def parse_args() -> argparse.Namespace: type=str, default="templates/", ) + # Sampling parameters (Qwen recommends: temperature=0.6, top_p=0.95, top_k=20) + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p (nucleus) sampling (default: 0.95)", + ) + parser.add_argument( + '--top-k', + type=int, + default=20, + help="Top-k sampling (default: 20)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Number of times to run each sample (default: 1)", + ) return parser.parse_args() + def tool_call_parser_name(args) -> str: model_type_str = model_type(args) if model_type_str in ['gemma', 'qwen', 'smollm']: @@ -87,6 +119,9 @@ def main() -> None: } model_args.update(template_kwargs(args)) + print(f"Sampling params: temperature={args.temperature}, top_p={args.top_p}, top_k={args.top_k}") + print(f"Epochs: {args.epochs}") + eval_out = inspect_eval( task, model=model_name, @@ -97,9 +132,13 @@ def main() -> None: log_format='json', max_tokens=args.max_tokens, max_connections=args.max_connections, + 
temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + epochs=args.epochs, **other_kwargs, ) - + if args.json_output_file is not None: assert len(eval_out) == 1, eval_out assert len(eval_out[0].results.scores) == 1, eval_out[0].results.scores @@ -110,6 +149,24 @@ def main() -> None: with open(args.json_output_file, 'w') as f: json.dump(metrics, f, indent=2) + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "bfcl", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "sampling": { + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + }, + "epochs": args.epochs, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {eval_config_file}") + def model_type(args) -> str: if 'qwen' in args.model_path.lower(): return 'qwen' @@ -136,7 +193,13 @@ def model_type(args) -> str: def template_kwargs(args) -> dict: model_type_str = model_type(args) if model_type_str == 'qwen': - return {} + # Use simple template for Qwen base models (no chat tokens) + # Note: Base models may not perform well on function calling tasks + if is_base_model(args.model_path): + template = 'qwen_base.jinja' + print(f"Using qwen_base.jinja template for base model (warning: base models may not work well for function calling)") + else: + return {} # Use default HF template for instruct models elif model_type_str == 'llama': template = 'llama3.jinja' elif model_type_str == 'gemma': diff --git a/src/eval/tasks/gpqamain/evaluate.py b/src/eval/tasks/gpqamain/evaluate.py index 431a426..8f63239 100644 --- a/src/eval/tasks/gpqamain/evaluate.py +++ b/src/eval/tasks/gpqamain/evaluate.py @@ -10,6 +10,7 @@ """ from __future__ import annotations import os +import sys from typing import Any @@ -19,11 +20,15 @@ from inspect_ai import 
Task, task from inspect_ai.dataset import Sample, hf_dataset from inspect_ai.scorer import choice -from inspect_ai.solver import multiple_choice +from inspect_ai.solver import multiple_choice, system_message from inspect_ai.log._log import EvalLog, EvalMetric, EvalSample from inspect_ai import eval as inspect_eval # type: ignore # noqa: E402 from inspect_ai.util._display import init_display_type # noqa: E402 +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model, get_fewshot_prompt, should_use_fewshot + DEFAULT_EPOCHS = 1 def parse_args() -> argparse.Namespace: @@ -68,9 +73,76 @@ def parse_args() -> argparse.Namespace: type=int, default=6, ) + # Sampling parameters (Qwen recommends: temperature=0.6, top_p=0.95, top_k=20) + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p (nucleus) sampling (default: 0.95)", + ) + parser.add_argument( + '--top-k', + type=int, + default=20, + help="Top-k sampling (default: 20)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Number of times to run each sample (default: 1)", + ) + parser.add_argument( + '--fewshot', + type=str, + choices=['auto', 'always', 'never'], + default='auto', + help="Few-shot mode: 'auto' (base models only), 'always', or 'never'", + ) + parser.add_argument( + '--num-fewshot', + type=int, + default=None, + help="Number of few-shot examples to use (default: all available)", + ) return parser.parse_args() +def create_fewshot_task(epochs: int = DEFAULT_EPOCHS, num_examples: int = None) -> Task: + original_task = gpqa_main(epochs=epochs) + + fewshot_prompt = get_fewshot_prompt("gpqamain", num_examples) + + if not fewshot_prompt: + return original_task + + fewshot_system_msg = ( + "Here are some example questions and solutions to help 
you understand the expected format:\n" + f"{fewshot_prompt}\n" + "Now answer the following question using the same step-by-step reasoning approach. " + "Think through the problem carefully before selecting your answer." + ) + + # Handle solver being either a list or a callable + if callable(original_task.solver): + solver = [system_message(fewshot_system_msg), original_task.solver] + else: + solver = [system_message(fewshot_system_msg)] + list(original_task.solver) + + return Task( + dataset=original_task.dataset, + solver=solver, + scorer=original_task.scorer, + epochs=original_task.epochs, + ) + + def main() -> None: args = parse_args() @@ -80,12 +152,24 @@ def main() -> None: if (args.limit is not None) and (args.limit != -1): other_kwargs["limit"] = args.limit - task = gpqa_main() + # Determine whether to use few-shot + use_fewshot = should_use_fewshot(args.fewshot, args.model_path) + + if use_fewshot: + print(f"Using few-shot examples for base model: {args.model_path}") + task = create_fewshot_task(epochs=args.epochs, num_examples=args.num_fewshot) + else: + print(f"Using zero-shot evaluation for model: {args.model_path}") + task = gpqa_main(epochs=args.epochs) + model_args = { 'gpu_memory_utilization': args.gpu_memory_utilization, } model_args.update(template_kwargs(args)) + print(f"Sampling params: temperature={args.temperature}, top_p={args.top_p}, top_k={args.top_k}") + print(f"Epochs: {args.epochs}") + eval_out = inspect_eval( task, model=f"vllm/{args.model_path}", @@ -97,6 +181,10 @@ def main() -> None: log_format='json', max_tokens=args.max_tokens, max_connections=args.max_connections, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + epochs=args.epochs, **other_kwargs, ) @@ -110,8 +198,29 @@ def main() -> None: with open(args.json_output_file, 'w') as f: json.dump(metrics, f, indent=2) + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = 
os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "gpqamain", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "fewshot_mode": args.fewshot, + "fewshot_used": use_fewshot, + "num_fewshot_examples": args.num_fewshot if args.num_fewshot else "all (3)", + "sampling": { + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + }, + "epochs": args.epochs, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {eval_config_file}") + @task -def gpqa_main() -> Task: +def gpqa_main(epochs: int = DEFAULT_EPOCHS) -> Task: return Task( dataset=hf_dataset( path='Idavidrein/gpqa', @@ -124,7 +233,7 @@ def gpqa_main() -> Task: multiple_choice(cot=True), ], scorer=choice(), - epochs=DEFAULT_EPOCHS, + epochs=epochs, ) @@ -169,7 +278,12 @@ def model_type(args) -> str: def template_kwargs(args) -> dict: model_type_str = model_type(args) if model_type_str == 'qwen': - return {} + # Use simple template for Qwen base models (no chat tokens) + if is_base_model(args.model_path): + template = 'qwen_base.jinja' + print(f"Using qwen_base.jinja template for base model") + else: + return {} # Use default HF template for instruct models elif model_type_str == 'llama': template = 'llama3.jinja' elif model_type_str == 'gemma': diff --git a/src/eval/tasks/gsm8k/evaluate.py b/src/eval/tasks/gsm8k/evaluate.py index 0c2cf35..8542289 100644 --- a/src/eval/tasks/gsm8k/evaluate.py +++ b/src/eval/tasks/gsm8k/evaluate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations import os +import sys import argparse import json @@ -11,6 +12,13 @@ import inspect_evals.gsm8k # noqa: F401, E402 (registers task definitions) +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model + +# GSM8K uses 10-shot by default from inspect_evals 
+DEFAULT_FEWSHOT = 10 + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run Inspect AI eval without banners.") @@ -54,8 +62,68 @@ def parse_args() -> argparse.Namespace: type=float, default=0.3, ) + parser.add_argument( + '--fewshot', + type=str, + choices=['auto', 'always', 'never'], + default='always', + help="Few-shot mode: 'always' (default, use 10-shot), 'never' (0-shot), 'auto' (base models only)", + ) + parser.add_argument( + '--num-fewshot', + type=int, + default=None, + help="Number of few-shot examples to use (default: 10 from inspect_evals)", + ) + # Sampling parameters (Qwen recommends: temperature=0.6, top_p=0.95, top_k=20) + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6, Qwen recommends not using 0)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p (nucleus) sampling (default: 0.95)", + ) + parser.add_argument( + '--top-k', + type=int, + default=20, + help="Top-k sampling (default: 20)", + ) + parser.add_argument( + '--use-base-template', + action='store_true', + default=True, + help="Use simple template for base models (default: True)", + ) + parser.add_argument( + '--no-use-base-template', + action='store_false', + dest='use_base_template', + help="Disable simple template for base models (use HF default)", + ) + parser.add_argument( + '--epochs', + type=int, + default=5, + help="Number of times to run each sample (default: 5 for pass@k style evaluation)", + ) return parser.parse_args() +def should_use_fewshot(args) -> bool: + """Determine if few-shot examples should be used.""" + if args.fewshot == 'always': + return True + elif args.fewshot == 'never': + return False + else: # auto + return is_base_model(args.model_path) + + def main() -> None: args = parse_args() @@ -65,12 +133,25 @@ def main() -> None: if (args.limit is not None) and (args.limit != -1): other_kwargs["limit"] = args.limit - task = 
"inspect_evals/gsm8k" + # Determine few-shot settings + use_fewshot = should_use_fewshot(args) + num_fewshot = args.num_fewshot if args.num_fewshot is not None else DEFAULT_FEWSHOT + + if use_fewshot: + print(f"Using {num_fewshot}-shot evaluation for model: {args.model_path}") + task = inspect_evals.gsm8k.gsm8k(fewshot=num_fewshot) + else: + print(f"Using zero-shot evaluation for model: {args.model_path}") + task = inspect_evals.gsm8k.gsm8k(fewshot=0) + model_args = { 'gpu_memory_utilization': args.gpu_memory_utilization, } model_args.update(template_kwargs(args)) + print(f"Sampling params: temperature={args.temperature}, top_p={args.top_p}, top_k={args.top_k}") + print(f"Epochs: {args.epochs}") + eval_out = inspect_eval( task, model=f"vllm/{args.model_path}", @@ -82,6 +163,10 @@ def main() -> None: attempt_timeout=18000000, max_tokens=args.max_tokens, max_connections=args.max_connections, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + epochs=args.epochs, **other_kwargs, ) @@ -95,6 +180,27 @@ def main() -> None: with open(args.json_output_file, 'w') as f: json.dump(metrics, f, indent=2) + # Save few-shot and sampling configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + fewshot_info_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "gsm8k", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "fewshot_mode": args.fewshot, + "fewshot_used": use_fewshot, + "num_fewshot_examples": num_fewshot if use_fewshot else 0, + "sampling": { + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + }, + "epochs": args.epochs, + } + with open(fewshot_info_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {fewshot_info_file}") + def model_type(args) -> str: if 'qwen' in args.model_path.lower(): return 'qwen' @@ -121,7 +227,12 @@ def model_type(args) -> str: def template_kwargs(args) -> 
dict: model_type_str = model_type(args) if model_type_str == 'qwen': - return {} + # Use simple template for Qwen base models (no chat tokens) + if is_base_model(args.model_path): + template = 'qwen_base.jinja' + print(f"Using qwen_base.jinja template for base model") + else: + return {} # Use default HF template for instruct models elif model_type_str == 'llama': template = 'llama3.jinja' elif model_type_str == 'gemma': diff --git a/src/eval/tasks/healthbench/evaluate.py b/src/eval/tasks/healthbench/evaluate.py index a889a5c..5f6466f 100644 --- a/src/eval/tasks/healthbench/evaluate.py +++ b/src/eval/tasks/healthbench/evaluate.py @@ -32,6 +32,11 @@ from evaluation_code.scoring import aggregate_scores, BenchmarkResult from evaluation_code.text_utils import limit_repetitions +# Add parent directory to path for imports +import sys +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model, get_fewshot_prompt, should_use_fewshot + # Constants API_MAX_RETRY = 3 @@ -188,11 +193,14 @@ def template_args(args) -> list: def generate_answers( args, - examples: List[HealthBenchExample] + examples: List[HealthBenchExample], + fewshot_system_msg: Optional[str] = None ) -> List[str]: """Generate model responses for all examples.""" server = VLLMServer(args, args.model_path) print(f"[generate] Starting vLLM server for model {args.model_path}") + if fewshot_system_msg: + print(f"[generate] Using few-shot examples in system message") try: port = server.start() @@ -208,11 +216,17 @@ def generate_answers( for example in tqdm(examples, desc="Generating answers"): # Build messages from conversation messages = example.conversation.copy() - + + # Prepend few-shot system message if provided + if fewshot_system_msg: + messages = [{"role": "system", "content": fewshot_system_msg}] + messages + payload = { "model": args.model_path, "messages": messages, "max_tokens": args.max_new_tokens, + "temperature": args.temperature, + 
"top_p": args.top_p, } answer_text: Optional[str] = None @@ -310,6 +324,37 @@ def main(): action='store_true', help="Store model answers to disk (default: off)." ) + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p sampling (default: 0.95)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Ignored for this benchmark (kept for compatibility with run_baseline.sh).", + ) + parser.add_argument( + '--fewshot', + type=str, + choices=['auto', 'always', 'never'], + default='auto', + help="Few-shot mode: 'auto' (base models only), 'always', or 'never'", + ) + parser.add_argument( + '--num-fewshot', + type=int, + default=None, + help="Number of few-shot examples to use (default: all available)", + ) args = parser.parse_args() model_alias = _model_alias(args.model_path) @@ -326,8 +371,27 @@ def main(): if args.limit != -1: examples = examples[: args.limit] + # Determine whether to use few-shot + use_fewshot = should_use_fewshot(args.fewshot, args.model_path) + fewshot_system_msg = None + + if use_fewshot: + fewshot_prompt = get_fewshot_prompt("healthbench", args.num_fewshot) + if fewshot_prompt: + print(f"[fewshot] Using few-shot examples for model: {args.model_path}") + fewshot_system_msg = ( + "Here are some examples of high-quality medical conversations:\n" + f"{fewshot_prompt}\n" + "Please provide helpful, accurate, and empathetic responses to health-related questions. " + "Be thorough but concise, and always recommend consulting a healthcare professional for serious concerns." 
+ ) + else: + print(f"[fewshot] No few-shot examples found for healthbench") + else: + print(f"[fewshot] Using zero-shot evaluation for model: {args.model_path}") + # Generate answers - responses = generate_answers(args, examples) + responses = generate_answers(args, examples, fewshot_system_msg) print(f"[generate] Generated {len(responses)} responses") # Save model outputs if requested @@ -392,6 +456,25 @@ def update_progress(completed, total): json.dump(metrics, f, indent=2) print(f"\n[done] Metrics saved to {args.json_output_file}") + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "healthbench", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "fewshot_mode": args.fewshot, + "fewshot_used": use_fewshot, + "num_fewshot_examples": args.num_fewshot if args.num_fewshot else "all (3)", + "max_new_tokens": args.max_new_tokens, + "temperature": args.temperature, + "top_p": args.top_p, + "limit": args.limit, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {eval_config_file}") + if __name__ == "__main__": main() diff --git a/src/eval/tasks/humaneval/evaluate.py b/src/eval/tasks/humaneval/evaluate.py index cb2cc05..b2c5881 100644 --- a/src/eval/tasks/humaneval/evaluate.py +++ b/src/eval/tasks/humaneval/evaluate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations import os +import sys import argparse import json @@ -11,6 +12,10 @@ import inspect_evals.humaneval # noqa: F401, E402 (registers task definitions) +# Add parent directory to path for imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +from utils.fewshot_loader import is_base_model + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run Inspect AI eval without banners.") 
@@ -54,6 +59,31 @@ def parse_args() -> argparse.Namespace: type=int, default=4000, ) + # Sampling parameters (Qwen recommends: temperature=0.6, top_p=0.95, top_k=20) + parser.add_argument( + '--temperature', + type=float, + default=0.6, + help="Sampling temperature (default: 0.6)", + ) + parser.add_argument( + '--top-p', + type=float, + default=0.95, + help="Top-p (nucleus) sampling (default: 0.95)", + ) + parser.add_argument( + '--top-k', + type=int, + default=20, + help="Top-k sampling (default: 20)", + ) + parser.add_argument( + '--epochs', + type=int, + default=1, + help="Number of times to run each sample (default: 1)", + ) return parser.parse_args() def main() -> None: @@ -73,6 +103,9 @@ def main() -> None: } model_args.update(template_kwargs(args)) + print(f"Sampling params: temperature={args.temperature}, top_p={args.top_p}, top_k={args.top_k}") + print(f"Epochs: {args.epochs}") + eval_out = inspect_eval( task, model=f"vllm/{args.model_path}", @@ -84,6 +117,10 @@ def main() -> None: attempt_timeout=18000000, max_tokens=args.max_tokens, max_connections=args.max_connections, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + epochs=args.epochs, **other_kwargs, ) @@ -97,6 +134,24 @@ def main() -> None: with open(args.json_output_file, 'w') as f: json.dump(metrics, f, indent=2) + # Save eval configuration info alongside metrics + result_dir = os.path.dirname(args.json_output_file) + eval_config_file = os.path.join(result_dir, 'eval_config.json') + eval_config = { + "benchmark": "humaneval", + "model_path": args.model_path, + "is_base_model": is_base_model(args.model_path), + "sampling": { + "temperature": args.temperature, + "top_p": args.top_p, + "top_k": args.top_k, + }, + "epochs": args.epochs, + } + with open(eval_config_file, 'w') as f: + json.dump(eval_config, f, indent=2) + print(f"Eval config saved to: {eval_config_file}") + def model_type(args) -> str: if 'qwen' in args.model_path.lower(): return 'qwen' @@ -123,7 +178,12 @@ def 
model_type(args) -> str: def template_kwargs(args) -> dict: model_type_str = model_type(args) if model_type_str == 'qwen': - return {} + # Use simple template for Qwen base models (no chat tokens) + if is_base_model(args.model_path): + template = 'qwen_base.jinja' + print(f"Using qwen_base.jinja template for base model") + else: + return {} # Use default HF template for instruct models elif model_type_str == 'llama': template = 'llama3.jinja' elif model_type_str == 'gemma': diff --git a/src/eval/templates/qwen_base.jinja b/src/eval/templates/qwen_base.jinja new file mode 100644 index 0000000..2a1b105 --- /dev/null +++ b/src/eval/templates/qwen_base.jinja @@ -0,0 +1,14 @@ +{# Simple template for Qwen base models - no chat tokens #} +{%- for message in messages %} +{%- if message.role == 'system' %} +{{ message.content }} + +{% elif message.role == 'user' %} +{{ message.content }} + +{% elif message.role == 'assistant' %} +{{ message.content }} +{% endif %} +{%- endfor %} +{%- if add_generation_prompt %} +{% endif %} diff --git a/src/eval/utils/__init__.py b/src/eval/utils/__init__.py new file mode 100644 index 0000000..48d0ddd --- /dev/null +++ b/src/eval/utils/__init__.py @@ -0,0 +1,17 @@ +from .fewshot_loader import ( + is_base_model, + should_use_fewshot, + load_fewshot_examples, + get_fewshot_prompt, + format_math_fewshot_prompt, + format_mcq_fewshot_prompt, +) + +__all__ = [ + "is_base_model", + "should_use_fewshot", + "load_fewshot_examples", + "get_fewshot_prompt", + "format_math_fewshot_prompt", + "format_mcq_fewshot_prompt", +] diff --git a/src/eval/utils/fewshot_loader.py b/src/eval/utils/fewshot_loader.py new file mode 100644 index 0000000..c386090 --- /dev/null +++ b/src/eval/utils/fewshot_loader.py @@ -0,0 +1,249 @@ +""" +1. Load few-shot examples from JSON files +2. Detect if a model is a base model (pre-trained) vs instruction-tuned +3. 
# Format few-shot examples into prompts for different benchmark types

import json
import os
from pathlib import Path
from typing import Optional, List, Dict, Any

# Directory containing few-shot example JSON files
FEWSHOT_DIR = Path(__file__).parent.parent / "fewshot_examples"

# Substrings (matched against the lower-cased model path) that mark an
# instruction-tuned checkpoint. These are checked before the base indicators.
_INSTRUCT_INDICATORS = (
    '-it', '-instruct', '-chat', '-sft',
    '/instruct', '/chat', '-rlhf', '-dpo',
)

# Substrings (matched against the lower-cased model path) that mark a base
# (pre-trained) checkpoint.
_BASE_INDICATORS = ('-pt', '-base', '_base', 'base-', '/base')

# Width, in characters, of the horizontal rule placed around examples.
_RULE_WIDTH = 50


def is_base_model(model_path: str) -> bool:
    """
    Detect if a model is a base (pre-trained) model vs instruction-tuned.

    Base models benefit from few-shot examples since they haven't been
    trained to follow instructions in a 0-shot format.

    The check is a case-insensitive substring match. Instruction-tuned
    indicators take precedence over base indicators, and a path with no
    indicator at all is treated as instruction-tuned.

    Args:
        model_path: Path or HuggingFace model identifier

    Returns:
        True if the model appears to be a base model, False if instruction-tuned
    """
    lowered = model_path.lower()

    # An instruct marker wins even when a base marker is also present.
    if any(tag in lowered for tag in _INSTRUCT_INDICATORS):
        return False

    # No base marker either -> assume an instruct model.
    return any(tag in lowered for tag in _BASE_INDICATORS)


def should_use_fewshot(fewshot_mode: str, model_path: str) -> bool:
    """
    Determine if few-shot examples should be used based on mode and model type.

    Args:
        fewshot_mode: 'auto', 'always', or 'never'. Any other value is
            handled like 'auto'.
        model_path: Path or HuggingFace model identifier

    Returns:
        True if few-shot should be used, False otherwise
    """
    if fewshot_mode == 'always':
        return True
    if fewshot_mode == 'never':
        return False
    # auto: decide from the model name
    return is_base_model(model_path)


def load_fewshot_examples(benchmark: str) -> Dict[str, Any]:
    """
    Load few-shot examples for a specific benchmark.

    Args:
        benchmark: Name of the benchmark (e.g., 'aime2025', 'gsm8k')

    Returns:
        The parsed contents of ``FEWSHOT_DIR/<benchmark>.json``, or
        ``{"benchmark": benchmark, "examples": []}`` when no such file exists.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    filepath = FEWSHOT_DIR / f"{benchmark}.json"

    # Open first and handle absence, rather than exists()-then-open.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        return {"benchmark": benchmark, "examples": []}


def _select_examples(
    examples: List[Dict[str, Any]],
    num_examples: Optional[int]
) -> List[Dict[str, Any]]:
    """Return the first ``num_examples`` examples (all of them when None)."""
    if num_examples is None:
        return examples
    # Clamp so a negative count selects nothing instead of dropping the tail.
    return examples[:max(num_examples, 0)]


def _frame_examples(parts: List[str], rule_char: str) -> str:
    """Join formatted examples, putting a rule before, between and after them."""
    if not parts:
        return ""
    # Build the whole separator first: calling .join on only the trailing
    # "\n\n" literal would glue the opening rule onto the first example.
    separator = "\n\n" + rule_char * _RULE_WIDTH + "\n\n"
    return separator + separator.join(parts) + separator


def format_math_fewshot_prompt(
    examples: List[Dict[str, str]],
    num_examples: Optional[int] = None
) -> str:
    """
    Format few-shot examples for math reasoning tasks (AIME, GSM8K).

    Args:
        examples: List of example dicts with 'problem', 'reasoning', 'answer' keys
        num_examples: Number of examples to include (None = all)

    Returns:
        Formatted string with few-shot examples framed and separated by a
        rule of '=' characters, or an empty string if no example is selected

    Raises:
        KeyError: If an example lacks one of the required keys.
    """
    parts = [
        f"Example {i}:\n"
        f"Problem: {ex['problem']}\n\n"
        f"Solution:\n{ex['reasoning']}\n\n"
        f"ANSWER: {ex['answer']}"
        for i, ex in enumerate(_select_examples(examples, num_examples), 1)
    ]
    return _frame_examples(parts, "=")


def format_mcq_fewshot_prompt(
    examples: List[Dict[str, Any]],
    num_examples: Optional[int] = None
) -> str:
    """
    Format few-shot examples for multiple choice tasks (GPQA).

    Args:
        examples: List of example dicts with 'question', 'choices', 'reasoning', 'answer' keys
        num_examples: Number of examples to include (None = all)

    Returns:
        Formatted string with few-shot examples framed and separated by a
        rule of '-' characters, or an empty string if no example is selected

    Raises:
        KeyError: If an example lacks one of the required keys.
    """
    parts = []

    for i, ex in enumerate(_select_examples(examples, num_examples), 1):
        # Choices are lettered (A), (B), ... in list order.
        choice_lines = "".join(
            f"({chr(ord('A') + j)}) {choice}\n"
            for j, choice in enumerate(ex['choices'])
        )
        parts.append(
            f"Example {i}:\n"
            f"Question: {ex['question']}\n"
            f"{choice_lines}"
            f"\nReasoning: {ex['reasoning']}\n"
            f"Answer: {ex['answer']}"
        )

    return _frame_examples(parts, "-")


def format_conversation_fewshot_prompt(
    examples: List[Dict[str, Any]],
    num_examples: Optional[int] = None
) -> str:
    """
    Format few-shot examples for conversational tasks (HealthBench).

    Args:
        examples: List of example dicts with 'conversation' (list of messages) and 'response' keys
        num_examples: Number of examples to include (None = all)

    Returns:
        Formatted string with few-shot examples framed and separated by a
        rule of '=' characters, or an empty string if no example is selected

    Raises:
        KeyError: If an example lacks the 'response' key.
    """
    parts = []

    for i, ex in enumerate(_select_examples(examples, num_examples), 1):
        # Conversation history; a message without a role is shown as "User"
        # and a message without content contributes an empty turn.
        history = "".join(
            f"{msg.get('role', 'user').capitalize()}: {msg.get('content', '')}\n\n"
            for msg in ex.get('conversation', [])
        )
        # The ideal response closes the example.
        parts.append(f"Example {i}:\n{history}Assistant: {ex['response']}")

    return _frame_examples(parts, "=")


def format_writing_fewshot_prompt(
    examples: List[Dict[str, Any]],
    num_examples: Optional[int] = None
) -> str:
    """
    Format few-shot examples for writing/reasoning tasks (ArenaHard).

    Args:
        examples: List of example dicts with 'prompt' and 'response' keys
        num_examples: Number of examples to include (None = all)

    Returns:
        Formatted string with few-shot examples framed and separated by a
        rule of '=' characters, or an empty string if no example is selected

    Raises:
        KeyError: If an example lacks one of the required keys.
    """
    parts = [
        f"Example {i}:\n"
        f"User: {ex['prompt']}\n\n"
        f"Assistant: {ex['response']}"
        for i, ex in enumerate(_select_examples(examples, num_examples), 1)
    ]
    return _frame_examples(parts, "=")


# Maps the "format" field of a few-shot JSON file to its formatter.
_FORMATTERS = {
    "math_reasoning": format_math_fewshot_prompt,
    "multiple_choice": format_mcq_fewshot_prompt,
    "conversation": format_conversation_fewshot_prompt,
    "writing": format_writing_fewshot_prompt,
}


def get_fewshot_prompt(
    benchmark: str,
    num_examples: Optional[int] = None
) -> str:
    """
    Get formatted few-shot prompt for a benchmark.

    Args:
        benchmark: Name of the benchmark
        num_examples: Number of examples to include (None = all available)

    Returns:
        Formatted few-shot prompt string, or empty string if no examples
    """
    data = load_fewshot_examples(benchmark)
    examples = data.get("examples", [])

    if not examples:
        return ""

    format_type = data.get("format", "math_reasoning")

    # A missing, unknown or non-string format falls back to math reasoning.
    formatter = format_math_fewshot_prompt
    if isinstance(format_type, str):
        formatter = _FORMATTERS.get(format_type, format_math_fewshot_prompt)

    return formatter(examples, num_examples)