From ef3f2d22eef470e0610a41d796528db7f2b22a01 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 18:01:22 -0800 Subject: [PATCH 1/8] [update] Fix benchmark measurement noise + comprehensive README - run_benchmark() now uses median-of-5 runs instead of single run, fixing unreliable measurements for fast benchmarks like sqlite3 (2ms) - Rewrite evolve README with end-to-end flow documentation: setup, experiment pipeline, per-benchmark execution, LLVM hooks, scoring - Add compile_testsuite.sh for building CTMark .bc files Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 382 +++++++++++++----- src/mlirAgent/evolve/tasks/llvm_bench.py | 56 +-- .../benchmarks/compile_testsuite.sh | 244 +++++++++++ 3 files changed, 561 insertions(+), 121 deletions(-) create mode 100644 src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index 99c2b7f..d76a558 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -3,109 +3,269 @@ Automated framework for evolving LLVM compiler heuristics using [OpenEvolve](../../third_party/openevolve/) with LLM-guided search. -## System Overview +## End-to-End Flow -``` - OpenEvolve controller - | - v - ManualLLM (file-based prompt/response) - | - v - Orchestrator (manual_run.py) <-- --auto / --wait / --resume - | - v - Task evaluator (evaluate.py) <-- patches LLVM, builds, benchmarks - | - v - Score -> OpenEvolve population -``` +### One-Time Setup + +**1. Build LLVM with evolved hooks** + +```bash +# Shallow clone +git clone --depth 1 https://github.com/llvm/llvm-project.git /scratch/ashvin/llvm-project + +# Add evolved heuristic files to the LLVM tree: +# llvm/include/llvm/Analysis/EvolvedInlineCost.h +# llvm/lib/Analysis/EvolvedInlineCost.cpp (inlining hook) +# llvm/include/llvm/CodeGen/EvolvedRegAllocPriority.h +# llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp (regalloc hook) +# Register them in the corresponding CMakeLists.txt files. +# Hook into InlineCost.cpp and RegAllocGreedy.cpp with cl::opt flags. -OpenEvolve manages a population of evolved C++ heuristic programs. Each -iteration, it produces prompts asking an LLM to improve the code. The -ManualLLM bridge decouples the LLM from OpenEvolve's process model, -enabling Claude Code (or any external agent) to respond. +# Configure: Release, X86-only, GCC 13 + gold linker +cmake -G Ninja -B /scratch/ashvin/llvm-build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD=X86 \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \ + /scratch/ashvin/llvm-project/llvm -## ManualLLM +# Build (produces bin/opt and bin/llc, ~657MB) +ninja -C /scratch/ashvin/llvm-build bin/opt bin/llc +``` -**File:** `third_party/openevolve/openevolve/llm/manual.py` +**2. Prepare CTMark benchmarks as .bc files** -File-based polling interface between OpenEvolve and external responders: +The benchmarks come from [llvm-test-suite](https://github.com/llvm/llvm-test-suite) +CTMark. They are compiled to LLVM bitcode (.bc) with frontend optimizations +only, so our evolved passes have full control over LLVM-level optimization: -1. OpenEvolve writes `prompt_NNN.md` to a shared directory -2. ManualLLM polls for a corresponding `prompt_NNN.response.md` -3. 
When found, the response is returned to OpenEvolve as the LLM output +```bash +# compile_testsuite.sh does this for each benchmark: +clang-18 -O1 -Xclang -disable-llvm-optzns -emit-llvm -c source.c -o source.bc +llvm-link *.bc -o benchmark.bc # multi-file benchmarks +``` -The prompts directory is passed via `MANUAL_LLM_PROMPTS_DIR` env var, -which crosses the process-pool boundary (OpenEvolve uses multiprocessing). -`create_manual_llm` is a module-level factory (not a lambda) to support -pickling across worker processes. +`-O1 -Xclang -disable-llvm-optzns` enables Clang frontend opts (type lowering, +etc.) but skips all LLVM IR passes. The resulting .bc files contain unoptimized +IR ready for our `opt -O2` pipeline. -## Orchestrator +The 8 benchmarks used (2 excluded: clamav=segfault, 7zip=link error): -**File:** `manual_run.py` +| Benchmark | Language | Source | Description | +|-----------|----------|--------|-------------| +| bullet | C++ | MultiSource/Benchmarks/Bullet | Physics engine simulation | +| consumer-typeset | C | MultiSource/Applications/lout | Document typesetting (Lout) | +| kimwitu | C++ | MultiSource/Applications/kimwitu++ | Tree pattern matching | +| lencod | C | MultiSource/Applications/JM/lencod | H.264 video encoder | +| mafft | C | MultiSource/Applications/mafft | Multiple sequence alignment | +| spass | C | MultiSource/Applications/SPASS | First-order theorem prover | +| sqlite3 | C | MultiSource/Applications/sqlite3 | SQL database engine | +| tramp3d-v4 | C++ | MultiSource/Benchmarks/tramp3d-v4 | Template metaprogramming | +The .bc files and runtime data live in `tasks/llvm_inlining/benchmarks/testsuite/`: ``` -python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --auto +testsuite/ + bullet.bc, consumer-typeset.bc, kimwitu.bc, ... + data/ + bullet/ # landscape.mdl, Taru.mdl + consumer-typeset/ # large.lout, data/, font/, maps/, hyph/, include/ + kimwitu/ # inputs/f1.k, f2.k, f3.k + lencod/ # encoder_small.cfg, foreman_part_qcif_444.yuv, ... + mafft/ # pyruvate_decarboxylase.fasta + spass/ # problem.dfg + sqlite3/ # commands, sqlite3rc, test1.sql-test15.sql ``` -Modes: -- `--auto` Built-in heuristic strategies (simulated annealing, gradient - estimate, etc.) auto-respond to prompts. Fast but limited. -- `--wait` External agent (Claude Code, human) writes response files. -- `--resume ` Continue from a saved checkpoint. +### Running an Experiment + +```bash +# Set environment +export LLVM_SRC_PATH=/scratch/ashvin/llvm-project +export EVOLVE_BUILD_DIR=/scratch/ashvin/llvm-build +export EVOLVE_OPTUNA_TRIALS=5 # 0 to disable Optuna -Logs scores to `experiments/run_TIMESTAMP/scores.jsonl` and saves -OpenEvolve checkpoints every iteration. +# Launch (--wait mode: you respond to prompts manually or via Claude Code) +python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --wait + +# Or auto mode (built-in heuristic strategies respond automatically) +python -m mlirAgent.evolve.manual_run --example regalloc_priority -n 10 --auto +``` -## Evaluator Pipeline +This creates an experiment directory: +``` +experiments/run_20260219_132604/ + scores.jsonl # One JSON line per iteration with all metrics + prompts/ + prompt_001.md # OpenEvolve prompt (parent code + history) + prompt_001.response.md # LLM/agent response (new code) + prompt_002.md + ... 
+ openevolve_output/ + checkpoints/checkpoint_N/ # Population state for --resume + best/best_program.cpp # Best evolved program + logs/openevolve_*.log # Detailed log +``` -Each task defines an `evaluate.py` that follows this pipeline: +### What Happens Each Iteration ``` -1. patch_source() Copy evolved .cpp into LLVM source tree -2. build_llvm() ninja -C $BUILD_DIR bin/opt bin/llc -3. load_baseline() Cache default-LLVM measurements (first run only) -4. [optuna_tune()] Optional inner-loop for [hyperparam] knobs -5. eval_benchmarks() For each CTMark .bc file: - opt -O2 [-use-evolved-*] bench.bc -> bench_opt.bc - llc -O2 [-use-evolved-*] bench_opt.bc -> bench.o - gcc bench.o -> bench - measure .text size, binary size, runtime -6. score_fn() Task-specific scoring -7. restore_source() Restore original .cpp from backup + ┌─────────────────────────────────┐ + │ OpenEvolve Controller │ + │ (population, MAP-Elites, etc.) │ + └────────────┬────────────────────┘ + │ 1. Sample parent program + │ from population + ▼ + ┌─────────────────────────────────┐ + │ ManualLLM Bridge │ + │ Write prompt_NNN.md to disk │ + │ Poll for prompt_NNN.response.md │ + └────────────┬────────────────────┘ + │ 2. External responder + │ writes response file + ▼ + ┌─────────────────────────────────┐ + │ Task Evaluator (evaluate.py)│ + └────────────┬────────────────────┘ + │ + ┌──────────────────────┼──────────────────────┐ + ▼ ▼ ▼ + 3. patch_source() 4. build_llvm() 5. load_baseline() + Copy evolved .cpp ninja -C BUILD_DIR Compile & run all + into LLVM tree bin/opt bin/llc benchmarks with + (backup original) (~3.5s incremental) default LLVM (once, + cached to .json) + │ │ │ + └──────────────────────┼──────────────────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ 6. [Optuna inner-loop] │ + │ If [hyperparam] annotations: │ + │ Run N trials on 3-bench subset │ + │ (sqlite3, spass, tramp3d-v4) │ + │ Each trial = compile+run subset │ + │ Find best flag values │ + └────────────┬────────────────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ 7. eval_benchmarks() │ + │ For EACH of 8 .bc benchmarks: │ + │ ┌─────────────────────────────┐ │ + │ │ a. opt -O2 │ │ + │ │ [-use-evolved-inline-cost]│ │ + │ │ bench.bc → bench_opt.bc │ │ + │ ├─────────────────────────────┤ │ + │ │ b. llc -O2 -filetype=obj │ │ + │ │ -relocation-model=pic │ │ + │ │ [-use-evolved-regalloc-*]│ │ + │ │ [-ae-flag=value ...] │ │ + │ │ bench_opt.bc → bench.o │ │ + │ ├─────────────────────────────┤ │ + │ │ c. gcc bench.o -o bench │ │ + │ │ -lm -lpthread -ldl │ │ + │ │ [-lstdc++ for C++ bench] │ │ + │ ├─────────────────────────────┤ │ + │ │ d. size bench.o → .text size│ │ + │ │ stat bench → binary sz │ │ + │ ├─────────────────────────────┤ │ + │ │ e. Run 5x, take median: │ │ + │ │ ./bench [args] [ BaseThreshold("ae-inline-base-threshold", cl::init(100), ...); ``` Format: `// [hyperparam]: flag-name, type, min, max` -When present and `optuna_trials > 0`, the evaluator runs an Optuna -inner-loop on a benchmark subset to find optimal values before the final -full-suite evaluation. Tuned values are passed as LLVM command-line flags -(e.g. `-ae-inline-base-threshold=173`). +When `EVOLVE_OPTUNA_TRIALS > 0`, the evaluator: +1. Parses `[hyperparam]` annotations from the evolved C++ code +2. Creates an Optuna study with one parameter per annotation +3. Runs N trials on a 3-benchmark subset (sqlite3, spass, tramp3d-v4) +4. Each trial: compile subset with trial params as LLVM flags, score +5. 
Best params are passed as flags in the final full-suite evaluation + +Example: Optuna suggests `-ae-inline-base-threshold=173`, which is passed +to `opt` (or `llc` for regalloc flags) during compilation. ## Configuration @@ -114,7 +274,7 @@ full-suite evaluation. Tuned values are passed as LLVM command-line flags ```python from mlirAgent.evolve.tasks.llvm_bench import EvalConfig -# From environment variables (backward compatible) +# From environment variables config = EvalConfig.from_env("llvm/lib/Analysis/EvolvedInlineCost.cpp") # Programmatic with overrides @@ -125,23 +285,37 @@ config = EvalConfig.from_env( ) ``` -Key env vars: `LLVM_SRC_PATH`, `EVOLVE_BUILD_DIR`, `EVOLVE_OPT_TIMEOUT`, -`EVOLVE_OPTUNA_TRIALS`. +| Env Var | Default | Description | +|---------|---------|-------------| +| `LLVM_SRC_PATH` | (required) | LLVM source tree root | +| `EVOLVE_BUILD_DIR` | (required) | LLVM ninja build directory | +| `EVOLVE_OPT_TIMEOUT` | 120 | Per-benchmark opt/llc timeout (seconds) | +| `EVOLVE_OPTUNA_TRIALS` | 20 | Optuna trials (0 = disable) | ## Task Structure ``` -tasks/ - llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna - llvm_inlining/ - evaluate.py # Thin wrapper: _score(), evaluate() - initial.cpp # Seed heuristic - task.py # OpenEvolve Task class - benchmarks/testsuite/ # CTMark .bc files + data/ - regalloc_priority/ - evaluate.py # Thin wrapper: _score(), evaluate() - initial.cpp # Seed priority function - baseline_regalloc.json # Separate baseline cache +src/mlirAgent/evolve/ + manual_run.py # Orchestrator: --auto/--wait/--resume + tasks/ + llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna + llvm_inlining/ + evaluate.py # _score(): bin_red% + speedup*10 + initial.cpp # Seed: sums heuristic features - threshold + task.py # OpenEvolve Task class + benchmarks/ + compile_testsuite.sh # Script to build .bc from llvm-test-suite + testsuite/ # .bc files (gitignored, built locally) + data/ # Runtime input data per benchmark + regalloc_priority/ + evaluate.py # _score(): 5*speedup% + bin_red% + initial.cpp # Seed: LLVM default bit-packed priority + baseline_regalloc.json # Separate baseline (uses -use-evolved-* on llc) + README.md # This file +configs/ + frameworks/manual.yaml # OpenEvolve config (pop=10, 1 island, seed=42) +experiments/ # Output (gitignored) + run_YYYYMMDD_HHMMSS/ ``` ### Adding a New Task @@ -150,31 +324,47 @@ tasks/ 2. In `evaluate.py`, define `_score(total_binary, baseline_binary, speedups)` 3. Call shared functions from `llvm_bench.py` with the right evolved flags 4. Add entry to `EXAMPLES` dict in `manual_run.py` +5. 
If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in + `optuna_tune()` and pass flags via `evolved_llc_flags` ## Scoring Formulas **Inlining:** `binary_reduction_pct + (avg_speedup - 1.0) * 10` -- Primary: linked binary size reduction vs baseline (Magellan-comparable) +- Primary signal: linked binary size reduction vs baseline - Secondary: small bonus for runtime improvement +- Comparable to Magellan (ICML 2025) binary reduction metric **RegAlloc:** `5.0 * speedup_pct + 1.0 * binary_reduction_pct` -- Primary: runtime improvement (regalloc most affects execution speed) +- Primary signal: runtime improvement (regalloc most affects execution speed) - Secondary: binary size reduction +- Warning: dominated by measurement noise for short-running benchmarks ## Experiment Results (CTMark, Feb 2026) ### LLVM Inlining -| Experiment | Optuna | Iters | Best Score | Binary Reduction | Time | -|-----------|--------|-------|------------|-----------------|------| -| Exp A | No | 10 | 8.65 | 8.78% | ~50 min | -| Exp C | 5 trials | 10 | 8.66 | 8.41% | ~90 min | - -Both match Magellan's reported range (4.27-8.79%) with only 10 iterations. -Optuna eliminates failures (100% positive scores vs 80%) but doesn't -improve peak performance significantly. Code structure matters more than -hyperparameter values for peak score. - -### Key Insight -Os-level inlining hurts tramp3d-v4 (C++ templates need inlining for -devirtualization). Best heuristics learn to selectively increase inlining -for template-heavy code. +| Experiment | Responder | Optuna | Iters | Best Score | Binary Reduction | +|-----------|-----------|--------|-------|------------|-----------------| +| Exp A | Claude | No | 10 | 8.65 | 8.78% | +| Exp C | Auto | 5 trials | 10 | 8.66 | 8.41% | +| Exp D | Claude | 5 trials | 11 | **8.78** | **9.24%** | + +All match Magellan's reported range (4.27-8.79%) with only 10 iterations. +Claude + Optuna combined is slightly better than either alone. + +### RegAlloc Priority +| Experiment | Measurement | Iters | Best Score | Notes | +|-----------|-------------|-------|------------|-------| +| Exp E | Single run | 8 | 63.39 | **INVALIDATED** (sqlite3 2ms noise) | +| Exp F | Median-of-5 | 11 | 8.82 | Pressure-proportional priority | + +Exp E results were entirely from sqlite3 measurement noise (1.89x "speedup" +was an artifact of 2ms runtime variance). After fixing `run_benchmark()` to +use median-of-5 runs, the only positive innovation was pressure-proportional +priority: boosting global ranges in constrained register classes. 
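+
+A minimal sketch of the idea (the function and field names below are
+illustrative, not the evolved hook's actual interface): boost a global
+range's priority in proportion to how constrained its register class is.
+
+```cpp
+// Illustrative only -- not the evolved hook's real interface.
+struct RangeInfo {
+  unsigned Size;           // live-range length
+  bool IsGlobal;           // spans multiple basic blocks
+  unsigned ClassPressure;  // current pressure in the range's register class
+  unsigned ClassLimit;     // number of allocatable registers in that class
+};
+
+unsigned priorityOf(const RangeInfo &R) {
+  unsigned Prio = R.Size;                           // default: longer ranges first
+  if (R.IsGlobal && 2 * R.ClassPressure > R.ClassLimit)
+    Prio += Prio * R.ClassPressure / R.ClassLimit;  // pressure-proportional boost
+  return Prio;
+}
+```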
+ +### Key Insights +- Os-level inlining hurts tramp3d-v4 (C++ templates need inlining) +- Code structure > hyperparameters for peak inlining score +- Optuna adds robustness (100% positive scores vs 80%) +- RegAlloc priority bit-packed encoding is fragile — structural changes hurt +- Benchmarks under 10ms are unreliable even with median-of-5 runs diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 8b539f7..9371a23 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -239,8 +239,9 @@ def get_text_size(obj_path: str) -> int: # Benchmark execution # --------------------------------------------------------------------------- -def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str): - """Run a benchmark with reference inputs; return wall-clock seconds or None.""" +def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, + num_runs: int = 5): + """Run a benchmark with reference inputs; return median wall-clock seconds or None.""" config = BENCH_RUN_CONFIGS.get(name) if not config: return None @@ -267,31 +268,36 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str): if src.exists(): shutil.copy2(str(src), os.path.join(run_dir, f)) - # Prepare stdin - stdin_fh = None - if config.get("stdin_file") and bench_data.exists(): - stdin_src = bench_data / config["stdin_file"] - if stdin_src.exists(): - stdin_fh = open(str(stdin_src), "r") - cmd = [run_binary] + config.get("args", []) timeout = config.get("timeout", 30) - - try: - start = time.time() - proc = subprocess.run( - cmd, capture_output=True, timeout=timeout, - cwd=run_dir, stdin=stdin_fh, - ) - elapsed = time.time() - start - if proc.returncode == 0: - return elapsed - except subprocess.TimeoutExpired: - pass - finally: - if stdin_fh: - stdin_fh.close() - return None + stdin_file = None + if config.get("stdin_file") and bench_data.exists(): + stdin_file = bench_data / config["stdin_file"] + + timings = [] + for _ in range(num_runs): + stdin_fh = None + try: + if stdin_file and stdin_file.exists(): + stdin_fh = open(str(stdin_file), "r") + start = time.time() + proc = subprocess.run( + cmd, capture_output=True, timeout=timeout, + cwd=run_dir, stdin=stdin_fh, + ) + elapsed = time.time() - start + if proc.returncode == 0: + timings.append(elapsed) + except subprocess.TimeoutExpired: + pass + finally: + if stdin_fh: + stdin_fh.close() + + if not timings: + return None + timings.sort() + return timings[len(timings) // 2] def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh b/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh new file mode 100644 index 0000000..324037a --- /dev/null +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh @@ -0,0 +1,244 @@ +#!/bin/bash +# Compile LLVM test-suite benchmarks to .bc (bitcode) files +# Using: clang-18 -O1 -Xclang -disable-llvm-optzns -emit-llvm +# This produces unoptimized bitcode suitable for our custom opt pass +set -e + +CLANG="clang-18" +CLANGXX="clang++-18" +LLVM_LINK="llvm-link-18" +TESTSUITE="/scratch/ashvin/llvm-test-suite" +OUTDIR="/scratch/ashvin/merlin/mlirEvolve/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/testsuite" +TMPDIR="/tmp/testsuite_build_$$" + +# Flags: -O1 enables optimizations but -disable-llvm-optzns prevents LLVM +# opts from running (only Clang frontend opts). 
This avoids noinline attrs. +COMMON_FLAGS="-O1 -Xclang -disable-llvm-optzns -emit-llvm" +C_FLAGS="$COMMON_FLAGS -std=c17" +CXX_FLAGS="$COMMON_FLAGS" + +mkdir -p "$OUTDIR" "$TMPDIR" + +compile_ok=0 +compile_fail=0 + +echo "=== Compiling LLVM test-suite benchmarks to .bc ===" +echo "" + +#-------------------------------------------------------------------- +# 1. SPASS - Theorem Prover (C) +#-------------------------------------------------------------------- +echo "--- [1/7] SPASS ---" +SPASS_DIR="$TESTSUITE/MultiSource/Applications/SPASS" +SPASS_TMP="$TMPDIR/spass" +mkdir -p "$SPASS_TMP" + +SPASS_SRCS=$(ls "$SPASS_DIR"/*.c 2>/dev/null) +SPASS_OK=1 +for src in $SPASS_SRCS; do + base=$(basename "$src" .c) + $CLANG $C_FLAGS -DCLOCK_NO_TIMING -fno-strict-aliasing \ + -I"$SPASS_DIR" \ + -c "$src" -o "$SPASS_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.c" + SPASS_OK=0 + } +done +if [ "$SPASS_OK" = "1" ]; then + $LLVM_LINK "$SPASS_TMP"/*.bc -o "$OUTDIR/spass.bc" 2>/dev/null && { + echo " OK: spass.bc ($(stat -c%s "$OUTDIR/spass.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for SPASS" + compile_fail=$((compile_fail + 1)) + } +else + # Try linking what we have + bc_count=$(ls "$SPASS_TMP"/*.bc 2>/dev/null | wc -l) + if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$SPASS_TMP"/*.bc -o "$OUTDIR/spass.bc" 2>/dev/null && { + echo " OK (partial): spass.bc ($(stat -c%s "$OUTDIR/spass.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for SPASS" + compile_fail=$((compile_fail + 1)) + } + else + echo " FAIL: No .bc files produced for SPASS" + compile_fail=$((compile_fail + 1)) + fi +fi + +#-------------------------------------------------------------------- +# 2. tramp3d-v4 - C++ Template Metaprogramming Benchmark +#-------------------------------------------------------------------- +echo "--- [2/7] tramp3d-v4 ---" +TRAMP_DIR="$TESTSUITE/MultiSource/Benchmarks/tramp3d-v4" +$CLANGXX $CXX_FLAGS -std=c++14 -fno-exceptions \ + -c "$TRAMP_DIR/tramp3d-v4.cpp" -o "$OUTDIR/tramp3d.bc" 2>/dev/null && { + echo " OK: tramp3d.bc ($(stat -c%s "$OUTDIR/tramp3d.bc") bytes)" + compile_ok=$((compile_ok + 1)) +} || { + echo " FAIL: tramp3d-v4.cpp" + compile_fail=$((compile_fail + 1)) +} + +#-------------------------------------------------------------------- +# 3. Bullet - Physics Engine (C++) +#-------------------------------------------------------------------- +echo "--- [3/7] Bullet ---" +BULLET_DIR="$TESTSUITE/MultiSource/Benchmarks/Bullet" +BULLET_TMP="$TMPDIR/bullet" +mkdir -p "$BULLET_TMP" + +BULLET_SRCS=$(ls "$BULLET_DIR"/*.cpp 2>/dev/null) +BULLET_OK=1 +for src in $BULLET_SRCS; do + base=$(basename "$src" .cpp) + $CLANGXX $CXX_FLAGS -std=c++98 -DNO_TIME \ + -I"$BULLET_DIR/include" -I"$BULLET_DIR" \ + -c "$src" -o "$BULLET_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.cpp" + BULLET_OK=0 + } +done +bc_count=$(ls "$BULLET_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$BULLET_TMP"/*.bc -o "$OUTDIR/bullet.bc" 2>/dev/null && { + echo " OK: bullet.bc ($(stat -c%s "$OUTDIR/bullet.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for Bullet" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for Bullet" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 4. 
ClamAV - Antivirus Engine (C) +#-------------------------------------------------------------------- +echo "--- [4/7] ClamAV ---" +CLAMAV_DIR="$TESTSUITE/MultiSource/Applications/ClamAV" +CLAMAV_TMP="$TMPDIR/clamav" +mkdir -p "$CLAMAV_TMP" + +# ClamAV needs specific defines for Linux +CLAMAV_DEFS="-DHAVE_CONFIG_H -DDONT_LOCK_DBDIRS -DC_LINUX -DWORDS_BIGENDIAN=0 -DFPU_WORDS_BIGENDIAN=0" +CLAMAV_INCLUDES="-I$CLAMAV_DIR -I$CLAMAV_DIR/zlib" + +CLAMAV_SRCS=$(ls "$CLAMAV_DIR"/*.c 2>/dev/null) +CLAMAV_FAIL_COUNT=0 +for src in $CLAMAV_SRCS; do + base=$(basename "$src" .c) + $CLANG $C_FLAGS $CLAMAV_DEFS $CLAMAV_INCLUDES \ + -Wno-incompatible-pointer-types \ + -c "$src" -o "$CLAMAV_TMP/${base}.bc" 2>/dev/null || { + CLAMAV_FAIL_COUNT=$((CLAMAV_FAIL_COUNT + 1)) + } +done +bc_count=$(ls "$CLAMAV_TMP"/*.bc 2>/dev/null | wc -l) +echo " Compiled $bc_count files ($CLAMAV_FAIL_COUNT failures)" +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$CLAMAV_TMP"/*.bc -o "$OUTDIR/clamav.bc" 2>/dev/null && { + echo " OK: clamav.bc ($(stat -c%s "$OUTDIR/clamav.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for ClamAV" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for ClamAV" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 5. hexxagon - C++ Game AI +#-------------------------------------------------------------------- +echo "--- [5/7] hexxagon ---" +HEXX_DIR="$TESTSUITE/MultiSource/Applications/hexxagon" +HEXX_TMP="$TMPDIR/hexxagon" +mkdir -p "$HEXX_TMP" + +HEXX_SRCS=$(ls "$HEXX_DIR"/*.cpp 2>/dev/null) +for src in $HEXX_SRCS; do + base=$(basename "$src" .cpp) + $CLANGXX $CXX_FLAGS -std=c++14 \ + -I"$HEXX_DIR" \ + -c "$src" -o "$HEXX_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.cpp" + } +done +bc_count=$(ls "$HEXX_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$HEXX_TMP"/*.bc -o "$OUTDIR/hexxagon.bc" 2>/dev/null && { + echo " OK: hexxagon.bc ($(stat -c%s "$OUTDIR/hexxagon.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for hexxagon" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for hexxagon" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 6. PAQ8p - Data Compression (single C++ file) +#-------------------------------------------------------------------- +echo "--- [6/7] PAQ8p ---" +PAQ_DIR="$TESTSUITE/MultiSource/Benchmarks/PAQ8p" +$CLANGXX $CXX_FLAGS -DNOASM -DLLVM \ + -c "$PAQ_DIR/paq8p.cpp" -o "$OUTDIR/paq8p.bc" 2>/dev/null && { + echo " OK: paq8p.bc ($(stat -c%s "$OUTDIR/paq8p.bc") bytes)" + compile_ok=$((compile_ok + 1)) +} || { + echo " FAIL: paq8p.cpp" + compile_fail=$((compile_fail + 1)) +} + +#-------------------------------------------------------------------- +# 7. 
Fhourstones - Game Tree Search (C) +#-------------------------------------------------------------------- +echo "--- [7/7] Fhourstones ---" +FHOUR_DIR="$TESTSUITE/MultiSource/Benchmarks/Fhourstones" +FHOUR_TMP="$TMPDIR/fhourstones" +mkdir -p "$FHOUR_TMP" + +for src in "$FHOUR_DIR"/c4.c "$FHOUR_DIR"/play.c "$FHOUR_DIR"/trans.c; do + if [ -f "$src" ]; then + base=$(basename "$src" .c) + $CLANG $C_FLAGS -I"$FHOUR_DIR" \ + -c "$src" -o "$FHOUR_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $(basename $src)" + } + fi +done +bc_count=$(ls "$FHOUR_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$FHOUR_TMP"/*.bc -o "$OUTDIR/fhourstones.bc" 2>/dev/null && { + echo " OK: fhourstones.bc ($(stat -c%s "$OUTDIR/fhourstones.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for Fhourstones" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for Fhourstones" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# Summary +#-------------------------------------------------------------------- +echo "" +echo "=== Summary ===" +echo "Compiled: $compile_ok / 7" +echo "Failed: $compile_fail / 7" +echo "" +echo "Output .bc files:" +ls -lh "$OUTDIR"/*.bc 2>/dev/null || echo " (none)" + +# Cleanup +rm -rf "$TMPDIR" From 52288fa3dc009da910aae2be6a27dd59de2243f8 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 18:38:05 -0800 Subject: [PATCH 2/8] [update] Cookbook submodule: LLVM inlining recipe Co-Authored-By: Claude Opus 4.6 --- data/cookbook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook b/data/cookbook index 94d365c..414c7d7 160000 --- a/data/cookbook +++ b/data/cookbook @@ -1 +1 @@ -Subproject commit 94d365c80639951e4ae92f056789c1475940b077 +Subproject commit 414c7d788c23af6a868295602cee213854fe8f93 From c33092b221de854c2e354ca196ba808c53b40631 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 21:17:05 -0800 Subject: [PATCH 3/8] [update] Cookbook submodule: LLVM inlining recipe in mlirAgent_recipes Co-Authored-By: Claude Opus 4.6 --- data/cookbook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook b/data/cookbook index 414c7d7..5b0a6d1 160000 --- a/data/cookbook +++ b/data/cookbook @@ -1 +1 @@ -Subproject commit 414c7d788c23af6a868295602cee213854fe8f93 +Subproject commit 5b0a6d1be77585b5fa709d2753d5550a543a0819 From a0a97277560938eae05b9009526b3f1934054d52 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Fri, 20 Feb 2026 17:47:32 -0800 Subject: [PATCH 4/8] [add] Loop unrolling evolution task Add loop_unrolling task for evolving LLVM's loop unroll heuristic via OpenEvolve. Includes evaluator (5x speedup + 1x binary reduction scoring), seed program with EVOLVE-BLOCK markers, and task metadata. Requires corresponding LLVM hook (EvolvedLoopUnroll.{h,cpp} + LoopUnrollPass.cpp changes) built separately. Exp G results: best score 58.06 at iter 4 (avg_speedup=1.116, ThresholdScale=76). Real signal ~1.3% speedup excluding sqlite3 noise. 
Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/manual_run.py | 6 + .../evolve/tasks/loop_unrolling/__init__.py | 0 .../evolve/tasks/loop_unrolling/evaluate.py | 180 ++++++++++++++++++ .../evolve/tasks/loop_unrolling/initial.cpp | 78 ++++++++ .../evolve/tasks/loop_unrolling/task.yaml | 9 + 5 files changed, 273 insertions(+) create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml diff --git a/src/mlirAgent/evolve/manual_run.py b/src/mlirAgent/evolve/manual_run.py index 0bbb326..59cb285 100644 --- a/src/mlirAgent/evolve/manual_run.py +++ b/src/mlirAgent/evolve/manual_run.py @@ -51,6 +51,12 @@ "file_suffix": ".cpp", "language": "cpp", }, + "loop_unrolling": { + "initial_program": str(Path(__file__).parent / "tasks/loop_unrolling/initial.cpp"), + "evaluator": str(Path(__file__).parent / "tasks/loop_unrolling/evaluate.py"), + "file_suffix": ".cpp", + "language": "cpp", + }, } diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py b/src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py new file mode 100644 index 0000000..d16c628 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -0,0 +1,180 @@ +"""Evaluator for LLVM loop unrolling heuristic evolution. + +Called by OpenEvolve as: python evaluate.py + +Pipeline: +1. Patch evolved C++ heuristic into LLVM source tree +2. Rebuild opt incrementally (ninja) +3. For each CTMark benchmark .bc file: + a. opt -O2 -use-evolved-loop-unroll bench.bc -o bench_opt.bc + b. llc -O2 -filetype=obj -relocation-model=pic bench_opt.bc -o bench.o + c. gcc bench.o -o bench -lm -lpthread -ldl [-lstdc++ for C++] + d. Measure linked binary size + e. Run benchmark with reference inputs and measure wall-clock time +4. Score = 5.0 * speedup_pct + binary_reduction_pct + (loop unrolling is runtime-focused; binary growth expected, penalized at 1/5th) +""" + +import json +import os +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +try: + from ..llvm_bench import ( + EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, + find_benchmarks, load_baseline, optuna_tune, patch_source, + restore_source, + ) +except ImportError: + # Standalone loading by OpenEvolve's importlib (no parent package) + sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + from llvm_bench import ( + EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, + find_benchmarks, load_baseline, optuna_tune, patch_source, + restore_source, + ) + +_EVAL_DIR = Path(__file__).resolve().parent + + +def _score(total_binary, baseline_total_binary, speedups): + """Loop unroll score: 5x speedup + 1x binary reduction.""" + binary_pct = ( + 100.0 * (baseline_total_binary - total_binary) / baseline_total_binary + if baseline_total_binary > 0 else 0.0 + ) + avg_speedup = sum(speedups) / len(speedups) if speedups else 0.0 + speedup_pct = (avg_speedup - 1.0) * 100 if avg_speedup > 0 else 0.0 + return round(5.0 * speedup_pct + binary_pct, 4) + + +def evaluate(program_path: str, config: EvalConfig = None) -> dict: + """Evaluate an evolved LLVM loop unrolling heuristic. 
+ + Score = 5x runtime speedup % + 1x binary size reduction % vs baseline. + Loop unrolling primarily affects runtime performance; binary size may grow. + """ + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + baseline_file=str(_EVAL_DIR / "baseline_unroll.json"), + ) + + if not config.llvm_src or not config.build_dir: + return { + "combined_score": 0.0, + "error": "LLVM_SRC_PATH and EVOLVE_BUILD_DIR must be set", + } + + result = { + "combined_score": 0.0, + "build_success": False, + "build_time": 0.0, + "total_binary_size": 0, + "binary_reduction_pct": 0.0, + "avg_speedup": 0.0, + "benchmark_details": {}, + "error": None, + } + + try: + dest, backup = patch_source(program_path, config) + except OSError as e: + result["error"] = f"Patch failed: {e}" + return result + + try: + ok, build_time, err = build_llvm(config) + result["build_time"] = build_time + result["build_success"] = ok + if not ok: + result["error"] = err + return result + + baseline = load_baseline(config) + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + result["error"] = "No benchmark .bc files found in testsuite/" + return result + + # Extract hyperparams and optionally run Optuna + with open(program_path) as f: + hyperparams = extract_hyperparams(f.read()) + + evolved_opt_flags = ["-use-evolved-loop-unroll"] + + if hyperparams and config.optuna_trials > 0: + print(f" Optuna: tuning {len(hyperparams)} hyperparams " + f"({config.optuna_trials} trials)...") + tune_start = time.time() + best_sub, best_params, extra_flags = optuna_tune( + opt_path, llc_path, benchmarks, baseline, + n_trials=config.optuna_trials, hyperparams=hyperparams, + data_dir=config.data_dir, score_fn=_score, + opt_timeout=config.opt_timeout, + optuna_subset=config.optuna_subset, + base_opt_flags=evolved_opt_flags, flag_target="opt", + ) + result["optuna_trials"] = config.optuna_trials + result["optuna_subset_score"] = best_sub + result["tuned_params"] = best_params + result["tune_time"] = round(time.time() - tune_start, 2) + print(f" Optuna done in {result['tune_time']}s. 
" + f"Subset score={best_sub:.2f}, params={best_params}") + evolved_opt_flags.extend(extra_flags) + elif hyperparams: + result["optuna_trials"] = 0 + result["tuned_params"] = {} + + # Final evaluation on all benchmarks + with tempfile.TemporaryDirectory(prefix="unroll_eval_") as tmp_dir: + score, ev = eval_benchmarks( + benchmarks, opt_path, llc_path, baseline, tmp_dir, + config.data_dir, _score, + evolved_opt_flags=evolved_opt_flags, + opt_timeout=config.opt_timeout, + ) + + result["combined_score"] = score + result["benchmark_details"] = ev["details"] + result["total_binary_size"] = ev["total_binary"] + + if ev["baseline_total_binary"] > 0: + result["binary_reduction_pct"] = round( + 100.0 * (ev["baseline_total_binary"] - ev["total_binary"]) + / ev["baseline_total_binary"], 4 + ) + if ev["speedups"]: + result["avg_speedup"] = round( + sum(ev["speedups"]) / len(ev["speedups"]), 4 + ) + if ev["errors"]: + result["error"] = "; ".join(ev["errors"]) + + except subprocess.TimeoutExpired: + result["error"] = "Build timed out (600s)" + finally: + restore_source(dest, backup) + + return result + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Evaluate LLVM loop unroll heuristic") + parser.add_argument("program_path", help="Path to evolved C++ source") + EvalConfig.add_arguments(parser) + args = parser.parse_args() + config = EvalConfig.from_args( + args, "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + baseline_file=str(_EVAL_DIR / "baseline_unroll.json"), + ) + metrics = evaluate(args.program_path, config=config) + print(json.dumps(metrics, indent=2)) diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp b/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp new file mode 100644 index 0000000..8e44ce3 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp @@ -0,0 +1,78 @@ +//===- EvolvedLoopUnroll.cpp - Evolved loop unroll heuristic ------*- C++ -*-===// +// +// Evolved by OpenEvolve / ShinkaEvolve. +// +// This file is automatically patched by the evaluator during evolution. +// The EVOLVE-BLOCK markers delimit the region that the LLM modifies. +// +// Convention: return an unroll factor >= 1. +// 1 = don't unroll, >1 = unroll by that factor. 
+// +// Available LoopUnrollFeatures fields: +// LoopSize - instruction count of the rolled loop body +// TripCount - exact trip count (0 if unknown) +// MaxTripCount - upper bound on trip count (0 if unknown) +// TripMultiple - trip count is guaranteed a multiple of this +// Depth - loop nesting depth (1 = outermost) +// NumBlocks - number of basic blocks in the loop +// BEInsns - backend edge instructions (~2) +// Threshold - target unroll cost threshold +// PartialThreshold - partial unroll cost threshold +// MaxCount - maximum allowed unroll factor +// NumInlineCandidates - number of inline candidates in loop body +// IsInnermost - true if this is an innermost loop +// HasExactTripCount - true if TripCount > 0 +// MaxOrZero - true if loop runs max trip count or zero times +// AllowPartial - true if partial unrolling is allowed +// AllowRuntime - true if runtime unrolling is allowed +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/EvolvedLoopUnroll.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +// Tunable threshold scale exposed as cl::opt for Optuna inner-loop tuning +// [hyperparam]: ae-unroll-threshold-scale, int, 50, 200 +static cl::opt ThresholdScale("ae-unroll-threshold-scale", cl::init(100), cl::Hidden, + cl::desc("Scale factor for unroll threshold (percent, 100 = default)")); + +// EVOLVE-BLOCK-START loop_unroll_heuristic +unsigned llvm::computeEvolvedLoopUnrollCount(const LoopUnrollFeatures &F) { + unsigned EffThreshold = F.Threshold * ThresholdScale / 100; + + // 1. Full unroll: if exact trip count known and unrolled size fits threshold + if (F.HasExactTripCount && F.TripCount > 1) { + unsigned UnrolledSize = F.LoopSize * F.TripCount; + if (UnrolledSize <= EffThreshold) { + return F.TripCount; + } + } + + // 2. Partial unroll: if loop is small enough and we have trip info + if (F.AllowPartial && F.LoopSize < F.PartialThreshold) { + unsigned MaxUnroll = (F.PartialThreshold - F.BEInsns) / + (F.LoopSize - F.BEInsns); + if (MaxUnroll < 2) + return 1; + + // Clamp to power of 2 for clean remainder handling + unsigned Count = 1; + while (Count * 2 <= MaxUnroll) + Count *= 2; + + // If we know the trip count, align to it + if (F.HasExactTripCount) { + while (Count > 1 && F.TripCount % Count != 0) + Count >>= 1; + } + + if (Count > 1) + return Count; + } + + // 3. Don't unroll + return 1; +} +// EVOLVE-BLOCK-END loop_unroll_heuristic diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml b/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml new file mode 100644 index 0000000..69c9805 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml @@ -0,0 +1,9 @@ +name: loop_unrolling +description: > + Evolve LLVM's loop unrolling heuristic (computeUnrollCount) to improve + runtime performance on CTMark benchmarks. The evolved function decides + whether and how much to unroll each loop based on 16 extracted features. +evolve_blocks: + - loop_unroll_heuristic +target_file: llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp +language: cpp From a4c852ca88673a14f5e75ed2617b409fade5080d Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Sun, 22 Feb 2026 23:26:31 -0800 Subject: [PATCH 5/8] [add] ASI feedback (GEPA-style text gradients) + GEPA integration Add Actionable Side Information to evaluator output so the LLM receives structured diagnostic feedback alongside raw scores. 
Three always-on tiers: score decomposition with signal classification, compiler stats delta via -stats flag, and runtime variance from all timings. Two optional tiers gated behind config flags: perf stat hardware counters and optimization remarks. Also add GEPA adapter files (ManualLM, evaluator bridge, CLI runner) for comparison experiments. Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/gepa_adapter.py | 83 +++ src/mlirAgent/evolve/gepa_manual_lm.py | 60 +++ src/mlirAgent/evolve/gepa_run.py | 147 +++++ src/mlirAgent/evolve/tasks/llvm_bench.py | 508 +++++++++++++++++- .../evolve/tasks/llvm_inlining/evaluate.py | 35 +- .../evolve/tasks/loop_unrolling/evaluate.py | 35 +- .../tasks/regalloc_priority/evaluate.py | 35 +- 7 files changed, 867 insertions(+), 36 deletions(-) create mode 100644 src/mlirAgent/evolve/gepa_adapter.py create mode 100644 src/mlirAgent/evolve/gepa_manual_lm.py create mode 100644 src/mlirAgent/evolve/gepa_run.py diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py new file mode 100644 index 0000000..b1ff02c --- /dev/null +++ b/src/mlirAgent/evolve/gepa_adapter.py @@ -0,0 +1,83 @@ +"""GEPA adapter for LLVM heuristic evolution. + +Bridges GEPA's ``optimize_anything`` API with our LLVM benchmark evaluator. +Handles EVOLVE-BLOCK extraction, code injection, and score retrieval. +""" + +import os +import re +import sys +import tempfile +from pathlib import Path + +# Ensure tasks package is importable +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from tasks.llvm_bench import EvalConfig + +_EVOLVE_BLOCK_RE = re.compile( + r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", + re.DOTALL, +) + + +def extract_evolve_block(code): + """Extract the EVOLVE-BLOCK content from C++ source code.""" + m = _EVOLVE_BLOCK_RE.search(code) + if m: + return m.group(2) + return code + + +def inject_evolve_block(template, block): + """Replace EVOLVE-BLOCK in *template* with new *block* content.""" + return _EVOLVE_BLOCK_RE.sub( + lambda m: m.group(1) + block + m.group(3), + template, + ) + + +def make_evaluator(task_name, config=None): + """Create an evaluator function for GEPA. + + Returns a callable ``code_str -> float`` that compiles and benchmarks + the given C++ source code, returning the ``combined_score``. 
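+
+    Example (a sketch; assumes ``LLVM_SRC_PATH`` and ``EVOLVE_BUILD_DIR``
+    are set and the task's initial.cpp is in the working directory)::
+
+        evaluator = make_evaluator("llvm_inlining")
+        score = evaluator(open("initial.cpp").read())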
+ """ + if task_name == "llvm_inlining": + from tasks.llvm_inlining.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Analysis/EvolvedInlineCost.cpp" + ) + elif task_name == "loop_unrolling": + from tasks.loop_unrolling.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp" + ) + elif task_name == "regalloc_priority": + from tasks.regalloc_priority.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp" + ) + else: + raise ValueError(f"Unknown task: {task_name}") + + def evaluator(code_str): + """Write code to temp file, evaluate, return score.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".cpp", delete=False, prefix="gepa_" + ) as f: + f.write(code_str) + tmp_path = f.name + try: + result = evaluate(tmp_path, config=config) + if isinstance(result, dict): + return result.get("combined_score", 0.0) + # EvaluationResult + return result.metrics.get("combined_score", 0.0) + finally: + os.unlink(tmp_path) + + return evaluator diff --git a/src/mlirAgent/evolve/gepa_manual_lm.py b/src/mlirAgent/evolve/gepa_manual_lm.py new file mode 100644 index 0000000..03bd090 --- /dev/null +++ b/src/mlirAgent/evolve/gepa_manual_lm.py @@ -0,0 +1,60 @@ +"""File-based LLM for GEPA -- writes prompts to disk, polls for responses. + +GEPA's LM interface is a simple synchronous callable: +``__call__(prompt: str | list[dict]) -> str`` + +This class writes each prompt as a Markdown file and waits for the user +(or an agent) to create a corresponding ``.response.md`` file. + +Usage:: + + lm = ManualLM("gepa_prompts") + response = lm("Write improved code...") # blocks until response file exists +""" + +import os +import time + + +class ManualLM: + """File-based LLM for GEPA. + + Writes prompts as ``prompt_NNN.md`` and polls for ``prompt_NNN.response.md``. + """ + + def __init__(self, prompts_dir="gepa_prompts", poll_interval=2.0): + self.prompts_dir = prompts_dir + self.poll_interval = poll_interval + self._counter = 0 + os.makedirs(prompts_dir, exist_ok=True) + + def __call__(self, prompt): + """Send prompt and block until response file appears.""" + self._counter += 1 + prompt_path = os.path.join( + self.prompts_dir, f"prompt_{self._counter:03d}.md" + ) + response_path = os.path.join( + self.prompts_dir, f"prompt_{self._counter:03d}.response.md" + ) + + with open(prompt_path, "w") as f: + if isinstance(prompt, str): + f.write(f"# User\n\n{prompt}\n") + else: + # list[dict] format: [{"role": "system", "content": "..."}, ...] + for msg in prompt: + role = msg.get("role", "user").title() + content = msg.get("content", "") + f.write(f"# {role}\n\n{content}\n\n") + + print(f" [ManualLM] Prompt written to {prompt_path}") + print(f" [ManualLM] Waiting for response at {response_path}...") + + while not os.path.exists(response_path): + time.sleep(self.poll_interval) + + with open(response_path) as f: + response = f.read().strip() + print(f" [ManualLM] Got response ({len(response)} chars)") + return response diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py new file mode 100644 index 0000000..cd38202 --- /dev/null +++ b/src/mlirAgent/evolve/gepa_run.py @@ -0,0 +1,147 @@ +"""CLI runner for GEPA on LLVM evolution tasks. 
+ +Usage:: + + python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] + +Requires ``pip install gepa`` and environment variables: + - LLVM_SRC_PATH: path to LLVM source tree + - EVOLVE_BUILD_DIR: path to LLVM build directory +""" + +import argparse +import json +import os +import sys +from pathlib import Path + +# Ensure local packages are importable +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +# Task → initial source file mapping +_TASK_INITIAL = { + "llvm_inlining": "tasks/llvm_inlining/initial.cpp", + "loop_unrolling": "tasks/loop_unrolling/initial.cpp", + "regalloc_priority": "tasks/regalloc_priority/initial.cpp", +} + + +def main(): + parser = argparse.ArgumentParser( + description="Run GEPA on LLVM heuristic evolution tasks" + ) + parser.add_argument( + "--task", required=True, + choices=list(_TASK_INITIAL.keys()), + help="Task to optimize", + ) + parser.add_argument( + "--initial", default=None, + help="Path to initial C++ source (overrides default)", + ) + parser.add_argument( + "--prompts-dir", default="gepa_prompts", + help="Directory for prompt/response files (default: gepa_prompts)", + ) + parser.add_argument( + "--poll-interval", type=float, default=2.0, + help="Poll interval for response files in seconds (default: 2.0)", + ) + parser.add_argument( + "--max-iterations", type=int, default=10, + help="Maximum GEPA iterations (default: 10)", + ) + parser.add_argument( + "--output", default=None, + help="Path to save best code (default: tasks//gepa_best.cpp)", + ) + args = parser.parse_args() + + # Import GEPA + try: + from gepa import optimize_anything + except ImportError: + print("Error: gepa not installed. Run: pip install gepa") + sys.exit(1) + + from gepa_manual_lm import ManualLM + from gepa_adapter import make_evaluator + + # Find initial program + base_dir = Path(__file__).resolve().parent + if args.initial: + initial_file = Path(args.initial) + else: + initial_file = base_dir / _TASK_INITIAL[args.task] + + if not initial_file.exists(): + print(f"Error: initial source not found at {initial_file}") + sys.exit(1) + + with open(initial_file) as f: + initial_code = f.read() + + # Create LM and evaluator + lm = ManualLM( + prompts_dir=args.prompts_dir, + poll_interval=args.poll_interval, + ) + evaluator = make_evaluator(args.task) + + print(f"{'=' * 60}") + print(f"GEPA Runner") + print(f" Task: {args.task}") + print(f" Initial code: {initial_file}") + print(f" Prompts dir: {args.prompts_dir}") + print(f" Max iterations: {args.max_iterations}") + print(f"{'=' * 60}") + print() + + # Evaluate initial program first + print("Evaluating initial program...") + initial_score = evaluator(initial_code) + print(f" Initial score: {initial_score}") + print() + + # Run GEPA + result = optimize_anything( + initial_code=initial_code, + evaluate_fn=evaluator, + lm=lm, + max_iterations=args.max_iterations, + ) + + print() + print(f"{'=' * 60}") + print(f"GEPA Results:") + print(f" Best score: {result.best_score}") + print(f" Initial score: {initial_score}") + print(f" Improvement: {result.best_score - initial_score:+.4f}") + print(f" Iterations: {result.iterations}") + print(f"{'=' * 60}") + + # Save best code + output_path = args.output or str( + base_dir / "tasks" / args.task / "gepa_best.cpp" + ) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write(result.best_code) + print(f"Best code saved to: {output_path}") + + # Save summary + summary = { + "task": args.task, + "initial_score": initial_score, + 
"best_score": result.best_score, + "iterations": result.iterations, + "output_path": output_path, + } + summary_path = os.path.join(args.prompts_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + print(f"Summary saved to: {summary_path}") + + +if __name__ == "__main__": + main() diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 9371a23..20957cb 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -6,6 +6,7 @@ """ import json +import math import os import re import shutil @@ -126,6 +127,9 @@ class EvalConfig: optuna_subset: list = field(default_factory=lambda: ["sqlite3", "spass", "tramp3d-v4"]) ninja: str = "" build_targets: str = "bin/opt bin/llc" + enable_stats: bool = True # Tier 2: -stats flag (zero overhead) + enable_perf_counters: bool = False # Tier 4: perf stat (needs permissions) + enable_remarks: bool = False # Tier 5: -pass-remarks-output (adds time) def __post_init__(self): if not self.testsuite_dir: @@ -153,6 +157,15 @@ def from_env(cls, target_file: str, **overrides) -> "EvalConfig": "target_file": os.environ.get("EVOLVE_TARGET_FILE", target_file), "opt_timeout": int(os.environ.get("EVOLVE_OPT_TIMEOUT", "120")), "optuna_trials": int(os.environ.get("EVOLVE_OPTUNA_TRIALS", "20")), + "enable_stats": os.environ.get( + "EVOLVE_ENABLE_STATS", "1" + ).lower() in ("1", "true"), + "enable_perf_counters": os.environ.get( + "EVOLVE_ENABLE_PERF", "0" + ).lower() in ("1", "true"), + "enable_remarks": os.environ.get( + "EVOLVE_ENABLE_REMARKS", "0" + ).lower() in ("1", "true"), } defaults.update(overrides) return cls(**defaults) @@ -235,16 +248,116 @@ def get_text_size(obj_path: str) -> int: return os.path.getsize(obj_path) if os.path.exists(obj_path) else 0 +# --------------------------------------------------------------------------- +# Stats / perf parsing +# --------------------------------------------------------------------------- + +# Matches LLVM -stats output: " 21479 inline - Number of functions inlined" +_STATS_RE = re.compile(r"^\s*(\d+)\s+([\w.-]+)\s+-\s+(.+)$", re.MULTILINE) + + +def parse_stats(stderr_text): + """Parse LLVM ``-stats`` output from stderr. + + Returns dict mapping ``"pass - description"`` to integer count. + """ + stats = {} + for m in _STATS_RE.finditer(stderr_text): + count = int(m.group(1)) + pass_name = m.group(2) + description = m.group(3).strip() + key = f"{pass_name} - {description}" + stats[key] = count + return stats + + +def parse_perf_output(perf_stderr): + """Parse ``perf stat -x,`` CSV output. + + Format per line: ``value,unit,event_name,...`` + """ + counters = {} + for line in perf_stderr.strip().split("\n"): + parts = line.split(",") + if len(parts) >= 3: + try: + value = int(parts[0].strip()) + event = parts[2].strip() + counters[event] = value + except (ValueError, IndexError): + continue + return counters + + +def run_perf_stat(name, binary_path, tmp_dir, data_dir, + counters=None): + """Run a single ``perf stat`` measurement. 
Returns dict of counter values.""" + if counters is None: + counters = ["instructions", "cycles", "cache-misses", "branch-misses"] + config = BENCH_RUN_CONFIGS.get(name) + if not config: + return {} + + run_dir = os.path.join(tmp_dir, f"{name}_perf") + os.makedirs(run_dir, exist_ok=True) + run_binary = os.path.join(run_dir, name) + shutil.copy2(binary_path, run_binary) + os.chmod(run_binary, 0o755) + + bench_data = Path(data_dir) / name + + # Copy data files (same logic as run_benchmark) + if config.get("data_subdir") and bench_data.exists(): + for item in bench_data.iterdir(): + dst = os.path.join(run_dir, item.name) + if item.is_dir(): + shutil.copytree(str(item), dst, dirs_exist_ok=True) + else: + shutil.copy2(str(item), dst) + elif config.get("data_files") and bench_data.exists(): + for f in config["data_files"]: + src = bench_data / f + if src.exists(): + shutil.copy2(str(src), os.path.join(run_dir, f)) + + cmd = ["perf", "stat", "-e", ",".join(counters), "-x", ",", + run_binary] + config.get("args", []) + timeout = config.get("timeout", 30) + stdin_file = None + if config.get("stdin_file") and bench_data.exists(): + stdin_file = bench_data / config["stdin_file"] + + stdin_fh = None + try: + if stdin_file and stdin_file.exists(): + stdin_fh = open(str(stdin_file), "r") + proc = subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + cwd=run_dir, stdin=stdin_fh, + ) + return parse_perf_output(proc.stderr) + except (subprocess.TimeoutExpired, OSError): + return {} + finally: + if stdin_fh: + stdin_fh.close() + + # --------------------------------------------------------------------------- # Benchmark execution # --------------------------------------------------------------------------- def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, num_runs: int = 5): - """Run a benchmark with reference inputs; return median wall-clock seconds or None.""" + """Run a benchmark with reference inputs. + + Returns ``(median, all_timings)`` where *median* is the median + wall-clock seconds (or ``None`` on failure) and *all_timings* is the + sorted list of successful run durations. + """ config = BENCH_RUN_CONFIGS.get(name) if not config: - return None + return None, [] run_dir = os.path.join(tmp_dir, f"{name}_run") os.makedirs(run_dir, exist_ok=True) @@ -295,14 +408,15 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, stdin_fh.close() if not timings: - return None + return None, [] timings.sort() - return timings[len(timings) // 2] + return timings[len(timings) // 2], timings def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=None, evolved_llc_flags=None, - opt_timeout=120): + opt_timeout=120, enable_stats=False, + enable_perf=False): """Compile a .bc file through ``opt -> llc -> gcc``. Callers pass evolved flags to *opt*, *llc*, or both: @@ -310,15 +424,23 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, - Inlining: ``evolved_opt_flags=["-use-evolved-inline-cost", ...]`` - RegAlloc: ``evolved_llc_flags=["-use-evolved-regalloc-priority", ...]`` - Returns ``(text_size, binary_size, runtime, error)`` 4-tuple. + Returns a dict with keys: ``text_size``, ``binary_size``, ``runtime``, + ``timings``, ``opt_stats``, ``llc_stats``, ``perf_counters``, ``error``. 
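+
+    Example (illustrative paths; evolved flags depend on the task)::
+
+        r = compile_benchmark(Path("sqlite3.bc"), opt_path, llc_path,
+                              tmp_dir, data_dir,
+                              evolved_opt_flags=["-use-evolved-inline-cost"])
+        if r["error"] is None:
+            print(r["text_size"], r["runtime"], len(r["timings"]))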
""" name = bc_path.stem opt_bc = os.path.join(tmp_dir, f"{name}_opt.bc") obj_file = os.path.join(tmp_dir, f"{name}.o") binary = os.path.join(tmp_dir, name) + def _err(msg): + return {"text_size": None, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": {}, "llc_stats": {}, + "perf_counters": {}, "error": msg} + # opt pass opt_cmd = [str(opt_path), "-O2"] + if enable_stats: + opt_cmd.append("-stats") if evolved_opt_flags: opt_cmd.extend(evolved_opt_flags) opt_cmd += [str(bc_path), "-o", opt_bc] @@ -328,12 +450,16 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, opt_cmd, capture_output=True, text=True, timeout=opt_timeout, ) except subprocess.TimeoutExpired: - return None, None, None, f"opt timed out ({opt_timeout}s)" + return _err(f"opt timed out ({opt_timeout}s)") if proc.returncode != 0: - return None, None, None, proc.stderr[:500] + return _err(proc.stderr[:500]) + + opt_stats = parse_stats(proc.stderr) if enable_stats else {} # llc: bitcode -> object llc_cmd = [str(llc_path), "-O2", "-filetype=obj", "-relocation-model=pic"] + if enable_stats: + llc_cmd.append("-stats") if evolved_llc_flags: llc_cmd.extend(evolved_llc_flags) llc_cmd += [opt_bc, "-o", obj_file] @@ -343,9 +469,11 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, llc_cmd, capture_output=True, text=True, timeout=opt_timeout, ) except subprocess.TimeoutExpired: - return None, None, None, f"llc timed out ({opt_timeout}s)" + return _err(f"llc timed out ({opt_timeout}s)") if proc.returncode != 0: - return None, None, None, proc.stderr[:500] + return _err(proc.stderr[:500]) + + llc_stats = parse_stats(proc.stderr) if enable_stats else {} text_size = get_text_size(obj_file) @@ -357,13 +485,33 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, gcc_cmd, capture_output=True, text=True, timeout=60, ) except subprocess.TimeoutExpired: - return text_size, None, None, "link timed out" + return {"text_size": text_size, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, + "perf_counters": {}, "error": "link timed out"} if proc.returncode != 0: - return text_size, None, None, f"link failed: {proc.stderr[:200]}" + return {"text_size": text_size, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, + "perf_counters": {}, + "error": f"link failed: {proc.stderr[:200]}"} binary_size = os.path.getsize(binary) - runtime = run_benchmark(name, binary, tmp_dir, data_dir) - return text_size, binary_size, runtime, None + runtime, timings = run_benchmark(name, binary, tmp_dir, data_dir) + + # Optional perf stat (single run, deterministic counters) + perf_counters = {} + if enable_perf and runtime is not None: + perf_counters = run_perf_stat(name, binary, tmp_dir, data_dir) + + return { + "text_size": text_size, + "binary_size": binary_size, + "runtime": runtime, + "timings": timings, + "opt_stats": opt_stats, + "llc_stats": llc_stats, + "perf_counters": perf_counters, + "error": None, + } # --------------------------------------------------------------------------- @@ -429,10 +577,14 @@ def load_baseline(config: EvalConfig): with tempfile.TemporaryDirectory(prefix="evolve_baseline_") as tmp_dir: for bc in benchmarks: print(f" Baseline: {bc.stem}...", end=" ", flush=True) - text_size, binary_size, runtime, err = compile_benchmark( + r = compile_benchmark( bc, opt_path, llc_path, tmp_dir, config.data_dir, opt_timeout=config.opt_timeout, ) + text_size = r.get("text_size") 
+ binary_size = r.get("binary_size") + runtime = r.get("runtime") + err = r.get("error") if err: print(f"ERROR: {err}") elif text_size is not None: @@ -461,14 +613,15 @@ def load_baseline(config: EvalConfig): def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, data_dir, score_fn, evolved_opt_flags=None, - evolved_llc_flags=None, opt_timeout=120): + evolved_llc_flags=None, opt_timeout=120, + enable_stats=False, enable_perf=False): """Compile and score benchmarks. *score_fn(total_binary, baseline_total_binary, speedups)* computes the task-specific score from aggregate measurements. Returns ``(score, result_dict)`` where *result_dict* contains per-benchmark - details plus aggregate totals. + details (including stats, timings, perf counters) plus aggregate totals. """ total_binary = 0 baseline_total_binary = 0 @@ -479,17 +632,28 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, errors = [] for bc in benchmarks: - text_size, binary_size, runtime, err = compile_benchmark( + r = compile_benchmark( bc, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=evolved_opt_flags, evolved_llc_flags=evolved_llc_flags, opt_timeout=opt_timeout, + enable_stats=enable_stats, + enable_perf=enable_perf, ) bl = baseline.get(bc.name, {}) + text_size = r.get("text_size") + binary_size = r.get("binary_size") + runtime = r.get("runtime") + err = r.get("error") + info = { "text_size": text_size, "binary_size": binary_size, "runtime": runtime, + "timings": r.get("timings", []), + "opt_stats": r.get("opt_stats", {}), + "llc_stats": r.get("llc_stats", {}), + "perf_counters": r.get("perf_counters", {}), } if err: @@ -593,3 +757,311 @@ def objective(trial): best_params = study.best_params best_flags = [f"-{k}={v}" for k, v in best_params.items()] return study.best_value, best_params, best_flags + + +# --------------------------------------------------------------------------- +# ASI — Actionable Side Information (GEPA-style text gradients) +# --------------------------------------------------------------------------- + +@dataclass +class ScoreFormula: + """Describes how a task's score is computed from metrics. + + Used by ``generate_asi()`` to decompose the score into its components so + the LLM can understand what drives the fitness function. + """ + speedup_weight: float = 5.0 # multiplier on speedup_pct + binary_weight: float = 1.0 # multiplier on binary_reduction_pct + description: str = "5 * speedup% + binary_reduction%" + + +def _classify_signal(info, bl): + """Classify a benchmark result's signal reliability. 
+ + Returns a human-readable label: + - ``UNRELIABLE (<10ms)`` — baseline runtime too short for stable measurement + - ``HIGH_VARIANCE (<100ms)`` — baseline runtime marginal + - ``REAL (code changed)`` — text section changed AND meaningful speedup + - ``NOISE (same code)`` — speedup without code change (measurement noise) + - ``MARGINAL`` — small or no change + """ + bl_rt = bl.get("runtime") + speedup = info.get("speedup", 1.0) + text_pct = abs(info.get("text_reduction_pct", 0)) + speedup_delta = abs(speedup - 1.0) * 100 if speedup else 0 + + if bl_rt is not None and bl_rt < 0.01: + return "UNRELIABLE (<10ms)" + if bl_rt is not None and bl_rt < 0.1: + return "HIGH_VARIANCE (<100ms)" + if text_pct > 0.01 and speedup_delta > 1: + return "REAL (code changed)" + if text_pct <= 0.01 and speedup_delta > 1: + return "NOISE (same code)" + return "MARGINAL" + + +def _fmt_runtime(seconds): + """Format a runtime value for display.""" + if seconds is None: + return "N/A" + if seconds < 1.0: + return f"{seconds * 1000:.1f}ms" + return f"{seconds:.1f}s" + + +def generate_asi(score, result_dict, baseline, baseline_stats=None, + formula=None): + """Generate Actionable Side Information markdown narrative. + + Produces structured diagnostic feedback (GEPA-style "text gradients") + with up to four tiers of analysis: + + - **Tier 1** — Score decomposition + per-benchmark signal classification + - **Tier 2** — Compiler statistics delta vs baseline (requires *baseline_stats*) + - **Tier 3** — Runtime variance from individual timings + - **Tier 4** — Hardware perf counters (if collected) + """ + if formula is None: + formula = ScoreFormula() + details = result_dict.get("details", {}) + lines = [] + + # ---- Tier 1: Score Decomposition ---- + speedups = result_dict.get("speedups", []) + avg_speedup = sum(speedups) / len(speedups) if speedups else 0.0 + speedup_pct = (avg_speedup - 1.0) * 100 if avg_speedup > 0 else 0.0 + + total_binary = result_dict.get("total_binary", 0) + bl_total_binary = result_dict.get("baseline_total_binary", 0) + binary_pct = ( + 100.0 * (bl_total_binary - total_binary) / bl_total_binary + if bl_total_binary > 0 else 0.0 + ) + + lines.append(f"## Performance Analysis (Score: {score})") + lines.append("") + lines.append("### Score Decomposition") + lines.append(f"Formula: {formula.description}") + lines.append( + f"- Avg speedup: {avg_speedup:.4f}x ({speedup_pct:+.2f}%) " + f"x {formula.speedup_weight} = {formula.speedup_weight * speedup_pct:.2f}" + ) + lines.append( + f"- Binary reduction: {binary_pct:.2f}% " + f"x {formula.binary_weight} = {formula.binary_weight * binary_pct:.2f}" + ) + lines.append("") + + # Per-benchmark table + lines.append("### Per-Benchmark Results") + lines.append( + "| Benchmark | Speedup | Text D | Binary D | Baseline RT | Signal |" + ) + lines.append( + "|-----------|---------|--------|----------|-------------|--------|" + ) + + score_contributions = {} + for bname in sorted(details.keys()): + info = details[bname] + bl = baseline.get(bname, {}) + + speedup = info.get("speedup") + text_delta = info.get("text_reduction_pct", 0) + binary_delta = info.get("binary_reduction_pct", 0) + bl_rt = bl.get("runtime") + signal = _classify_signal(info, bl) + + sp_str = f"{(speedup - 1) * 100:+.1f}%" if speedup else "N/A" + text_str = f"{text_delta:+.2f}%" + binary_str = f"{binary_delta:+.2f}%" + rt_str = _fmt_runtime(bl_rt) + short_name = bname.replace(".bc", "") + + lines.append( + f"| {short_name} | {sp_str} | {text_str} | {binary_str} " + f"| {rt_str} | {signal} |" + ) + 
+ # Track score contribution per benchmark + if speedup and len(details) > 0: + contrib = ( + (speedup - 1.0) * 100 + * formula.speedup_weight + / len(details) + ) + score_contributions[bname] = contrib + + lines.append("") + + # Key observations + if score_contributions: + total_sp_score = sum(score_contributions.values()) + if total_sp_score != 0: + top = max(score_contributions, key=lambda k: abs(score_contributions[k])) + top_contrib = score_contributions[top] + top_pct = abs(top_contrib / total_sp_score * 100) + top_signal = _classify_signal(details[top], baseline.get(top, {})) + lines.append("### Key Observations") + short = top.replace(".bc", "") + lines.append( + f"- {short} contributes {top_pct:.0f}% of speedup score" + f" -- {top_signal}" + ) + + # Summarize real improvements + real_gains = [ + (n, details[n].get("speedup", 1.0)) + for n in details + if _classify_signal(details[n], baseline.get(n, {})).startswith("REAL") + and details[n].get("speedup", 1.0) > 1.0 + ] + if real_gains: + real_avg = ( + sum(s - 1.0 for _, s in real_gains) / len(real_gains) * 100 + ) + lines.append( + f"- Real avg speedup (code-changed benchmarks): {real_avg:+.1f}%" + ) + lines.append("") + + # ---- Tier 2: Compiler Statistics Delta ---- + if baseline_stats: + has_any_stats = any( + details[b].get("opt_stats") or details[b].get("llc_stats") + for b in details + ) + if has_any_stats: + lines.append("### Compiler Statistics Delta") + for bname in sorted(details.keys()): + info = details[bname] + bl_stats = baseline_stats.get(bname, {}) + + evolved_opt = info.get("opt_stats", {}) + evolved_llc = info.get("llc_stats", {}) + bl_opt = bl_stats.get("opt_stats", {}) + bl_llc = bl_stats.get("llc_stats", {}) + + # Combine and find interesting deltas + deltas = [] + for key in set(list(evolved_opt.keys()) + list(bl_opt.keys())): + ev = evolved_opt.get(key, 0) + bl_v = bl_opt.get(key, 0) + if bl_v != 0 and ev != bl_v: + pct = (ev - bl_v) / bl_v * 100 + deltas.append((key, bl_v, ev, ev - bl_v, pct)) + for key in set(list(evolved_llc.keys()) + list(bl_llc.keys())): + ev = evolved_llc.get(key, 0) + bl_v = bl_llc.get(key, 0) + if bl_v != 0 and ev != bl_v: + pct = (ev - bl_v) / bl_v * 100 + deltas.append((key, bl_v, ev, ev - bl_v, pct)) + + if deltas: + deltas.sort(key=lambda x: abs(x[4]), reverse=True) + short = bname.replace(".bc", "") + lines.append(f"\n**{short}** (top changes):") + lines.append("| Metric | Baseline | Evolved | Delta |") + lines.append("|--------|----------|---------|-------|") + for key, bl_v, ev, delta, pct in deltas[:8]: + lines.append( + f"| {key} | {bl_v:,} | {ev:,} " + f"| {delta:+,} ({pct:+.1f}%) |" + ) + lines.append("") + + # ---- Tier 3: Runtime Variance ---- + has_timings = any( + len(details[b].get("timings", [])) > 1 for b in details + ) + if has_timings: + lines.append("### Runtime Variance") + lines.append("| Benchmark | Timings | CoV | Signal |") + lines.append("|-----------|---------|-----|--------|") + for bname in sorted(details.keys()): + timings = details[bname].get("timings", []) + if len(timings) < 2: + continue + mean = sum(timings) / len(timings) + variance = sum((t - mean) ** 2 for t in timings) / (len(timings) - 1) + stdev = math.sqrt(variance) + cov = (stdev / mean * 100) if mean > 0 else 0 + signal = ( + "STABLE" if cov < 5 else ("MODERATE" if cov < 15 else "NOISY") + ) + timing_strs = ", ".join(f"{t:.4f}" for t in timings[:5]) + short = bname.replace(".bc", "") + lines.append(f"| {short} | {timing_strs} | {cov:.1f}% | {signal} |") + lines.append("") + + # ---- Tier 
4: Hardware Counters ---- + has_perf = any(details[b].get("perf_counters") for b in details) + if has_perf: + lines.append("### Hardware Counters") + for bname in sorted(details.keys()): + perf = details[bname].get("perf_counters", {}) + if not perf: + continue + short = bname.replace(".bc", "") + lines.append(f"\n**{short}**:") + lines.append("| Counter | Value |") + lines.append("|---------|-------|") + for counter, value in sorted(perf.items()): + lines.append(f"| {counter} | {value:,} |") + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Baseline stats caching +# --------------------------------------------------------------------------- + +def load_baseline_stats(config): + """Load or compute baseline compiler stats (``opt``/``llc -stats`` output). + + Stats are cached in ``baseline_stats.json`` alongside the baseline file. + Re-generates when the file is missing. + """ + stats_path = Path(config.baseline_file).parent / "baseline_stats.json" + if stats_path.exists(): + with open(stats_path) as f: + return json.load(f) + + # Compile each benchmark with -stats (no evolved flags) to collect baseline + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + return {} + + baseline_stats = {} + with tempfile.TemporaryDirectory(prefix="evolve_blstats_") as tmp_dir: + for bc in benchmarks: + print(f" Baseline stats: {bc.stem}...", end=" ", flush=True) + r = compile_benchmark( + bc, opt_path, llc_path, tmp_dir, config.data_dir, + opt_timeout=config.opt_timeout, enable_stats=True, + ) + if r.get("error"): + print(f"ERROR: {r['error']}") + else: + baseline_stats[bc.name] = { + "opt_stats": r.get("opt_stats", {}), + "llc_stats": r.get("llc_stats", {}), + } + opt_n = len(r.get("opt_stats", {})) + llc_n = len(r.get("llc_stats", {})) + print(f"opt: {opt_n} stats, llc: {llc_n} stats") + + try: + os.makedirs(stats_path.parent, exist_ok=True) + with open(stats_path, "w") as f: + json.dump(baseline_stats, f, indent=2) + print(f" Baseline stats saved to {stats_path}") + except OSError: + pass + + return baseline_stats diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py index 58b4518..dea6b08 100644 --- a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py @@ -24,19 +24,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + def 
_score(total_binary, baseline_total_binary, speedups): """Inlining score: binary reduction % + speedup bonus.""" @@ -138,6 +143,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_opt_flags=evolved_opt_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -162,6 +169,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=0.1, + binary_weight=1.0, + description="binary_reduction% + (avg_speedup - 1) x 10", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py index d16c628..85f91da 100644 --- a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -25,19 +25,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + _EVAL_DIR = Path(__file__).resolve().parent @@ -140,6 +145,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_opt_flags=evolved_opt_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -158,6 +165,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=5.0, + binary_weight=1.0, + description="5 x speedup% + binary_reduction%", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: diff --git a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py index 
7edd176..b5bf452 100644 --- a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py +++ b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py @@ -24,19 +24,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + _EVAL_DIR = Path(__file__).resolve().parent @@ -138,6 +143,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_llc_flags=evolved_llc_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -156,6 +163,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=5.0, + binary_weight=1.0, + description="5 x speedup% + binary_reduction%", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: From b00304138a31e24abf3455ab1de84c4a038225af Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 00:10:18 -0800 Subject: [PATCH 6/8] Add Tier 5 optimization remarks + fix GEPA adapter for real API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 5: per-decision optimization remarks via -pass-remarks-output. Line-by-line state-machine YAML parser (no PyYAML dependency) extracts inline/loop-unroll !Passed/!Missed documents, compares evolved vs baseline to identify flipped decisions with cost/threshold values. Wired through compile_benchmark → eval_benchmarks → generate_asi. Enabled via EVOLVE_ENABLE_REMARKS=1 (~20% overhead). GEPA: rewrite gepa_adapter.py evaluator to return (score, side_info) tuple per GEPA protocol, passing ASI as native Feedback channel. Rewrite gepa_run.py to use real optimize_anything API with GEPAConfig, EngineConfig, ReflectionConfig. Add --auto-respond flag for smoke testing (background thread auto-creates response files). README: add ASI tiers explainer and GEPA integration guide. 
Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 142 +++++++- src/mlirAgent/evolve/gepa_adapter.py | 20 +- src/mlirAgent/evolve/gepa_run.py | 143 ++++++-- src/mlirAgent/evolve/tasks/llvm_bench.py | 323 +++++++++++++++++- .../evolve/tasks/llvm_inlining/evaluate.py | 11 +- .../evolve/tasks/loop_unrolling/evaluate.py | 11 +- .../tasks/regalloc_priority/evaluate.py | 11 +- 7 files changed, 618 insertions(+), 43 deletions(-) diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index d76a558..25ee3ae 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -210,6 +210,135 @@ Each benchmark is run **5 times** and the **median** wall-clock time is used scheduling and process startup, though very short benchmarks (sqlite3 at 2ms) remain unreliable. +## ASI — Actionable Side Information + +ASI is a structured diagnostic feedback mechanism inspired by GEPA's "text +gradients". Instead of returning only a scalar score to the LLM, the evaluator +generates a multi-tier markdown narrative explaining *why* the code scored as it +did and *what to change*. + +### Tiers + +| Tier | Content | Overhead | Config | +|------|---------|----------|--------| +| **1** | Score decomposition + per-benchmark signal classification | Zero | Always on | +| **2** | Compiler statistics delta (`-stats` output vs baseline) | Zero | `EVOLVE_ENABLE_STATS=1` (default) | +| **3** | Runtime variance (CoV from 5 runs, STABLE/MODERATE/NOISY) | Zero | Always on | +| **4** | Hardware perf counters (instructions, cycles, cache/branch misses) | ~1s | `EVOLVE_ENABLE_PERF=1` | +| **5** | Optimization decision changes (`-pass-remarks-output` YAML diff) | ~20% | `EVOLVE_ENABLE_REMARKS=1` | + +### Tier 1: Score Decomposition + +Breaks the score into its components (speedup vs binary reduction) and +classifies each benchmark's signal reliability: + +- **UNRELIABLE (<10ms)** — baseline runtime too short (e.g., sqlite3 at 2ms) +- **HIGH_VARIANCE (<100ms)** — borderline runtime stability +- **REAL (code changed)** — text section changed AND meaningful speedup +- **NOISE (same code)** — speedup without code change (measurement artifact) +- **MARGINAL** — small or no change + +### Tier 2: Compiler Statistics Delta + +Compares LLVM `-stats` output between evolved and baseline compilations. Shows +which optimization passes changed behavior (e.g., "inline - Number of functions +inlined: 1234 → 1567, +27%"). + +### Tier 5: Optimization Decision Changes + +Compares per-decision optimization remarks (YAML) between evolved and baseline. +Identifies "flipped" decisions — functions that changed from inlined→rejected +or rejected→inlined — with their cost/threshold values. This gives the LLM +precise targets: "function X was rejected because cost=500 exceeds threshold=225; +lower the cost or raise the threshold." + +The remarks parser uses a line-by-line state machine (not PyYAML) for +performance on 62MB files. Only `inline` and `loop-unroll` pass remarks are +extracted. 
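+
+For reference, here is a minimal sketch of the remark shape the parser
+consumes. The field values are illustrative (they mirror the example output
+below, not a real run), and the import path assumes the `tasks/` package
+layout described earlier:
+
+```python
+import tempfile
+import textwrap
+
+from tasks.llvm_bench import parse_remarks  # assumed import path
+
+# One representative !Missed inline remark in -pass-remarks-output format.
+sample = textwrap.dedent("""\
+    --- !Missed
+    Pass:            inline
+    Name:            TooCostly
+    Function:        memory_Free
+    Args:
+      - Callee:          allocBlock
+      - Cost:            '500'
+      - Threshold:       '225'
+    ...
+    """)
+
+with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
+    f.write(sample)
+
+remarks = parse_remarks(f.name)
+# remarks["missed"][0] ->
+#   {"pass": "inline", "name": "TooCostly", "function": "memory_Free",
+#    "args": {"Callee": "allocBlock", "Cost": "500", "Threshold": "225"}}
+```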
+ +### Example ASI Output + +```markdown +## Performance Analysis (Score: 8.78) + +### Score Decomposition +Formula: binary_reduction% + (avg_speedup - 1) x 10 +- Avg speedup: 1.0023x (+0.23%) x 0.1 = 0.02 +- Binary reduction: 9.24% x 1.0 = 9.24 + +### Per-Benchmark Results +| Benchmark | Speedup | Text D | Binary D | Baseline RT | Signal | +|-----------|---------|--------|----------|-------------|--------| +| spass | +0.3% | +12.31%| +10.42% | 8.1s | REAL | +| tramp3d-v4| -1.2% | -3.45% | -2.11% | 0.11s | HIGH_VARIANCE | + +### Optimization Decisions +**spass** (412 decisions changed vs baseline): +- 287 newly passed (were rejected) +- 125 newly rejected (were passed) + +| Function | Callee | Direction | BL Cost/Thresh | Ev Cost/Thresh | +|----------|--------|-----------|----------------|----------------| +| memory_Free | allocBlock | now passed | 500/225 | -15025/225 | +``` + +## GEPA Integration + +[GEPA](https://github.com/google-deepmind/gepa) (Generalist Evolutionary +Prompt Architect) is an optimization framework that uses LLM reflections to +evolve arbitrary text parameters. We integrate GEPA as an alternative to +OpenEvolve for driving LLVM heuristic evolution. + +### Architecture + +``` +GEPA optimize_anything() + │ + ├─ evaluator(code_str) → (score, {"Feedback": ASI_markdown}) + │ └─ Our make_evaluator(): patch LLVM, build, benchmark, generate ASI + │ + └─ reflection_lm(prompt) → str + └─ ManualLM: write prompt to disk, poll for response file +``` + +Key insight: GEPA's evaluator protocol accepts `(score, side_info_dict)` tuples. +We pass our ASI as `{"Feedback": asi_text}`, which GEPA includes in its +reflection prompt alongside the candidate code. This gives the LLM rich +diagnostic context for proposing improvements. + +### Usage + +```bash +# Manual mode: prompts appear as prompt_NNN.md, you write prompt_NNN.response.md +python gepa_run.py --task llvm_inlining --max-evals 10 + +# Auto mode for smoke testing (auto-responds with trivially modified code) +python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond +``` + +### Configuration + +| Flag | Default | Description | +|------|---------|-------------| +| `--task` | (required) | `llvm_inlining`, `loop_unrolling`, or `regalloc_priority` | +| `--max-evals` | 10 | Maximum evaluator calls (seed + proposals) | +| `--prompts-dir` | `gepa_prompts` | Directory for prompt/response files | +| `--output-dir` | `/run` | GEPA state directory (for resume) | +| `--auto-respond` | off | Spawn background thread that auto-creates responses | + +### GEPA vs OpenEvolve + +| Feature | OpenEvolve | GEPA | +|---------|-----------|------| +| Population | MAP-Elites (50 candidates) | Pareto frontier | +| Feedback | Scalar score only → ASI via artifacts | Native side-info channel | +| LLM interface | ManualLLM (file-based) | ManualLM (file-based) | +| Hyperparameter tuning | Optuna inner-loop | Not integrated (future) | +| Resume | Checkpoint directory | `run_dir` state | + +Both frameworks use our same evaluation pipeline (`llvm_bench.py`), so scores +are directly comparable. 
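+
+As noted above, the evaluator protocol itself is just a tuple. A toy sketch
+(names and values are placeholders; the real evaluator is built by
+`make_evaluator()` in `gepa_adapter.py`):
+
+```python
+# Toy illustration of GEPA's (score, side_info) evaluator protocol.
+def toy_evaluator(code_str: str) -> tuple:
+    combined_score = 0.0                      # stand-in for the benchmark score
+    asi_text = "## Performance Analysis ..."  # stand-in for generate_asi() output
+    return combined_score, {"Feedback": asi_text}
+
+# GEPA calls this on each candidate and folds side_info["Feedback"] into
+# the next reflection prompt.
+score, side_info = toy_evaluator("// candidate C++ heuristic")
+```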
+ ## LLVM Hooks ### Inlining (`-use-evolved-inline-cost`) @@ -291,14 +420,20 @@ config = EvalConfig.from_env( | `EVOLVE_BUILD_DIR` | (required) | LLVM ninja build directory | | `EVOLVE_OPT_TIMEOUT` | 120 | Per-benchmark opt/llc timeout (seconds) | | `EVOLVE_OPTUNA_TRIALS` | 20 | Optuna trials (0 = disable) | +| `EVOLVE_ENABLE_STATS` | 1 | Tier 2: collect `-stats` output | +| `EVOLVE_ENABLE_PERF` | 0 | Tier 4: collect perf counters | +| `EVOLVE_ENABLE_REMARKS` | 0 | Tier 5: collect optimization remarks (~20% overhead) | ## Task Structure ``` src/mlirAgent/evolve/ - manual_run.py # Orchestrator: --auto/--wait/--resume + manual_run.py # OpenEvolve orchestrator: --auto/--wait/--resume + gepa_run.py # GEPA orchestrator: --auto-respond + gepa_adapter.py # GEPA evaluator bridge (score, side_info) + gepa_manual_lm.py # File-based LLM for GEPA tasks/ - llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna + llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna, ASI llvm_inlining/ evaluate.py # _score(): bin_red% + speedup*10 initial.cpp # Seed: sums heuristic features - threshold @@ -307,6 +442,9 @@ src/mlirAgent/evolve/ compile_testsuite.sh # Script to build .bc from llvm-test-suite testsuite/ # .bc files (gitignored, built locally) data/ # Runtime input data per benchmark + loop_unrolling/ + evaluate.py # _score(): 5*speedup% + bin_red% + initial.cpp # Seed: LLVM default unroll heuristic regalloc_priority/ evaluate.py # _score(): 5*speedup% + bin_red% initial.cpp # Seed: LLVM default bit-packed priority diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py index b1ff02c..2e9082b 100644 --- a/src/mlirAgent/evolve/gepa_adapter.py +++ b/src/mlirAgent/evolve/gepa_adapter.py @@ -40,8 +40,9 @@ def inject_evolve_block(template, block): def make_evaluator(task_name, config=None): """Create an evaluator function for GEPA. - Returns a callable ``code_str -> float`` that compiles and benchmarks - the given C++ source code, returning the ``combined_score``. + Returns a callable ``code_str -> (score, side_info)`` matching GEPA's + evaluator protocol. *side_info* is a dict that may contain a + ``"Feedback"`` key with ASI markdown text. 
""" if task_name == "llvm_inlining": from tasks.llvm_inlining.evaluate import evaluate @@ -65,7 +66,7 @@ def make_evaluator(task_name, config=None): raise ValueError(f"Unknown task: {task_name}") def evaluator(code_str): - """Write code to temp file, evaluate, return score.""" + """Write code to temp file, evaluate, return (score, side_info).""" with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="gepa_" ) as f: @@ -74,9 +75,16 @@ def evaluator(code_str): try: result = evaluate(tmp_path, config=config) if isinstance(result, dict): - return result.get("combined_score", 0.0) - # EvaluationResult - return result.metrics.get("combined_score", 0.0) + score = result.get("combined_score", 0.0) + side_info = {} + else: + # EvaluationResult from OpenEvolve + score = result.metrics.get("combined_score", 0.0) + if hasattr(result, "artifacts") and "asi" in result.artifacts: + side_info = {"Feedback": result.artifacts["asi"]} + else: + side_info = {} + return score, side_info finally: os.unlink(tmp_path) diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py index cd38202..8e7dfb5 100644 --- a/src/mlirAgent/evolve/gepa_run.py +++ b/src/mlirAgent/evolve/gepa_run.py @@ -3,6 +3,7 @@ Usage:: python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] + python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond Requires ``pip install gepa`` and environment variables: - LLVM_SRC_PATH: path to LLVM source tree @@ -12,7 +13,10 @@ import argparse import json import os +import re import sys +import threading +import time from pathlib import Path # Ensure local packages are importable @@ -25,6 +29,70 @@ "regalloc_priority": "tasks/regalloc_priority/initial.cpp", } +# Task → objective string for GEPA +_TASK_OBJECTIVE = { + "llvm_inlining": ( + "Maximize binary size reduction across CTMark benchmarks " + "by modifying the inlining cost heuristic." + ), + "loop_unrolling": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the loop unrolling heuristic." + ), + "regalloc_priority": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the register allocation priority function." + ), +} + +_TASK_BACKGROUND = ( + "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " + "The function is compiled into the opt/llc tools and evaluated against CTMark " + "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " + "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " + "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " + "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " + "// [hyperparam]: flag-name, type, min, max comments for the autotuner." +) + + +def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): + """Background thread that auto-creates response files for smoke testing. + + Watches for new prompt_NNN.md files and creates prompt_NNN.response.md + with a trivially modified version of the initial code. 
+ """ + seen = set() + prompt_re = re.compile(r"^prompt_(\d+)\.md$") + + while True: + try: + for fname in os.listdir(prompts_dir): + m = prompt_re.match(fname) + if not m: + continue + num = m.group(1) + response_name = f"prompt_{num}.response.md" + if response_name in seen: + continue + response_path = os.path.join(prompts_dir, response_name) + if os.path.exists(response_path): + seen.add(response_name) + continue + + # Create a trivially modified version of the code + modified = initial_code.replace( + "// EVOLVE-BLOCK-START", + f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", + ) + with open(response_path, "w") as f: + f.write(f"```cpp\n{modified}\n```\n") + seen.add(response_name) + print(f" [auto-respond] Created {response_name}") + except OSError: + pass + time.sleep(poll_interval) + def main(): parser = argparse.ArgumentParser( @@ -48,18 +116,28 @@ def main(): help="Poll interval for response files in seconds (default: 2.0)", ) parser.add_argument( - "--max-iterations", type=int, default=10, - help="Maximum GEPA iterations (default: 10)", + "--max-evals", type=int, default=10, + help="Maximum evaluator calls (default: 10)", + ) + parser.add_argument( + "--output-dir", default=None, + help="GEPA run directory for state/resume (default: /run)", ) parser.add_argument( "--output", default=None, help="Path to save best code (default: tasks//gepa_best.cpp)", ) + parser.add_argument( + "--auto-respond", action="store_true", + help="Auto-create response files for smoke testing", + ) args = parser.parse_args() # Import GEPA try: - from gepa import optimize_anything + from gepa.optimize_anything import ( + optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, + ) except ImportError: print("Error: gepa not installed. Run: pip install gepa") sys.exit(1) @@ -88,36 +166,57 @@ def main(): ) evaluator = make_evaluator(args.task) + run_dir = args.output_dir or os.path.join(args.prompts_dir, "run") + os.makedirs(run_dir, exist_ok=True) + print(f"{'=' * 60}") print(f"GEPA Runner") print(f" Task: {args.task}") print(f" Initial code: {initial_file}") print(f" Prompts dir: {args.prompts_dir}") - print(f" Max iterations: {args.max_iterations}") + print(f" Max evals: {args.max_evals}") + print(f" Run dir: {run_dir}") + print(f" Auto-respond: {args.auto_respond}") print(f"{'=' * 60}") print() - # Evaluate initial program first - print("Evaluating initial program...") - initial_score = evaluator(initial_code) - print(f" Initial score: {initial_score}") - print() + # Start auto-responder thread if requested + if args.auto_respond: + t = threading.Thread( + target=_auto_respond_thread, + args=(args.prompts_dir, initial_code, args.poll_interval), + daemon=True, + ) + t.start() + print(" [auto-respond] Background responder started") + + # Run GEPA with real API + objective = _TASK_OBJECTIVE[args.task] + config = GEPAConfig( + engine=EngineConfig( + max_metric_calls=args.max_evals, + parallel=False, + run_dir=run_dir, + ), + reflection=ReflectionConfig( + reflection_lm=lm, + ), + ) - # Run GEPA result = optimize_anything( - initial_code=initial_code, - evaluate_fn=evaluator, - lm=lm, - max_iterations=args.max_iterations, + seed_candidate=initial_code, + evaluator=evaluator, + objective=objective, + background=_TASK_BACKGROUND, + config=config, ) print() print(f"{'=' * 60}") print(f"GEPA Results:") - print(f" Best score: {result.best_score}") - print(f" Initial score: {initial_score}") - print(f" Improvement: {result.best_score - initial_score:+.4f}") - print(f" Iterations: 
{result.iterations}") + print(f" Best candidate: {len(result.best_candidate)} chars") + print(f" Num candidates: {result.num_candidates}") + print(f" Total evals: {result.total_metric_calls}") print(f"{'=' * 60}") # Save best code @@ -126,16 +225,16 @@ def main(): ) os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: - f.write(result.best_code) + f.write(result.best_candidate) print(f"Best code saved to: {output_path}") # Save summary summary = { "task": args.task, - "initial_score": initial_score, - "best_score": result.best_score, - "iterations": result.iterations, + "num_candidates": result.num_candidates, + "total_metric_calls": result.total_metric_calls, "output_path": output_path, + "run_dir": run_dir, } summary_path = os.path.join(args.prompts_dir, "summary.json") with open(summary_path, "w") as f: diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 20957cb..1df35c7 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -289,6 +289,195 @@ def parse_perf_output(perf_stderr): return counters +# --------------------------------------------------------------------------- +# Optimization remarks parsing (Tier 5) +# --------------------------------------------------------------------------- + +# Pass names we care about for remarks +_REMARK_PASSES = {"inline", "loop-unroll"} + + +def parse_remarks(remarks_file): + """Parse LLVM optimization remarks YAML using line-by-line state machine. + + Avoids PyYAML for performance (62MB files). Extracts only inline and + loop-unroll related ``!Passed`` / ``!Missed`` documents. + + Returns ``{"passed": [...], "missed": [...]}`` where each entry is + ``{"pass": str, "name": str, "function": str, "args": dict}``. 
+ """ + passed = [] + missed = [] + + doc_type = None # "passed" or "missed" + cur = None # current document dict + in_args = False + last_arg_key = None + + try: + fh = open(remarks_file, "r", errors="replace") + except OSError: + return {"passed": [], "missed": []} + + try: + for line in fh: + stripped = line.rstrip() + + # New document separator + if stripped.startswith("--- !"): + # Flush previous document + if cur and cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + + tag = stripped[5:].strip() + if tag == "Passed": + doc_type = "passed" + cur = {"pass": "", "name": "", "function": "", "args": {}} + in_args = False + elif tag == "Missed": + doc_type = "missed" + cur = {"pass": "", "name": "", "function": "", "args": {}} + in_args = False + else: + doc_type = None + cur = None + in_args = False + continue + + if cur is None: + continue + + # End of document + if stripped == "...": + if cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + cur = None + doc_type = None + in_args = False + continue + + # Top-level fields + if not in_args: + if stripped.startswith("Pass:"): + cur["pass"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Name:"): + cur["name"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Function:"): + cur["function"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Args:"): + in_args = True + last_arg_key = None + else: + # Inside Args list — look for key-value pairs + s = stripped.lstrip() + if s.startswith("- "): + # New arg entry: "- Callee: foo" + kv = s[2:] + colon = kv.find(":") + if colon > 0: + key = kv[:colon].strip() + val = kv[colon + 1:].strip().strip("'\"") + cur["args"][key] = val + last_arg_key = key + elif ":" in s and not s.startswith("#"): + # Continuation key on same arg: " Cost: '15'" + colon = s.find(":") + key = s[:colon].strip() + val = s[colon + 1:].strip().strip("'\"") + if key: + cur["args"][key] = val + finally: + fh.close() + + # Flush last document + if cur and cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + + return {"passed": passed, "missed": missed} + + +def summarize_remarks(evolved_remarks, baseline_remarks): + """Compare evolved vs baseline remarks to find flipped decisions. + + Returns a compact summary dict with counts and top flipped decisions. 
+ """ + summary = { + "evolved_passed": len(evolved_remarks.get("passed", [])), + "evolved_missed": len(evolved_remarks.get("missed", [])), + "baseline_passed": len(baseline_remarks.get("passed", [])), + "baseline_missed": len(baseline_remarks.get("missed", [])), + "flipped": [], + } + + # Build lookup: (function, callee) -> doc for baseline + def _key(doc): + callee = doc["args"].get("Callee", "") + return (doc["function"], callee) + + bl_passed = {} + for doc in baseline_remarks.get("passed", []): + k = _key(doc) + bl_passed[k] = doc + + bl_missed = {} + for doc in baseline_remarks.get("missed", []): + k = _key(doc) + bl_missed[k] = doc + + # Find newly passed (were missed in baseline) + for doc in evolved_remarks.get("passed", []): + k = _key(doc) + if k in bl_missed: + bl_doc = bl_missed[k] + flip = { + "function": doc["function"], + "callee": doc["args"].get("Callee", ""), + "pass": doc["pass"], + "direction": "newly_passed", + "evolved_cost": doc["args"].get("Cost", ""), + "evolved_threshold": doc["args"].get("Threshold", ""), + "baseline_cost": bl_doc["args"].get("Cost", ""), + "baseline_threshold": bl_doc["args"].get("Threshold", ""), + } + summary["flipped"].append(flip) + + # Find newly missed (were passed in baseline) + for doc in evolved_remarks.get("missed", []): + k = _key(doc) + if k in bl_passed: + bl_doc = bl_passed[k] + flip = { + "function": doc["function"], + "callee": doc["args"].get("Callee", ""), + "pass": doc["pass"], + "direction": "newly_missed", + "evolved_cost": doc["args"].get("Cost", ""), + "evolved_threshold": doc["args"].get("Threshold", ""), + "baseline_cost": bl_doc["args"].get("Cost", ""), + "baseline_threshold": bl_doc["args"].get("Threshold", ""), + } + summary["flipped"].append(flip) + + # Sort flipped by absolute cost difference (most impactful first) + def _sort_key(f): + try: + return abs(int(f["evolved_cost"]) - int(f["baseline_cost"])) + except (ValueError, TypeError): + return 0 + summary["flipped"].sort(key=_sort_key, reverse=True) + + return summary + + def run_perf_stat(name, binary_path, tmp_dir, data_dir, counters=None): """Run a single ``perf stat`` measurement. Returns dict of counter values.""" @@ -416,7 +605,7 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=None, evolved_llc_flags=None, opt_timeout=120, enable_stats=False, - enable_perf=False): + enable_perf=False, enable_remarks=False): """Compile a .bc file through ``opt -> llc -> gcc``. 
Callers pass evolved flags to *opt*, *llc*, or both: @@ -435,12 +624,16 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, def _err(msg): return {"text_size": None, "binary_size": None, "runtime": None, "timings": [], "opt_stats": {}, "llc_stats": {}, - "perf_counters": {}, "error": msg} + "perf_counters": {}, "opt_remarks": {}, "error": msg} # opt pass opt_cmd = [str(opt_path), "-O2"] if enable_stats: opt_cmd.append("-stats") + remarks_file = None + if enable_remarks: + remarks_file = os.path.join(tmp_dir, f"{name}_remarks.yaml") + opt_cmd.append(f"-pass-remarks-output={remarks_file}") if evolved_opt_flags: opt_cmd.extend(evolved_opt_flags) opt_cmd += [str(bc_path), "-o", opt_bc] @@ -455,6 +648,7 @@ def _err(msg): return _err(proc.stderr[:500]) opt_stats = parse_stats(proc.stderr) if enable_stats else {} + opt_remarks = parse_remarks(remarks_file) if remarks_file else {} # llc: bitcode -> object llc_cmd = [str(llc_path), "-O2", "-filetype=obj", "-relocation-model=pic"] @@ -487,11 +681,12 @@ def _err(msg): except subprocess.TimeoutExpired: return {"text_size": text_size, "binary_size": None, "runtime": None, "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, - "perf_counters": {}, "error": "link timed out"} + "perf_counters": {}, "opt_remarks": opt_remarks, + "error": "link timed out"} if proc.returncode != 0: return {"text_size": text_size, "binary_size": None, "runtime": None, "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, - "perf_counters": {}, + "perf_counters": {}, "opt_remarks": opt_remarks, "error": f"link failed: {proc.stderr[:200]}"} binary_size = os.path.getsize(binary) @@ -510,6 +705,7 @@ def _err(msg): "opt_stats": opt_stats, "llc_stats": llc_stats, "perf_counters": perf_counters, + "opt_remarks": opt_remarks, "error": None, } @@ -614,7 +810,8 @@ def load_baseline(config: EvalConfig): def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, data_dir, score_fn, evolved_opt_flags=None, evolved_llc_flags=None, opt_timeout=120, - enable_stats=False, enable_perf=False): + enable_stats=False, enable_perf=False, + enable_remarks=False): """Compile and score benchmarks. *score_fn(total_binary, baseline_total_binary, speedups)* computes the @@ -639,6 +836,7 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, opt_timeout=opt_timeout, enable_stats=enable_stats, enable_perf=enable_perf, + enable_remarks=enable_remarks, ) bl = baseline.get(bc.name, {}) text_size = r.get("text_size") @@ -654,6 +852,7 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, "opt_stats": r.get("opt_stats", {}), "llc_stats": r.get("llc_stats", {}), "perf_counters": r.get("perf_counters", {}), + "opt_remarks": r.get("opt_remarks", {}), } if err: @@ -811,16 +1010,17 @@ def _fmt_runtime(seconds): def generate_asi(score, result_dict, baseline, baseline_stats=None, - formula=None): + formula=None, baseline_remarks=None): """Generate Actionable Side Information markdown narrative. 
Produces structured diagnostic feedback (GEPA-style "text gradients") - with up to four tiers of analysis: + with up to five tiers of analysis: - **Tier 1** — Score decomposition + per-benchmark signal classification - **Tier 2** — Compiler statistics delta vs baseline (requires *baseline_stats*) - **Tier 3** — Runtime variance from individual timings - **Tier 4** — Hardware perf counters (if collected) + - **Tier 5** — Optimization decision changes (requires *baseline_remarks*) """ if formula is None: formula = ScoreFormula() @@ -1011,6 +1211,71 @@ def generate_asi(score, result_dict, baseline, baseline_stats=None, lines.append(f"| {counter} | {value:,} |") lines.append("") + # ---- Tier 5: Optimization Decision Changes ---- + if baseline_remarks: + has_remarks = any(details[b].get("opt_remarks") for b in details) + if has_remarks: + lines.append("### Optimization Decisions") + for bname in sorted(details.keys()): + evolved_rm = details[bname].get("opt_remarks", {}) + bl_rm = baseline_remarks.get(bname, {}) + if not evolved_rm and not bl_rm: + continue + + summary = summarize_remarks(evolved_rm, bl_rm) + flipped = summary.get("flipped", []) + if not flipped and summary["evolved_passed"] == summary["baseline_passed"]: + continue + + short = bname.replace(".bc", "") + n_flipped = len(flipped) + newly_passed = sum( + 1 for f in flipped if f["direction"] == "newly_passed" + ) + newly_missed = sum( + 1 for f in flipped if f["direction"] == "newly_missed" + ) + lines.append( + f"\n**{short}** ({n_flipped} decisions changed vs baseline):" + ) + if newly_passed: + lines.append(f"- {newly_passed} newly passed (were rejected)") + if newly_missed: + lines.append(f"- {newly_missed} newly rejected (were passed)") + + # Show top flipped decisions with cost/threshold info + top_flips = flipped[:5] + if top_flips: + lines.append("") + lines.append( + "| Function | Callee | Direction | " + "BL Cost/Thresh | Ev Cost/Thresh |" + ) + lines.append( + "|----------|--------|-----------|" + "----------------|----------------|" + ) + for f in top_flips: + direction = ( + "now passed" if f["direction"] == "newly_passed" + else "now rejected" + ) + bl_ct = ( + f"{f['baseline_cost']}/{f['baseline_threshold']}" + if f["baseline_cost"] else "N/A" + ) + ev_ct = ( + f"{f['evolved_cost']}/{f['evolved_threshold']}" + if f["evolved_cost"] else "N/A" + ) + func = f["function"][:30] + callee = f["callee"][:20] + lines.append( + f"| {func} | {callee} | {direction} " + f"| {bl_ct} | {ev_ct} |" + ) + lines.append("") + return "\n".join(lines) @@ -1065,3 +1330,47 @@ def load_baseline_stats(config): pass return baseline_stats + + +def load_baseline_remarks(config): + """Load or compute baseline optimization remarks. + + Remarks are cached in ``baseline_remarks.json`` alongside the baseline + file. Re-generates when the file is missing. Only called when + ``config.enable_remarks`` is True. 
+ """ + remarks_path = Path(config.baseline_file).parent / "baseline_remarks.json" + if remarks_path.exists(): + with open(remarks_path) as f: + return json.load(f) + + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + return {} + + baseline_remarks = {} + with tempfile.TemporaryDirectory(prefix="evolve_blremarks_") as tmp_dir: + for bc in benchmarks: + print(f" Baseline remarks: {bc.stem}...", end=" ", flush=True) + r = compile_benchmark( + bc, opt_path, llc_path, tmp_dir, config.data_dir, + opt_timeout=config.opt_timeout, enable_remarks=True, + ) + remarks = r.get("opt_remarks", {}) + n_passed = len(remarks.get("passed", [])) + n_missed = len(remarks.get("missed", [])) + baseline_remarks[bc.name] = remarks + print(f"passed={n_passed}, missed={n_missed}") + + try: + os.makedirs(remarks_path.parent, exist_ok=True) + with open(remarks_path, "w") as f: + json.dump(baseline_remarks, f) + print(f" Baseline remarks saved to {remarks_path}") + except OSError: + pass + + return baseline_remarks diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py index dea6b08..25162f3 100644 --- a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py @@ -26,7 +26,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -34,7 +35,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -145,6 +147,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -173,6 +176,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -180,6 +186,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="binary_reduction% + (avg_speedup - 1) x 10", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py index 85f91da..e6563a5 100644 --- a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -27,7 +27,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + 
load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -35,7 +36,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -147,6 +149,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -169,6 +172,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -176,6 +182,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="5 x speedup% + binary_reduction%", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: diff --git a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py index b5bf452..a09acf2 100644 --- a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py +++ b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py @@ -26,7 +26,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -34,7 +35,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -145,6 +147,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -167,6 +170,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -174,6 +180,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="5 x speedup% + binary_reduction%", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: From c0cf65b47ba708e47704cfcdcd49d163fdd6ea8d Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 09:07:50 -0800 Subject: [PATCH 7/8] Fix GEPA runner: disable cloudpickle, tolerate eval exceptions Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/gepa_run.py | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py index 8e7dfb5..e2f10f9 100644 --- a/src/mlirAgent/evolve/gepa_run.py +++ b/src/mlirAgent/evolve/gepa_run.py @@ -197,6 +197,8 @@ def main(): max_metric_calls=args.max_evals, parallel=False, run_dir=run_dir, + use_cloudpickle=False, + raise_on_exception=False, ), reflection=ReflectionConfig( reflection_lm=lm, From f5d18b58a193e59e7044a99dcc8cba0e46d52aa0 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 09:40:59 -0800 Subject: [PATCH 8/8] Unify GEPA + OpenEvolve into single run.py entry point Consolidate manual_run.py, gepa_run.py, gepa_adapter.py, and providers.py into three clean modules: run.py (CLI), adapters.py (framework adapters), evaluator.py (shared eval bridge). Both frameworks share the same evaluator pipeline and prompt/response file-based LLM interface. Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 109 ++++--- src/mlirAgent/evolve/adapters.py | 424 ++++++++++++++++++------- src/mlirAgent/evolve/evaluator.py | 189 +++++++----- src/mlirAgent/evolve/gepa_adapter.py | 91 ------ src/mlirAgent/evolve/gepa_run.py | 248 --------------- src/mlirAgent/evolve/manual_run.py | 444 --------------------------- src/mlirAgent/evolve/providers.py | 46 --- src/mlirAgent/evolve/run.py | 232 ++++++++------ 8 files changed, 633 insertions(+), 1150 deletions(-) delete mode 100644 src/mlirAgent/evolve/gepa_adapter.py delete mode 100644 src/mlirAgent/evolve/gepa_run.py delete mode 100644 src/mlirAgent/evolve/manual_run.py delete mode 100644 src/mlirAgent/evolve/providers.py diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index 25ee3ae..3b1d0ac 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -83,40 +83,58 @@ export LLVM_SRC_PATH=/scratch/ashvin/llvm-project export EVOLVE_BUILD_DIR=/scratch/ashvin/llvm-build export EVOLVE_OPTUNA_TRIALS=5 # 0 to disable Optuna -# Launch (--wait mode: you respond to prompts manually or via Claude Code) -python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --wait +# GEPA (default) — manual mode: write prompt_NNN.response.md when prompted +python run.py --task llvm_inlining --max-evals 10 -# Or auto mode (built-in heuristic strategies respond automatically) -python -m mlirAgent.evolve.manual_run --example regalloc_priority -n 10 --auto +# GEPA — auto mode for smoke testing +python run.py --task llvm_inlining --max-evals 2 --auto + +# OpenEvolve — manual mode +python run.py --framework openevolve --task llvm_inlining --max-evals 10 + +# OpenEvolve — auto mode +python run.py --framework openevolve --task regalloc_priority -n 10 --auto + +# Override Optuna trials +python run.py --task llvm_inlining --max-evals 10 --optuna-trials 5 ``` This creates an experiment directory: ``` experiments/run_20260219_132604/ - scores.jsonl # One JSON line per iteration with all metrics + summary.json # Framework, task, output paths + best.cpp # Best evolved code prompts/ - prompt_001.md # OpenEvolve prompt (parent code + history) + prompt_001.md # LLM prompt (GEPA reflection or OpenEvolve parent code) prompt_001.response.md # LLM/agent response (new code) prompt_002.md ... 
- openevolve_output/ - checkpoints/checkpoint_N/ # Population state for --resume - best/best_program.cpp # Best evolved program - logs/openevolve_*.log # Detailed log + gepa_state/ # GEPA only: optimizer state (for resume) + openevolve_output/ # OpenEvolve only: + checkpoints/checkpoint_N/ # Population state for --resume + best/best_program.cpp # Best evolved program + logs/openevolve_*.log # Detailed log + scores.jsonl # OpenEvolve only: per-iteration metrics ``` ### What Happens Each Iteration ``` + run.py --framework {gepa, openevolve} + │ + ┌───────────┴───────────┐ + ▼ ▼ + ┌───────────────┐ ┌──────────────────────┐ + │ GEPAAdapter │ │ OpenEvolveAdapter │ + │ (Pareto │ │ (MAP-Elites │ + │ frontier) │ │ population) │ + └───────┬───────┘ └──────────┬───────────┘ + │ │ + └───────────┬────────────┘ + │ 1. Select/reflect on parent + ▼ ┌─────────────────────────────────┐ - │ OpenEvolve Controller │ - │ (population, MAP-Elites, etc.) │ - └────────────┬────────────────────┘ - │ 1. Sample parent program - │ from population - ▼ - ┌─────────────────────────────────┐ - │ ManualLLM Bridge │ + │ ManualLM / ManualLLM │ │ Write prompt_NNN.md to disk │ │ Poll for prompt_NNN.response.md │ └────────────┬────────────────────┘ @@ -124,7 +142,7 @@ experiments/run_20260219_132604/ │ writes response file ▼ ┌─────────────────────────────────┐ - │ Task Evaluator (evaluate.py)│ + │ evaluator.py → evaluate.py │ └────────────┬────────────────────┘ │ ┌──────────────────────┼──────────────────────┐ @@ -308,36 +326,31 @@ diagnostic context for proposing improvements. ### Usage -```bash -# Manual mode: prompts appear as prompt_NNN.md, you write prompt_NNN.response.md -python gepa_run.py --task llvm_inlining --max-evals 10 +Both frameworks are accessed through the unified `run.py`: -# Auto mode for smoke testing (auto-responds with trivially modified code) -python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond -``` +```bash +# GEPA — manual mode +python run.py --task llvm_inlining --max-evals 10 -### Configuration +# GEPA — auto mode for smoke testing +python run.py --task llvm_inlining --max-evals 2 --auto -| Flag | Default | Description | -|------|---------|-------------| -| `--task` | (required) | `llvm_inlining`, `loop_unrolling`, or `regalloc_priority` | -| `--max-evals` | 10 | Maximum evaluator calls (seed + proposals) | -| `--prompts-dir` | `gepa_prompts` | Directory for prompt/response files | -| `--output-dir` | `/run` | GEPA state directory (for resume) | -| `--auto-respond` | off | Spawn background thread that auto-creates responses | +# OpenEvolve +python run.py --framework openevolve --task llvm_inlining --max-evals 10 +``` ### GEPA vs OpenEvolve -| Feature | OpenEvolve | GEPA | -|---------|-----------|------| -| Population | MAP-Elites (50 candidates) | Pareto frontier | -| Feedback | Scalar score only → ASI via artifacts | Native side-info channel | -| LLM interface | ManualLLM (file-based) | ManualLM (file-based) | -| Hyperparameter tuning | Optuna inner-loop | Not integrated (future) | -| Resume | Checkpoint directory | `run_dir` state | +| Feature | GEPA (default) | OpenEvolve | +|---------|----------------|-----------| +| Population | Pareto frontier | MAP-Elites (10 candidates) | +| Feedback | Native side-info `(score, {"Feedback": ASI})` | ASI via `artifacts["asi"]` | +| LLM interface | ManualLM (`gepa_manual_lm.py`) | ManualLLM (`third_party/openevolve/`) | +| Hyperparameter tuning | Optuna inner-loop | Optuna inner-loop | +| Resume | `gepa_state/` directory | Checkpoint directory | 
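
To make the Feedback row above concrete, here is a minimal sketch of the two result shapes the adapters consume. Values are illustrative; the field names follow the `evaluator.py` bridge added later in this patch (`metrics["combined_score"]`, `artifacts["asi"]`, and GEPA's `(score, {"Feedback": ...})` tuple).

```python
# Sketch: the same ASI feedback as each framework consumes it.
asi_text = "## Per-benchmark feedback\nsqlite3: binary -0.8%, runtime +0.3% ..."

# OpenEvolve: the per-task evaluate() returns an EvaluationResult whose
# metrics carry the score and whose artifacts carry the ASI markdown.
openevolve_shape = {
    "metrics": {"combined_score": 1.7},
    "artifacts": {"asi": asi_text},
}

# GEPA: make_evaluator() repackages that into a (score, side_info) tuple;
# the "Feedback" entry is surfaced in the next reflection prompt.
gepa_shape = (
    openevolve_shape["metrics"]["combined_score"],
    {"Feedback": openevolve_shape["artifacts"]["asi"]},
)
```
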
-Both frameworks use our same evaluation pipeline (`llvm_bench.py`), so scores -are directly comparable. +Both frameworks share the same evaluation pipeline (`evaluator.py` → `llvm_bench.py`), +so scores are directly comparable. ## LLVM Hooks @@ -428,9 +441,9 @@ config = EvalConfig.from_env( ``` src/mlirAgent/evolve/ - manual_run.py # OpenEvolve orchestrator: --auto/--wait/--resume - gepa_run.py # GEPA orchestrator: --auto-respond - gepa_adapter.py # GEPA evaluator bridge (score, side_info) + run.py # Unified CLI: --framework {gepa,openevolve} + adapters.py # GEPAAdapter + OpenEvolveAdapter + evaluator.py # Framework-agnostic evaluator bridge gepa_manual_lm.py # File-based LLM for GEPA tasks/ llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna, ASI @@ -461,8 +474,10 @@ experiments/ # Output (gitignored) 1. Create `tasks/my_task/` with `initial.cpp` and `evaluate.py` 2. In `evaluate.py`, define `_score(total_binary, baseline_binary, speedups)` 3. Call shared functions from `llvm_bench.py` with the right evolved flags -4. Add entry to `EXAMPLES` dict in `manual_run.py` -5. If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in +4. Add task name to `_TASKS` and `_TASK_INITIAL` in `run.py` +5. Add task config to `_TASK_CONFIG` in `evaluator.py` +6. Add objective string to `_TASK_OBJECTIVE` in `adapters.py` +7. If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in `optuna_tune()` and pass flags via `evolved_llc_flags` ## Scoring Formulas diff --git a/src/mlirAgent/evolve/adapters.py b/src/mlirAgent/evolve/adapters.py index 0b17df0..666797b 100644 --- a/src/mlirAgent/evolve/adapters.py +++ b/src/mlirAgent/evolve/adapters.py @@ -1,144 +1,362 @@ -"""Framework adapters for evolutionary optimization. +"""Framework adapters for LLVM heuristic evolution. -Translates our unified config (task + agent + framework YAML) into the -specific format each evolution framework expects, then launches it. +Each adapter wraps a specific evolution framework (GEPA, OpenEvolve) with a +common interface so ``run.py`` can dispatch to either one. """ +import asyncio +import json import os +import re import sys -import tempfile +import threading +import time from abc import ABC, abstractmethod +from datetime import datetime from pathlib import Path -from typing import Dict, Any, Optional -import yaml +_BASE_DIR = Path(__file__).resolve().parent -from ..config import Config -from .providers import load_agent_config +# Ensure local packages are importable +if str(_BASE_DIR) not in sys.path: + sys.path.insert(0, str(_BASE_DIR)) +# Task → initial source file (relative to _BASE_DIR) +_TASK_INITIAL = { + "llvm_inlining": "tasks/llvm_inlining/initial.cpp", + "loop_unrolling": "tasks/loop_unrolling/initial.cpp", + "regalloc_priority": "tasks/regalloc_priority/initial.cpp", +} -class FrameworkAdapter(ABC): - """Abstract adapter that bridges our config to a specific evo framework.""" +# Task → GEPA objective string +_TASK_OBJECTIVE = { + "llvm_inlining": ( + "Maximize binary size reduction across CTMark benchmarks " + "by modifying the inlining cost heuristic." + ), + "loop_unrolling": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the loop unrolling heuristic." + ), + "regalloc_priority": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the register allocation priority function." 
+ ), +} - def __init__(self): - self.task = None - self.agent_config = None - self.framework_config = None +_GEPA_BACKGROUND = ( + "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " + "The function is compiled into the opt/llc tools and evaluated against CTMark " + "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " + "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " + "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " + "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " + "// [hyperparam]: flag-name, type, min, max comments for the autotuner." +) - def configure(self, task, agent_config: Dict[str, Any], framework_config: Dict[str, Any]): - """Store task, agent, and framework configs.""" - self.task = task - self.agent_config = agent_config - self.framework_config = framework_config - @abstractmethod - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: - """Start the evolution run. Returns result dict.""" - ... +class FrameworkAdapter(ABC): + """Common interface for evolution framework adapters.""" @abstractmethod - def get_results(self) -> Dict[str, Any]: - """Return results from the most recent run.""" + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval, output, exp_dir, **kwargs): + """Run the evolution loop. Returns a result dict.""" ... +# --------------------------------------------------------------------------- +# GEPA +# --------------------------------------------------------------------------- + +def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): + """Background thread that auto-creates response files for smoke testing.""" + seen = set() + prompt_re = re.compile(r"^prompt_(\d+)\.md$") + + while True: + try: + for fname in os.listdir(prompts_dir): + m = prompt_re.match(fname) + if not m: + continue + num = m.group(1) + response_name = f"prompt_{num}.response.md" + if response_name in seen: + continue + response_path = os.path.join(prompts_dir, response_name) + if os.path.exists(response_path): + seen.add(response_name) + continue + + modified = initial_code.replace( + "// EVOLVE-BLOCK-START", + f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", + ) + with open(response_path, "w") as f: + f.write(f"```cpp\n{modified}\n```\n") + seen.add(response_name) + print(f" [auto-respond] Created {response_name}") + except OSError: + pass + time.sleep(poll_interval) + + +class GEPAAdapter(FrameworkAdapter): + """Runs GEPA ``optimize_anything()`` with ManualLM and ASI feedback. + + GEPA's evaluator receives ``(score, {"Feedback": ASI_text})`` so it + can embed rich diagnostic feedback into its reflection prompts. + Optuna hyperparameter tuning runs inside each evaluation automatically + when ``[hyperparam]`` annotations are present in the C++ code. 
+ """ + + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval=2.0, output=None, exp_dir=None, + **kwargs): + from gepa.optimize_anything import ( + optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, + ) + from gepa_manual_lm import ManualLM + from evaluator import make_evaluator + + with open(initial_file) as f: + initial_code = f.read() + + lm = ManualLM(prompts_dir=prompts_dir, poll_interval=poll_interval) + evaluator = make_evaluator(task) + + run_dir = os.path.join(exp_dir, "gepa_state") if exp_dir else None + if run_dir: + os.makedirs(run_dir, exist_ok=True) + + if auto_respond: + t = threading.Thread( + target=_auto_respond_thread, + args=(prompts_dir, initial_code, poll_interval), + daemon=True, + ) + t.start() + print(" [auto-respond] Background responder started") + + config = GEPAConfig( + engine=EngineConfig( + max_metric_calls=max_evals, + parallel=False, + run_dir=run_dir, + use_cloudpickle=False, + raise_on_exception=False, + ), + reflection=ReflectionConfig( + reflection_lm=lm, + ), + ) + + result = optimize_anything( + seed_candidate=initial_code, + evaluator=evaluator, + objective=_TASK_OBJECTIVE[task], + background=_GEPA_BACKGROUND, + config=config, + ) + + # Save best code + output_path = output or os.path.join(exp_dir or ".", "best.cpp") + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write(result.best_candidate) + print(f"Best code saved to: {output_path}") + + summary = { + "framework": "gepa", + "task": task, + "num_candidates": result.num_candidates, + "total_metric_calls": result.total_metric_calls, + "output_path": output_path, + } + summary_path = os.path.join(exp_dir or prompts_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + print(f"Summary saved to: {summary_path}") + + return summary + + +# --------------------------------------------------------------------------- +# OpenEvolve +# --------------------------------------------------------------------------- + class OpenEvolveAdapter(FrameworkAdapter): - """Adapter for the OpenEvolve framework (third_party/openevolve).""" + """Runs OpenEvolve with ManualLLM (file-based prompt/response). + + OpenEvolve uses MAP-Elites population management. ASI feedback is + passed through ``EvaluationResult.artifacts["asi"]``. Optuna runs + inside each evaluation automatically. 
+ """ - def __init__(self): - super().__init__() - self._result = None + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval=2.0, output=None, exp_dir=None, + resume=None, **kwargs): + return asyncio.run(self._run_async( + task=task, initial_file=initial_file, prompts_dir=prompts_dir, + max_evals=max_evals, auto_respond=auto_respond, + output=output, exp_dir=exp_dir, resume=resume, + )) - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: + async def _run_async(self, *, task, initial_file, prompts_dir, + max_evals, auto_respond, output, exp_dir, resume): # Ensure openevolve is importable - oe_path = Config.OPENEVOLVE_PATH + mlirevolve_root = _BASE_DIR.parent.parent.parent + oe_path = str(mlirevolve_root / "third_party" / "openevolve") if oe_path not in sys.path: sys.path.insert(0, oe_path) from openevolve.config import Config as OEConfig, LLMModelConfig + from openevolve.controller import OpenEvolve + from openevolve.llm.manual import create_manual_llm - # Build OpenEvolve config from our YAML configs - oe_cfg = OEConfig() - - # Framework settings - fw = self.framework_config or {} - oe_cfg.max_iterations = max_iterations or fw.get("max_iterations", 100) - oe_cfg.database.num_islands = fw.get("islands", 4) - oe_cfg.database.population_size = fw.get("population_size", 50) - oe_cfg.database.migration_interval = fw.get("migration_interval", 10) - if fw.get("random_seed") is not None: - oe_cfg.random_seed = fw["random_seed"] - - # File suffix for C++ evolution - oe_cfg.language = "cpp" - oe_cfg.file_suffix = ".cpp" - - # LLM settings from agent config - agent = self.agent_config or {} - model = LLMModelConfig( - name=agent.get("model", "claude-opus-4-6"), - api_base=agent.get("api_base", "https://api.anthropic.com/v1"), - api_key=agent.get("api_key", ""), - temperature=agent.get("temperature", 0.7), - max_tokens=agent.get("max_tokens", 4096), + os.environ["MANUAL_LLM_PROMPTS_DIR"] = prompts_dir + + # Build config + configs_dir = str(mlirevolve_root / "configs") + fw_yaml = os.path.join(configs_dir, "frameworks", "manual.yaml") + cfg = OEConfig.from_yaml(fw_yaml) if os.path.exists(fw_yaml) else OEConfig() + + cfg.max_iterations = max_evals + cfg.file_suffix = ".cpp" + cfg.language = "cpp" + + manual_model = LLMModelConfig( + name="manual", + init_client=create_manual_llm, + weight=1.0, ) - oe_cfg.llm.models = [model] - oe_cfg.llm.evaluator_models = [model] - - # Paths - initial_program = str(self.task.get_initial_program()) - evaluator = str(self.task.get_evaluator()) - - if dry_run: - return { - "dry_run": True, - "initial_program": initial_program, - "evaluator": evaluator, - "config": { - "max_iterations": oe_cfg.max_iterations, - "population_size": oe_cfg.database.population_size, - "islands": oe_cfg.database.num_islands, - "model": agent.get("model"), - "language": oe_cfg.language, - }, - } + cfg.llm.models = [manual_model] + cfg.llm.evaluator_models = [manual_model] - # Launch via OpenEvolve API - from openevolve.api import run_evolution + cfg.database.population_size = 10 + cfg.database.archive_size = 10 + cfg.database.num_islands = 1 + cfg.database.migration_interval = 999 + cfg.checkpoint_interval = 1 + cfg.diff_based_evolution = False - result = run_evolution( - initial_program=initial_program, - evaluator=evaluator, - config=oe_cfg, - iterations=oe_cfg.max_iterations, - cleanup=False, + # Evaluator path (the task's evaluate.py) + evaluator_path = str(_BASE_DIR / "tasks" / task / "evaluate.py") + + 
oe_output_dir = os.path.join(exp_dir, "openevolve_output") + scores_path = os.path.join(exp_dir, "scores.jsonl") + os.makedirs(oe_output_dir, exist_ok=True) + + openevolve = OpenEvolve( + initial_program_path=initial_file, + evaluation_file=evaluator_path, + config=cfg, + output_dir=oe_output_dir, ) - self._result = { - "best_score": result.best_score, - "best_code": result.best_code, - "metrics": result.metrics, - "output_dir": result.output_dir, + + if resume: + if os.path.exists(resume): + print(f"Resuming from checkpoint: {resume}") + openevolve.database.load(resume) + else: + print(f"Warning: Checkpoint not found: {resume}") + + # Auto-respond thread + stop_event = asyncio.Event() + responder_task = None + if auto_respond: + loop = asyncio.get_event_loop() + responder_task = loop.run_in_executor( + None, _oe_auto_respond, prompts_dir, stop_event, + ) + + # Score logging hook + _original_add = openevolve.database.add + + def _logging_add(program, *a, **kw): + result = _original_add(program, *a, **kw) + entry = { + "timestamp": time.time(), + "iteration": program.iteration_found, + "program_id": program.id, + "metrics": program.metrics, + } + best = openevolve.database.get_best_program() + if best: + entry["best_score"] = best.metrics.get("combined_score", 0) + with open(scores_path, "a") as f: + f.write(json.dumps(entry, default=str) + "\n") + return result + + openevolve.database.add = _logging_add + + try: + print(f"Starting OpenEvolve ({cfg.max_iterations} iterations)...") + best = await openevolve.run( + iterations=cfg.max_iterations, + checkpoint_path=resume, + ) + if best: + print(f"\nBest metrics:") + for k, v in best.metrics.items(): + if isinstance(v, float): + print(f" {k}: {v:.4f}") + else: + print(f" {k}: {v}") + + # Save best code + if output and hasattr(best, "code"): + os.makedirs(os.path.dirname(output), exist_ok=True) + with open(output, "w") as f: + f.write(best.code) + print(f"Best code saved to: {output}") + finally: + stop_event.set() + if responder_task: + await asyncio.sleep(2) + + summary = { + "framework": "openevolve", + "task": task, + "output_dir": oe_output_dir, } - return self._result + summary_path = os.path.join(exp_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + + return summary - def get_results(self) -> Dict[str, Any]: - return self._result or {} +def _oe_auto_respond(prompts_dir, stop_event): + """Auto-responder for OpenEvolve (trivial code modification).""" + import glob -class ShinkaAdapter(FrameworkAdapter): - """Adapter for ShinkaEvolve (stub — not yet implemented).""" + responded = set() + pattern = re.compile(r"prompt_\d+\.md$") - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: - raise NotImplementedError( - "ShinkaEvolve adapter is not yet implemented. " - "Use --framework openevolve for now." 
+ while not stop_event.is_set(): + prompt_files = sorted( + f for f in glob.glob(os.path.join(prompts_dir, "prompt_*.md")) + if pattern.search(f) ) + for pf in prompt_files: + if pf in responded: + continue + resp_path = pf.replace(".md", ".response.md") + if os.path.exists(resp_path): + responded.add(pf) + continue - def get_results(self) -> Dict[str, Any]: - raise NotImplementedError("ShinkaEvolve adapter is not yet implemented.") + with open(pf) as f: + prompt_text = f.read() + # Trivial modification: add a comment to the code + num = Path(pf).stem.split("_")[-1] + response = f"// Auto-response iteration {num}\n{prompt_text[:500]}" + with open(resp_path, "w") as f: + f.write(response) + responded.add(pf) + print(f" [auto] Responded to {os.path.basename(pf)}") -ADAPTERS = { - "openevolve": OpenEvolveAdapter, - "shinkaevolve": ShinkaAdapter, -} + time.sleep(1) diff --git a/src/mlirAgent/evolve/evaluator.py b/src/mlirAgent/evolve/evaluator.py index f990917..a40e2bd 100644 --- a/src/mlirAgent/evolve/evaluator.py +++ b/src/mlirAgent/evolve/evaluator.py @@ -1,72 +1,121 @@ +"""Evaluator bridges for LLVM heuristic evolution. + +Creates framework-agnostic evaluator callables that: +1. Write candidate C++ to a temp file +2. Call the task-specific evaluate() (patch LLVM, build, benchmark) +3. Return (score, side_info) for GEPA or EvaluationResult for OpenEvolve + +The actual compilation/benchmark logic lives in ``tasks/llvm_bench.py`` +and per-task ``evaluate.py`` files. +""" + import os import re -import subprocess -import optuna -from typing import Dict, Any -from openevolve.evaluation_result import EvaluationResult - -class MagellanEvaluator: - def __init__(self, llvm_build_dir, benchmark_script): - self.build_dir = llvm_build_dir - self.benchmark_script = benchmark_script - - def evaluate(self, code: str) -> EvaluationResult: - # 1. Inject Code into LLVM Source - self._inject_code(code) - - # 2. Compile LLVM (Incremental) - # We only rebuild the relevant library to save time - build_cmd = ["ninja", "-C", self.build_dir, "lib/Analysis/AEInlineAdvisor.o"] - if subprocess.run(build_cmd).returncode != 0: - return EvaluationResult(score=float('-inf'), error="Compilation Failed") - - # Link the final tool (e.g., opt or clang) - subprocess.run(["ninja", "-C", self.build_dir, "bin/opt"]) - - # 3. 
Inner Loop: Hyperparameter Tuning (The Magellan "Secret Sauce") - # Extract params defined in the C++ comments - params = self._extract_hyperparams(code) - - if not params: - # No params to tune, just run once - score = self._run_benchmark({}) - return EvaluationResult(score=score) - - # Use Optuna to tune the exposed flags - study = optuna.create_study(direction="maximize") - study.optimize(lambda trial: self._objective(trial, params), n_trials=20) - - best_score = study.best_value - best_params = study.best_params - - return EvaluationResult( - score=best_score, - metadata={"tuned_params": best_params} - ) - - def _objective(self, trial, params_schema): - # Map trial suggestions to LLVM flags - # e.g., -ae-inline-base-threshold=255 - flags = [] - for name, type_, min_v, max_v in params_schema: - val = trial.suggest_int(name, int(min_v), int(max_v)) - flags.append(f"-{name}={val}") - - return self._run_benchmark(flags) - - def _run_benchmark(self, flags): - # Execute the benchmark script with the tuned flags - cmd = [self.benchmark_script] + flags - result = subprocess.run(cmd, capture_output=True, text=True) - # Parse output for binary size reduction or execution speed - return self._parse_score(result.stdout) - - def _extract_hyperparams(self, code): - # Regex to find lines like: // [hyperparam]: name, type, min, max - pattern = r"//\s*\[hyperparam\]:\s*([\w-]+),\s*(\w+),\s*(\d+),\s*(\d+)" - return re.findall(pattern, code) - - def _inject_code(self, code): - target_path = "llvm-project/llvm/lib/Analysis/AEInlineAdvisor.cpp" - with open(target_path, "w") as f: - f.write(code) \ No newline at end of file +import sys +import tempfile +from pathlib import Path + +# Ensure tasks package is importable when run standalone +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from tasks.llvm_bench import EvalConfig + +_EVOLVE_BLOCK_RE = re.compile( + r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", + re.DOTALL, +) + + +def extract_evolve_block(code): + """Extract the EVOLVE-BLOCK content from C++ source code.""" + m = _EVOLVE_BLOCK_RE.search(code) + if m: + return m.group(2) + return code + + +def inject_evolve_block(template, block): + """Replace EVOLVE-BLOCK in *template* with new *block* content.""" + return _EVOLVE_BLOCK_RE.sub( + lambda m: m.group(1) + block + m.group(3), + template, + ) + + +# Task → (target_file, default baseline overrides) +_TASK_CONFIG = { + "llvm_inlining": { + "target_file": "llvm/lib/Analysis/EvolvedInlineCost.cpp", + }, + "loop_unrolling": { + "target_file": "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + "baseline_file": str( + Path(__file__).resolve().parent + / "tasks" / "loop_unrolling" / "baseline_unroll.json" + ), + }, + "regalloc_priority": { + "target_file": "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp", + "baseline_file": str( + Path(__file__).resolve().parent + / "tasks" / "regalloc_priority" / "baseline_regalloc.json" + ), + }, +} + + +def _import_evaluate(task_name): + """Import the task-specific evaluate function.""" + if task_name == "llvm_inlining": + from tasks.llvm_inlining.evaluate import evaluate + elif task_name == "loop_unrolling": + from tasks.loop_unrolling.evaluate import evaluate + elif task_name == "regalloc_priority": + from tasks.regalloc_priority.evaluate import evaluate + else: + raise ValueError(f"Unknown task: {task_name}") + return evaluate + + +def make_evaluator(task_name, config=None): + """Create an evaluator function for a given task. 
+ + Returns a callable ``code_str -> (score, side_info)`` matching GEPA's + evaluator protocol. *side_info* is a dict that may contain a + ``"Feedback"`` key with ASI markdown text. + + The same evaluator works with OpenEvolve — the score is extracted from + the returned tuple's first element. + """ + evaluate = _import_evaluate(task_name) + + if config is None: + tc = _TASK_CONFIG[task_name] + config = EvalConfig.from_env(tc["target_file"], **{ + k: v for k, v in tc.items() if k != "target_file" + }) + + def evaluator(code_str): + """Write code to temp file, evaluate, return (score, side_info).""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".cpp", delete=False, prefix="evolve_" + ) as f: + f.write(code_str) + tmp_path = f.name + try: + result = evaluate(tmp_path, config=config) + if isinstance(result, dict): + score = result.get("combined_score", 0.0) + side_info = {} + else: + # EvaluationResult from OpenEvolve + score = result.metrics.get("combined_score", 0.0) + if hasattr(result, "artifacts") and "asi" in result.artifacts: + side_info = {"Feedback": result.artifacts["asi"]} + else: + side_info = {} + return score, side_info + finally: + os.unlink(tmp_path) + + return evaluator diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py deleted file mode 100644 index 2e9082b..0000000 --- a/src/mlirAgent/evolve/gepa_adapter.py +++ /dev/null @@ -1,91 +0,0 @@ -"""GEPA adapter for LLVM heuristic evolution. - -Bridges GEPA's ``optimize_anything`` API with our LLVM benchmark evaluator. -Handles EVOLVE-BLOCK extraction, code injection, and score retrieval. -""" - -import os -import re -import sys -import tempfile -from pathlib import Path - -# Ensure tasks package is importable -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -from tasks.llvm_bench import EvalConfig - -_EVOLVE_BLOCK_RE = re.compile( - r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", - re.DOTALL, -) - - -def extract_evolve_block(code): - """Extract the EVOLVE-BLOCK content from C++ source code.""" - m = _EVOLVE_BLOCK_RE.search(code) - if m: - return m.group(2) - return code - - -def inject_evolve_block(template, block): - """Replace EVOLVE-BLOCK in *template* with new *block* content.""" - return _EVOLVE_BLOCK_RE.sub( - lambda m: m.group(1) + block + m.group(3), - template, - ) - - -def make_evaluator(task_name, config=None): - """Create an evaluator function for GEPA. - - Returns a callable ``code_str -> (score, side_info)`` matching GEPA's - evaluator protocol. *side_info* is a dict that may contain a - ``"Feedback"`` key with ASI markdown text. 
- """ - if task_name == "llvm_inlining": - from tasks.llvm_inlining.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/Analysis/EvolvedInlineCost.cpp" - ) - elif task_name == "loop_unrolling": - from tasks.loop_unrolling.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp" - ) - elif task_name == "regalloc_priority": - from tasks.regalloc_priority.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp" - ) - else: - raise ValueError(f"Unknown task: {task_name}") - - def evaluator(code_str): - """Write code to temp file, evaluate, return (score, side_info).""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".cpp", delete=False, prefix="gepa_" - ) as f: - f.write(code_str) - tmp_path = f.name - try: - result = evaluate(tmp_path, config=config) - if isinstance(result, dict): - score = result.get("combined_score", 0.0) - side_info = {} - else: - # EvaluationResult from OpenEvolve - score = result.metrics.get("combined_score", 0.0) - if hasattr(result, "artifacts") and "asi" in result.artifacts: - side_info = {"Feedback": result.artifacts["asi"]} - else: - side_info = {} - return score, side_info - finally: - os.unlink(tmp_path) - - return evaluator diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py deleted file mode 100644 index e2f10f9..0000000 --- a/src/mlirAgent/evolve/gepa_run.py +++ /dev/null @@ -1,248 +0,0 @@ -"""CLI runner for GEPA on LLVM evolution tasks. - -Usage:: - - python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] - python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond - -Requires ``pip install gepa`` and environment variables: - - LLVM_SRC_PATH: path to LLVM source tree - - EVOLVE_BUILD_DIR: path to LLVM build directory -""" - -import argparse -import json -import os -import re -import sys -import threading -import time -from pathlib import Path - -# Ensure local packages are importable -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -# Task → initial source file mapping -_TASK_INITIAL = { - "llvm_inlining": "tasks/llvm_inlining/initial.cpp", - "loop_unrolling": "tasks/loop_unrolling/initial.cpp", - "regalloc_priority": "tasks/regalloc_priority/initial.cpp", -} - -# Task → objective string for GEPA -_TASK_OBJECTIVE = { - "llvm_inlining": ( - "Maximize binary size reduction across CTMark benchmarks " - "by modifying the inlining cost heuristic." - ), - "loop_unrolling": ( - "Maximize runtime speedup across CTMark benchmarks " - "by modifying the loop unrolling heuristic." - ), - "regalloc_priority": ( - "Maximize runtime speedup across CTMark benchmarks " - "by modifying the register allocation priority function." - ), -} - -_TASK_BACKGROUND = ( - "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " - "The function is compiled into the opt/llc tools and evaluated against CTMark " - "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " - "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " - "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " - "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " - "// [hyperparam]: flag-name, type, min, max comments for the autotuner." 
-) - - -def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): - """Background thread that auto-creates response files for smoke testing. - - Watches for new prompt_NNN.md files and creates prompt_NNN.response.md - with a trivially modified version of the initial code. - """ - seen = set() - prompt_re = re.compile(r"^prompt_(\d+)\.md$") - - while True: - try: - for fname in os.listdir(prompts_dir): - m = prompt_re.match(fname) - if not m: - continue - num = m.group(1) - response_name = f"prompt_{num}.response.md" - if response_name in seen: - continue - response_path = os.path.join(prompts_dir, response_name) - if os.path.exists(response_path): - seen.add(response_name) - continue - - # Create a trivially modified version of the code - modified = initial_code.replace( - "// EVOLVE-BLOCK-START", - f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", - ) - with open(response_path, "w") as f: - f.write(f"```cpp\n{modified}\n```\n") - seen.add(response_name) - print(f" [auto-respond] Created {response_name}") - except OSError: - pass - time.sleep(poll_interval) - - -def main(): - parser = argparse.ArgumentParser( - description="Run GEPA on LLVM heuristic evolution tasks" - ) - parser.add_argument( - "--task", required=True, - choices=list(_TASK_INITIAL.keys()), - help="Task to optimize", - ) - parser.add_argument( - "--initial", default=None, - help="Path to initial C++ source (overrides default)", - ) - parser.add_argument( - "--prompts-dir", default="gepa_prompts", - help="Directory for prompt/response files (default: gepa_prompts)", - ) - parser.add_argument( - "--poll-interval", type=float, default=2.0, - help="Poll interval for response files in seconds (default: 2.0)", - ) - parser.add_argument( - "--max-evals", type=int, default=10, - help="Maximum evaluator calls (default: 10)", - ) - parser.add_argument( - "--output-dir", default=None, - help="GEPA run directory for state/resume (default: /run)", - ) - parser.add_argument( - "--output", default=None, - help="Path to save best code (default: tasks//gepa_best.cpp)", - ) - parser.add_argument( - "--auto-respond", action="store_true", - help="Auto-create response files for smoke testing", - ) - args = parser.parse_args() - - # Import GEPA - try: - from gepa.optimize_anything import ( - optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, - ) - except ImportError: - print("Error: gepa not installed. 
Run: pip install gepa") - sys.exit(1) - - from gepa_manual_lm import ManualLM - from gepa_adapter import make_evaluator - - # Find initial program - base_dir = Path(__file__).resolve().parent - if args.initial: - initial_file = Path(args.initial) - else: - initial_file = base_dir / _TASK_INITIAL[args.task] - - if not initial_file.exists(): - print(f"Error: initial source not found at {initial_file}") - sys.exit(1) - - with open(initial_file) as f: - initial_code = f.read() - - # Create LM and evaluator - lm = ManualLM( - prompts_dir=args.prompts_dir, - poll_interval=args.poll_interval, - ) - evaluator = make_evaluator(args.task) - - run_dir = args.output_dir or os.path.join(args.prompts_dir, "run") - os.makedirs(run_dir, exist_ok=True) - - print(f"{'=' * 60}") - print(f"GEPA Runner") - print(f" Task: {args.task}") - print(f" Initial code: {initial_file}") - print(f" Prompts dir: {args.prompts_dir}") - print(f" Max evals: {args.max_evals}") - print(f" Run dir: {run_dir}") - print(f" Auto-respond: {args.auto_respond}") - print(f"{'=' * 60}") - print() - - # Start auto-responder thread if requested - if args.auto_respond: - t = threading.Thread( - target=_auto_respond_thread, - args=(args.prompts_dir, initial_code, args.poll_interval), - daemon=True, - ) - t.start() - print(" [auto-respond] Background responder started") - - # Run GEPA with real API - objective = _TASK_OBJECTIVE[args.task] - config = GEPAConfig( - engine=EngineConfig( - max_metric_calls=args.max_evals, - parallel=False, - run_dir=run_dir, - use_cloudpickle=False, - raise_on_exception=False, - ), - reflection=ReflectionConfig( - reflection_lm=lm, - ), - ) - - result = optimize_anything( - seed_candidate=initial_code, - evaluator=evaluator, - objective=objective, - background=_TASK_BACKGROUND, - config=config, - ) - - print() - print(f"{'=' * 60}") - print(f"GEPA Results:") - print(f" Best candidate: {len(result.best_candidate)} chars") - print(f" Num candidates: {result.num_candidates}") - print(f" Total evals: {result.total_metric_calls}") - print(f"{'=' * 60}") - - # Save best code - output_path = args.output or str( - base_dir / "tasks" / args.task / "gepa_best.cpp" - ) - os.makedirs(os.path.dirname(output_path), exist_ok=True) - with open(output_path, "w") as f: - f.write(result.best_candidate) - print(f"Best code saved to: {output_path}") - - # Save summary - summary = { - "task": args.task, - "num_candidates": result.num_candidates, - "total_metric_calls": result.total_metric_calls, - "output_path": output_path, - "run_dir": run_dir, - } - summary_path = os.path.join(args.prompts_dir, "summary.json") - with open(summary_path, "w") as f: - json.dump(summary, f, indent=2) - print(f"Summary saved to: {summary_path}") - - -if __name__ == "__main__": - main() diff --git a/src/mlirAgent/evolve/manual_run.py b/src/mlirAgent/evolve/manual_run.py deleted file mode 100644 index 59cb285..0000000 --- a/src/mlirAgent/evolve/manual_run.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Orchestrator for running OpenEvolve with ManualLLM + Claude Code as the responder. 
- -Usage: - # Auto mode: Claude Code sub-agent responds to each prompt - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --auto - - # Wait mode: user manually creates response files - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --wait - - # Resume from checkpoint - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --auto \ - --resume experiments/run_YYYYMMDD_HHMMSS/openevolve_output/checkpoints/checkpoint_5 -""" - -import argparse -import asyncio -import json -import os -import sys -import time -from datetime import datetime -from pathlib import Path - -# Ensure openevolve is importable -_MLIREVOLVE_ROOT = Path(__file__).resolve().parent.parent.parent.parent -_OE_PATH = str(_MLIREVOLVE_ROOT / "third_party" / "openevolve") -if _OE_PATH not in sys.path: - sys.path.insert(0, _OE_PATH) - -from openevolve.config import Config as OEConfig, LLMModelConfig, load_config -from openevolve.controller import OpenEvolve -from openevolve.llm.manual import create_manual_llm - - -EXAMPLES = { - "function_minimization": { - "initial_program": _OE_PATH + "/examples/function_minimization/initial_program.py", - "evaluator": _OE_PATH + "/examples/function_minimization/evaluator.py", - "file_suffix": ".py", - "language": "python", - }, - "llvm_inlining": { - "initial_program": str(Path(__file__).parent / "tasks/llvm_inlining/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/llvm_inlining/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, - "regalloc_priority": { - "initial_program": str(Path(__file__).parent / "tasks/regalloc_priority/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/regalloc_priority/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, - "loop_unrolling": { - "initial_program": str(Path(__file__).parent / "tasks/loop_unrolling/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/loop_unrolling/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, -} - - -def _build_config(args, prompts_dir: str) -> OEConfig: - """Build OpenEvolve config with ManualLLM injected.""" - # Set env var so ManualLLM instances (including in worker processes) find the prompts dir - os.environ["MANUAL_LLM_PROMPTS_DIR"] = prompts_dir - - # Load framework YAML as base - configs_dir = str(_MLIREVOLVE_ROOT / "configs") - fw_yaml = os.path.join(configs_dir, "frameworks", "manual.yaml") - if os.path.exists(fw_yaml): - cfg = OEConfig.from_yaml(fw_yaml) - else: - cfg = OEConfig() - - # Override iterations - if args.iterations: - cfg.max_iterations = args.iterations - - # Set file suffix / language from example - if args.example and args.example in EXAMPLES: - ex = EXAMPLES[args.example] - cfg.file_suffix = ex["file_suffix"] - cfg.language = ex["language"] - - # Inject ManualLLM via init_client (module-level function, picklable) - manual_model = LLMModelConfig( - name="manual", - init_client=create_manual_llm, - weight=1.0, - ) - - cfg.llm.models = [manual_model] - cfg.llm.evaluator_models = [manual_model] - - # Small population for manual speed - cfg.database.population_size = 10 - cfg.database.archive_size = 10 - cfg.database.num_islands = 1 - cfg.database.migration_interval = 999 - cfg.checkpoint_interval = 1 - cfg.diff_based_evolution = False - - return cfg - - -def _auto_respond(prompts_dir: str, stop_event: asyncio.Event): - """Watch prompts_dir for new prompt files and auto-respond using a simple heuristic improver.""" - import 
glob - import re - - responded = set() - # Match only prompt_NNN.md (not .response.md files) - pattern = re.compile(r"prompt_\d+\.md$") - while not stop_event.is_set(): - prompt_files = sorted( - f for f in glob.glob(os.path.join(prompts_dir, "prompt_*.md")) - if pattern.search(f) - ) - for pf in prompt_files: - if pf in responded: - continue - resp_path = pf.replace(".md", ".response.md") - if os.path.exists(resp_path): - responded.add(pf) - continue - - # Read the prompt - with open(pf) as f: - prompt_text = f.read() - - # Generate a response: extract parent code and propose improvement - response = _generate_improvement(prompt_text) - with open(resp_path, "w") as f: - f.write(response) - responded.add(pf) - print(f" [auto] Responded to {os.path.basename(pf)}") - - time.sleep(1) - - -def _generate_improvement(prompt_text: str) -> str: - """Generate an improved version of the code from the prompt. - - Produces a diff-style response with a concrete improvement to the search algorithm. - """ - import random - - # Choose a random improvement strategy - strategies = [ - _strategy_simulated_annealing, - _strategy_adaptive_step, - _strategy_multi_restart, - _strategy_gradient_estimate, - ] - strategy = random.choice(strategies) - return strategy() - - -def _strategy_simulated_annealing() -> str: - return '''Here's an improved search algorithm using simulated annealing: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - temperature = 2.0 - cooling_rate = 0.995 - step_size = 1.0 - for i in range(iterations): - # Simulated annealing with adaptive step size - x = best_x + np.random.normal(0, step_size) - y = best_y + np.random.normal(0, step_size) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - delta = value - best_value - if delta < 0 or np.random.random() < np.exp(-delta / max(temperature, 1e-10)): - best_value = value - best_x, best_y = x, y - - temperature *= cooling_rate - step_size = max(0.01, step_size * 0.999) ->>>>>>> REPLACE -''' - - -def _strategy_adaptive_step() -> str: - return '''Here's an improved search algorithm with adaptive step sizes and local refinement: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - step = (bounds[1] - bounds[0]) / 4.0 - no_improve = 0 - for i in range(iterations): - if no_improve > 50: - # Random restart - best_x = np.random.uniform(bounds[0], bounds[1]) - best_y = np.random.uniform(bounds[0], bounds[1]) - best_value = evaluate_function(best_x, best_y) - step = (bounds[1] - bounds[0]) / 4.0 - no_improve = 0 - - x = best_x + np.random.uniform(-step, step) - y = best_y + np.random.uniform(-step, step) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y - no_improve = 0 - else: - no_improve += 1 - if no_improve % 20 == 0: - step *= 0.8 ->>>>>>> REPLACE -''' - - -def _strategy_multi_restart() -> str: - return '''Here's an improved search algorithm with multiple restarts and basin hopping: - -<<<<<<< SEARCH - 
for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - num_restarts = 5 - iters_per_restart = iterations // num_restarts - for restart in range(num_restarts): - # Random restart point - cx = np.random.uniform(bounds[0], bounds[1]) - cy = np.random.uniform(bounds[0], bounds[1]) - cv = evaluate_function(cx, cy) - step = 1.0 - - for i in range(iters_per_restart): - x = cx + np.random.normal(0, step) - y = cy + np.random.normal(0, step) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < cv: - cv = value - cx, cy = x, y - step = max(0.01, step * 0.998) - - if cv < best_value: - best_value = cv - best_x, best_y = cx, cy ->>>>>>> REPLACE -''' - - -def _strategy_gradient_estimate() -> str: - return '''Here's an improved search algorithm using numerical gradient estimation: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - lr = 0.1 - eps = 1e-4 - for i in range(iterations): - # Estimate gradient via finite differences - fx = evaluate_function(best_x + eps, best_y) - fy = evaluate_function(best_x, best_y + eps) - f0 = evaluate_function(best_x, best_y) - gx = (fx - f0) / eps - gy = (fy - f0) / eps - - # Gradient descent step with noise for exploration - noise_scale = max(0.01, 0.5 * (1 - i / iterations)) - nx = best_x - lr * gx + np.random.normal(0, noise_scale) - ny = best_y - lr * gy + np.random.normal(0, noise_scale) - nx = np.clip(nx, bounds[0], bounds[1]) - ny = np.clip(ny, bounds[0], bounds[1]) - nv = evaluate_function(nx, ny) - - if nv < best_value: - best_value = nv - best_x, best_y = nx, ny - - lr = max(0.001, lr * 0.999) ->>>>>>> REPLACE -''' - - -async def _run(args): - """Main async entry point.""" - # Set up experiment directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - exp_dir = os.path.join(_MLIREVOLVE_ROOT, "experiments", f"run_{timestamp}") - prompts_dir = os.path.join(exp_dir, "prompts") - oe_output_dir = os.path.join(exp_dir, "openevolve_output") - scores_path = os.path.join(exp_dir, "scores.jsonl") - os.makedirs(prompts_dir, exist_ok=True) - os.makedirs(oe_output_dir, exist_ok=True) - - print(f"Experiment directory: {exp_dir}") - print(f"Prompts directory: {prompts_dir}") - print(f"Scores log: {scores_path}") - - # Build config - cfg = _build_config(args, prompts_dir) - - # Resolve example paths - if args.example: - if args.example not in EXAMPLES: - print(f"Unknown example: {args.example}. 
Available: {list(EXAMPLES.keys())}") - return 1 - ex = EXAMPLES[args.example] - initial_program = ex["initial_program"] - evaluator = ex["evaluator"] - else: - print("Error: --example is required for now") - return 1 - - # Initialize OpenEvolve - openevolve = OpenEvolve( - initial_program_path=initial_program, - evaluation_file=evaluator, - config=cfg, - output_dir=oe_output_dir, - ) - - # Load checkpoint if resuming - if args.resume: - if not os.path.exists(args.resume): - print(f"Error: Checkpoint not found: {args.resume}") - return 1 - print(f"Resuming from checkpoint: {args.resume}") - openevolve.database.load(args.resume) - - # Start auto-responder if --auto - stop_event = asyncio.Event() - responder_task = None - if args.auto: - print("Auto mode: built-in heuristic strategies will respond to prompts") - loop = asyncio.get_event_loop() - responder_task = loop.run_in_executor(None, _auto_respond, prompts_dir, stop_event) - - # Hook into the database to log scores - _original_add = openevolve.database.add - - def _logging_add(program, *a, **kw): - result = _original_add(program, *a, **kw) - score_entry = { - "timestamp": time.time(), - "iteration": program.iteration_found, - "program_id": program.id, - "metrics": program.metrics, - "generation": program.generation, - } - best = openevolve.database.get_best_program() - if best: - score_entry["best_score"] = best.metrics.get("combined_score", 0) - score_entry["best_id"] = best.id - with open(scores_path, "a") as f: - f.write(json.dumps(score_entry, default=str) + "\n") - return result - - openevolve.database.add = _logging_add - - # Run evolution - try: - print(f"\nStarting OpenEvolve with ManualLLM ({cfg.max_iterations} iterations)...") - best = await openevolve.run( - iterations=cfg.max_iterations, - checkpoint_path=args.resume, - ) - if best: - print(f"\nEvolution complete! Best metrics:") - for k, v in best.metrics.items(): - if isinstance(v, float): - print(f" {k}: {v:.4f}") - else: - print(f" {k}: {v}") - else: - print("\nNo valid programs found.") - finally: - stop_event.set() - if responder_task: - # Give responder time to notice the stop event - await asyncio.sleep(2) - - return 0 - - -def main(): - parser = argparse.ArgumentParser(description="Run OpenEvolve with ManualLLM") - parser.add_argument("--example", "-e", choices=list(EXAMPLES.keys()), - help="Built-in example to run") - parser.add_argument("--iterations", "-n", type=int, default=10, - help="Number of iterations (default: 10)") - parser.add_argument("--auto", action="store_true", - help="Auto-respond with built-in heuristic strategies") - parser.add_argument("--wait", action="store_true", - help="Wait for manual response files (human or external tool)") - parser.add_argument("--resume", help="Path to checkpoint directory to resume from") - - args = parser.parse_args() - - if not args.auto and not args.wait: - args.auto = True # Default to auto mode - - return asyncio.run(_run(args)) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/mlirAgent/evolve/providers.py b/src/mlirAgent/evolve/providers.py deleted file mode 100644 index 6e8b8de..0000000 --- a/src/mlirAgent/evolve/providers.py +++ /dev/null @@ -1,46 +0,0 @@ -"""LLM provider configuration loader. - -Reads YAML agent configs from configs/agents/ and returns dicts -consumable by framework adapters (OpenEvolve, ShinkaEvolve). 
-""" - -import os -from pathlib import Path -from typing import Dict, Any, Optional - -import yaml - -from ..config import Config - - -def load_agent_config(agent_name: str, configs_dir: Optional[str] = None) -> Dict[str, Any]: - """Load an agent YAML config by name. - - Args: - agent_name: Name without extension, e.g. "claude_opus". - configs_dir: Override for configs directory. - - Returns: - Dict with keys: api_base, model, api_key_env, temperature, max_tokens. - The api_key is resolved from the environment variable named in api_key_env. - """ - base = Path(configs_dir or Config.EVOLVE_CONFIGS_DIR) / "agents" - path = base / f"{agent_name}.yaml" - if not path.exists(): - raise FileNotFoundError(f"Agent config not found: {path}") - - with open(path) as f: - cfg = yaml.safe_load(f) - - # Resolve API key from environment - key_env = cfg.get("api_key_env", "") - cfg["api_key"] = os.environ.get(key_env, "") - return cfg - - -def list_agents(configs_dir: Optional[str] = None) -> list: - """List available agent config names.""" - base = Path(configs_dir or Config.EVOLVE_CONFIGS_DIR) / "agents" - if not base.exists(): - return [] - return sorted(p.stem for p in base.glob("*.yaml")) diff --git a/src/mlirAgent/evolve/run.py b/src/mlirAgent/evolve/run.py index 7e2cbbe..1826f41 100644 --- a/src/mlirAgent/evolve/run.py +++ b/src/mlirAgent/evolve/run.py @@ -1,124 +1,154 @@ -"""CLI entry point for the evolve harness. - -Usage: - python -m mlirAgent.evolve.run --task llvm_inlining --framework openevolve --agent claude_opus - python -m mlirAgent.evolve.run --list - python -m mlirAgent.evolve.run --task llvm_inlining --framework openevolve --agent claude_opus --dry-run -""" - -import argparse -import json -import sys -from pathlib import Path -from typing import Dict, Any, Optional - -import yaml - -from ..config import Config -from .adapters import ADAPTERS -from .providers import load_agent_config, list_agents - - -# Registry of available tasks -TASKS = { - "llvm_inlining": "mlirAgent.evolve.tasks.llvm_inlining.task.LLVMInliningTask", -} +"""Unified entry point for LLVM heuristic evolution. +Supports two evolution frameworks dispatched via ``--framework``: + - **gepa** (default): GEPA optimize_anything() with ASI as native side-info + - **openevolve**: MAP-Elites with ManualLLM -def _load_task(task_name: str, configs_dir: str) -> Any: - """Instantiate a task by name, loading its YAML config if present.""" - if task_name not in TASKS: - raise ValueError(f"Unknown task: {task_name}. Available: {list(TASKS.keys())}") +Both frameworks share the same evaluator pipeline (``tasks/llvm_bench.py``), +which includes Optuna hyperparameter tuning when ``[hyperparam]`` annotations +are present in the evolved C++ code. 
- # Import task class - module_path, class_name = TASKS[task_name].rsplit(".", 1) - import importlib - mod = importlib.import_module(module_path) - task_cls = getattr(mod, class_name) +Usage:: - # Load task YAML config - task_yaml = Path(configs_dir) / "tasks" / f"{task_name}.yaml" - task_config = {} - if task_yaml.exists(): - with open(task_yaml) as f: - task_config = yaml.safe_load(f) or {} + # GEPA with auto-respond (smoke test) + python run.py --task llvm_inlining --max-evals 2 --auto - return task_cls(task_config) + # GEPA manual mode (Claude Code or human writes response files) + python run.py --task llvm_inlining --max-evals 10 + # OpenEvolve + python run.py --framework openevolve --task llvm_inlining --max-evals 10 --auto -def _load_framework_config(framework_name: str, configs_dir: str) -> Dict[str, Any]: - """Load framework YAML config.""" - fw_yaml = Path(configs_dir) / "frameworks" / f"{framework_name}.yaml" - if not fw_yaml.exists(): - raise FileNotFoundError(f"Framework config not found: {fw_yaml}") - with open(fw_yaml) as f: - return yaml.safe_load(f) or {} + # Override Optuna trials + python run.py --task llvm_inlining --max-evals 10 --optuna-trials 5 +""" +import argparse +import os +import sys +from datetime import datetime +from pathlib import Path -def _list_available(configs_dir: str): - """Print available agents, frameworks, and tasks.""" - print("Available configurations:\n") +_BASE_DIR = Path(__file__).resolve().parent - print(" Agents:") - agents_dir = Path(configs_dir) / "agents" - if agents_dir.exists(): - for p in sorted(agents_dir.glob("*.yaml")): - print(f" - {p.stem}") - else: - print(" (none)") +# Ensure local packages are importable +if str(_BASE_DIR) not in sys.path: + sys.path.insert(0, str(_BASE_DIR)) - print("\n Frameworks:") - fw_dir = Path(configs_dir) / "frameworks" - if fw_dir.exists(): - for p in sorted(fw_dir.glob("*.yaml")): - print(f" - {p.stem}") - else: - print(" (none)") +_TASKS = ["llvm_inlining", "loop_unrolling", "regalloc_priority"] - print("\n Tasks:") - for name in sorted(TASKS.keys()): - print(f" - {name}") +_TASK_INITIAL = { + "llvm_inlining": _BASE_DIR / "tasks" / "llvm_inlining" / "initial.cpp", + "loop_unrolling": _BASE_DIR / "tasks" / "loop_unrolling" / "initial.cpp", + "regalloc_priority": _BASE_DIR / "tasks" / "regalloc_priority" / "initial.cpp", +} def main(): parser = argparse.ArgumentParser( - description="Evolve harness: evolutionary compiler optimization" + description="Evolve LLVM heuristics via LLM-guided search", + ) + parser.add_argument( + "--framework", "-f", default="gepa", + choices=["gepa", "openevolve"], + help="Evolution framework (default: gepa)", + ) + parser.add_argument( + "--task", "-t", required=True, + choices=_TASKS, + help="LLVM task to optimize", + ) + parser.add_argument( + "--initial", default=None, + help="Override initial C++ source path", + ) + parser.add_argument( + "--max-evals", "-n", type=int, default=10, + help="Max evaluator calls / iterations (default: 10)", + ) + parser.add_argument( + "--auto", action="store_true", + help="Auto-respond to prompts (for smoke testing)", + ) + parser.add_argument( + "--prompts-dir", default=None, + help="Prompt/response directory (default: auto)", + ) + parser.add_argument( + "--output", default=None, + help="Save best code to this path", + ) + parser.add_argument( + "--resume", default=None, + help="Resume from checkpoint (OpenEvolve only)", + ) + parser.add_argument( + "--poll-interval", type=float, default=2.0, + help="ManualLM poll interval in seconds 
(default: 2.0)", + ) + parser.add_argument( + "--optuna-trials", type=int, default=None, + help="Optuna inner-loop trials (overrides EVOLVE_OPTUNA_TRIALS env)", ) - parser.add_argument("--task", "-t", help="Task name (e.g. llvm_inlining)") - parser.add_argument("--framework", "-f", help="Framework name (e.g. openevolve)") - parser.add_argument("--agent", "-a", help="Agent config name (e.g. claude_opus)") - parser.add_argument("--list", action="store_true", help="List available configs") - parser.add_argument("--dry-run", action="store_true", help="Print config without running") - parser.add_argument("--max-iterations", type=int, help="Override max iterations") - parser.add_argument("--configs-dir", default=Config.EVOLVE_CONFIGS_DIR, - help="Path to configs directory") - args = parser.parse_args() - configs_dir = args.configs_dir - - if args.list: - _list_available(configs_dir) - return 0 - - if not all([args.task, args.framework, args.agent]): - parser.error("--task, --framework, and --agent are required (or use --list)") - # Load everything - task = _load_task(args.task, configs_dir) - agent_config = load_agent_config(args.agent, configs_dir) - framework_config = _load_framework_config(args.framework, configs_dir) - - # Get adapter - if args.framework not in ADAPTERS: - print(f"Error: Unknown framework '{args.framework}'. Available: {list(ADAPTERS.keys())}") - return 1 - - adapter = ADAPTERS[args.framework]() - adapter.configure(task, agent_config, framework_config) + # Set Optuna env var if specified + if args.optuna_trials is not None: + os.environ["EVOLVE_OPTUNA_TRIALS"] = str(args.optuna_trials) + + # Set up experiment directory + mlirevolve_root = _BASE_DIR.parent.parent.parent + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + exp_dir = str(mlirevolve_root / "experiments" / f"run_{timestamp}") + prompts_dir = args.prompts_dir or os.path.join(exp_dir, "prompts") + os.makedirs(prompts_dir, exist_ok=True) + os.makedirs(exp_dir, exist_ok=True) + + # Resolve initial program + initial_file = args.initial or str(_TASK_INITIAL[args.task]) + if not os.path.exists(initial_file): + print(f"Error: Initial source not found: {initial_file}") + sys.exit(1) + + # Print header + optuna_trials = args.optuna_trials or os.environ.get("EVOLVE_OPTUNA_TRIALS", "20") + print(f"{'=' * 60}") + print(f"Evolve LLVM Heuristics") + print(f" Framework: {args.framework}") + print(f" Task: {args.task}") + print(f" Initial: {initial_file}") + print(f" Max evals: {args.max_evals}") + print(f" Optuna trials: {optuna_trials}") + print(f" Auto-respond: {args.auto}") + print(f" Prompts: {prompts_dir}") + print(f" Experiment: {exp_dir}") + print(f"{'=' * 60}") + print() + + # Dispatch to framework adapter + if args.framework == "gepa": + from adapters import GEPAAdapter + adapter = GEPAAdapter() + else: + from adapters import OpenEvolveAdapter + adapter = OpenEvolveAdapter() + + result = adapter.run( + task=args.task, + initial_file=initial_file, + prompts_dir=prompts_dir, + max_evals=args.max_evals, + auto_respond=args.auto, + poll_interval=args.poll_interval, + output=args.output or os.path.join(exp_dir, "best.cpp"), + exp_dir=exp_dir, + resume=args.resume, + ) - # Run - result = adapter.launch(dry_run=args.dry_run, max_iterations=args.max_iterations) - print(json.dumps(result, indent=2, default=str)) + print() + print(f"{'=' * 60}") + print(f"Done. Results in: {exp_dir}") + print(f"{'=' * 60}") return 0
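
Since both entry points lean on the Optuna inner loop whenever `[hyperparam]` annotations appear in the evolved C++, here is a small sketch of that annotation format and one way it maps to `opt`/`llc` flags. The comment format comes from the `_GEPA_BACKGROUND` text above; the regex mirrors the one in the old `evaluate.py` deleted by this series, and the flag name is illustrative — the real `extract_hyperparams()` in `llvm_bench.py` may differ in detail.

```python
import re

# Illustrative evolved-C++ fragment carrying a tunable constant.
cpp_snippet = """
// [hyperparam]: evolved-inline-base-threshold, int, 25, 500
static cl::opt<int> BaseThreshold("evolved-inline-base-threshold",
                                  cl::init(225));
"""

# Extract (name, type, min, max) tuples and turn each into a command-line
# flag, the way the autotuner's inner loop would for one Optuna trial.
pattern = r"//\s*\[hyperparam\]:\s*([\w-]+),\s*(\w+),\s*(\d+),\s*(\d+)"
for name, type_, lo, hi in re.findall(pattern, cpp_snippet):
    midpoint = (int(lo) + int(hi)) // 2  # a trial would suggest a value in [lo, hi]
    print(f"-{name}={midpoint}")         # e.g. -evolved-inline-base-threshold=262
```
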