From ef3f2d22eef470e0610a41d796528db7f2b22a01 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 18:01:22 -0800 Subject: [PATCH 1/8] [update] Fix benchmark measurement noise + comprehensive README - run_benchmark() now uses median-of-5 runs instead of single run, fixing unreliable measurements for fast benchmarks like sqlite3 (2ms) - Rewrite evolve README with end-to-end flow documentation: setup, experiment pipeline, per-benchmark execution, LLVM hooks, scoring - Add compile_testsuite.sh for building CTMark .bc files Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 382 +++++++++++++----- src/mlirAgent/evolve/tasks/llvm_bench.py | 56 +-- .../benchmarks/compile_testsuite.sh | 244 +++++++++++ 3 files changed, 561 insertions(+), 121 deletions(-) create mode 100644 src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index 99c2b7f..d76a558 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -3,109 +3,269 @@ Automated framework for evolving LLVM compiler heuristics using [OpenEvolve](../../third_party/openevolve/) with LLM-guided search. -## System Overview +## End-to-End Flow -``` - OpenEvolve controller - | - v - ManualLLM (file-based prompt/response) - | - v - Orchestrator (manual_run.py) <-- --auto / --wait / --resume - | - v - Task evaluator (evaluate.py) <-- patches LLVM, builds, benchmarks - | - v - Score -> OpenEvolve population -``` +### One-Time Setup + +**1. Build LLVM with evolved hooks** + +```bash +# Shallow clone +git clone --depth 1 https://github.com/llvm/llvm-project.git /scratch/ashvin/llvm-project + +# Add evolved heuristic files to the LLVM tree: +# llvm/include/llvm/Analysis/EvolvedInlineCost.h +# llvm/lib/Analysis/EvolvedInlineCost.cpp (inlining hook) +# llvm/include/llvm/CodeGen/EvolvedRegAllocPriority.h +# llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp (regalloc hook) +# Register them in the corresponding CMakeLists.txt files. +# Hook into InlineCost.cpp and RegAllocGreedy.cpp with cl::opt flags. -OpenEvolve manages a population of evolved C++ heuristic programs. Each -iteration, it produces prompts asking an LLM to improve the code. The -ManualLLM bridge decouples the LLM from OpenEvolve's process model, -enabling Claude Code (or any external agent) to respond. +# Configure: Release, X86-only, GCC 13 + gold linker +cmake -G Ninja -B /scratch/ashvin/llvm-build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_TARGETS_TO_BUILD=X86 \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \ + /scratch/ashvin/llvm-project/llvm -## ManualLLM +# Build (produces bin/opt and bin/llc, ~657MB) +ninja -C /scratch/ashvin/llvm-build bin/opt bin/llc +``` -**File:** `third_party/openevolve/openevolve/llm/manual.py` +**2. Prepare CTMark benchmarks as .bc files** -File-based polling interface between OpenEvolve and external responders: +The benchmarks come from [llvm-test-suite](https://github.com/llvm/llvm-test-suite) +CTMark. They are compiled to LLVM bitcode (.bc) with frontend optimizations +only, so our evolved passes have full control over LLVM-level optimization: -1. OpenEvolve writes `prompt_NNN.md` to a shared directory -2. ManualLLM polls for a corresponding `prompt_NNN.response.md` -3. 
When found, the response is returned to OpenEvolve as the LLM output +```bash +# compile_testsuite.sh does this for each benchmark: +clang-18 -O1 -Xclang -disable-llvm-optzns -emit-llvm -c source.c -o source.bc +llvm-link *.bc -o benchmark.bc # multi-file benchmarks +``` -The prompts directory is passed via `MANUAL_LLM_PROMPTS_DIR` env var, -which crosses the process-pool boundary (OpenEvolve uses multiprocessing). -`create_manual_llm` is a module-level factory (not a lambda) to support -pickling across worker processes. +`-O1 -Xclang -disable-llvm-optzns` enables Clang frontend opts (type lowering, +etc.) but skips all LLVM IR passes. The resulting .bc files contain unoptimized +IR ready for our `opt -O2` pipeline. -## Orchestrator +The 8 benchmarks used (2 excluded: clamav=segfault, 7zip=link error): -**File:** `manual_run.py` +| Benchmark | Language | Source | Description | +|-----------|----------|--------|-------------| +| bullet | C++ | MultiSource/Benchmarks/Bullet | Physics engine simulation | +| consumer-typeset | C | MultiSource/Applications/lout | Document typesetting (Lout) | +| kimwitu | C++ | MultiSource/Applications/kimwitu++ | Tree pattern matching | +| lencod | C | MultiSource/Applications/JM/lencod | H.264 video encoder | +| mafft | C | MultiSource/Applications/mafft | Multiple sequence alignment | +| spass | C | MultiSource/Applications/SPASS | First-order theorem prover | +| sqlite3 | C | MultiSource/Applications/sqlite3 | SQL database engine | +| tramp3d-v4 | C++ | MultiSource/Benchmarks/tramp3d-v4 | Template metaprogramming | +The .bc files and runtime data live in `tasks/llvm_inlining/benchmarks/testsuite/`: ``` -python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --auto +testsuite/ + bullet.bc, consumer-typeset.bc, kimwitu.bc, ... + data/ + bullet/ # landscape.mdl, Taru.mdl + consumer-typeset/ # large.lout, data/, font/, maps/, hyph/, include/ + kimwitu/ # inputs/f1.k, f2.k, f3.k + lencod/ # encoder_small.cfg, foreman_part_qcif_444.yuv, ... + mafft/ # pyruvate_decarboxylase.fasta + spass/ # problem.dfg + sqlite3/ # commands, sqlite3rc, test1.sql-test15.sql ``` -Modes: -- `--auto` Built-in heuristic strategies (simulated annealing, gradient - estimate, etc.) auto-respond to prompts. Fast but limited. -- `--wait` External agent (Claude Code, human) writes response files. -- `--resume ` Continue from a saved checkpoint. +### Running an Experiment + +```bash +# Set environment +export LLVM_SRC_PATH=/scratch/ashvin/llvm-project +export EVOLVE_BUILD_DIR=/scratch/ashvin/llvm-build +export EVOLVE_OPTUNA_TRIALS=5 # 0 to disable Optuna -Logs scores to `experiments/run_TIMESTAMP/scores.jsonl` and saves -OpenEvolve checkpoints every iteration. +# Launch (--wait mode: you respond to prompts manually or via Claude Code) +python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --wait + +# Or auto mode (built-in heuristic strategies respond automatically) +python -m mlirAgent.evolve.manual_run --example regalloc_priority -n 10 --auto +``` -## Evaluator Pipeline +This creates an experiment directory: +``` +experiments/run_20260219_132604/ + scores.jsonl # One JSON line per iteration with all metrics + prompts/ + prompt_001.md # OpenEvolve prompt (parent code + history) + prompt_001.response.md # LLM/agent response (new code) + prompt_002.md + ... 
+ openevolve_output/ + checkpoints/checkpoint_N/ # Population state for --resume + best/best_program.cpp # Best evolved program + logs/openevolve_*.log # Detailed log +``` -Each task defines an `evaluate.py` that follows this pipeline: +### What Happens Each Iteration ``` -1. patch_source() Copy evolved .cpp into LLVM source tree -2. build_llvm() ninja -C $BUILD_DIR bin/opt bin/llc -3. load_baseline() Cache default-LLVM measurements (first run only) -4. [optuna_tune()] Optional inner-loop for [hyperparam] knobs -5. eval_benchmarks() For each CTMark .bc file: - opt -O2 [-use-evolved-*] bench.bc -> bench_opt.bc - llc -O2 [-use-evolved-*] bench_opt.bc -> bench.o - gcc bench.o -> bench - measure .text size, binary size, runtime -6. score_fn() Task-specific scoring -7. restore_source() Restore original .cpp from backup + ┌─────────────────────────────────┐ + │ OpenEvolve Controller │ + │ (population, MAP-Elites, etc.) │ + └────────────┬────────────────────┘ + │ 1. Sample parent program + │ from population + ▼ + ┌─────────────────────────────────┐ + │ ManualLLM Bridge │ + │ Write prompt_NNN.md to disk │ + │ Poll for prompt_NNN.response.md │ + └────────────┬────────────────────┘ + │ 2. External responder + │ writes response file + ▼ + ┌─────────────────────────────────┐ + │ Task Evaluator (evaluate.py)│ + └────────────┬────────────────────┘ + │ + ┌──────────────────────┼──────────────────────┐ + ▼ ▼ ▼ + 3. patch_source() 4. build_llvm() 5. load_baseline() + Copy evolved .cpp ninja -C BUILD_DIR Compile & run all + into LLVM tree bin/opt bin/llc benchmarks with + (backup original) (~3.5s incremental) default LLVM (once, + cached to .json) + │ │ │ + └──────────────────────┼──────────────────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ 6. [Optuna inner-loop] │ + │ If [hyperparam] annotations: │ + │ Run N trials on 3-bench subset │ + │ (sqlite3, spass, tramp3d-v4) │ + │ Each trial = compile+run subset │ + │ Find best flag values │ + └────────────┬────────────────────┘ + │ + ▼ + ┌─────────────────────────────────┐ + │ 7. eval_benchmarks() │ + │ For EACH of 8 .bc benchmarks: │ + │ ┌─────────────────────────────┐ │ + │ │ a. opt -O2 │ │ + │ │ [-use-evolved-inline-cost]│ │ + │ │ bench.bc → bench_opt.bc │ │ + │ ├─────────────────────────────┤ │ + │ │ b. llc -O2 -filetype=obj │ │ + │ │ -relocation-model=pic │ │ + │ │ [-use-evolved-regalloc-*]│ │ + │ │ [-ae-flag=value ...] │ │ + │ │ bench_opt.bc → bench.o │ │ + │ ├─────────────────────────────┤ │ + │ │ c. gcc bench.o -o bench │ │ + │ │ -lm -lpthread -ldl │ │ + │ │ [-lstdc++ for C++ bench] │ │ + │ ├─────────────────────────────┤ │ + │ │ d. size bench.o → .text size│ │ + │ │ stat bench → binary sz │ │ + │ ├─────────────────────────────┤ │ + │ │ e. Run 5x, take median: │ │ + │ │ ./bench [args] [ BaseThreshold("ae-inline-base-threshold", cl::init(100), ...); ``` Format: `// [hyperparam]: flag-name, type, min, max` -When present and `optuna_trials > 0`, the evaluator runs an Optuna -inner-loop on a benchmark subset to find optimal values before the final -full-suite evaluation. Tuned values are passed as LLVM command-line flags -(e.g. `-ae-inline-base-threshold=173`). +When `EVOLVE_OPTUNA_TRIALS > 0`, the evaluator: +1. Parses `[hyperparam]` annotations from the evolved C++ code +2. Creates an Optuna study with one parameter per annotation +3. Runs N trials on a 3-benchmark subset (sqlite3, spass, tramp3d-v4) +4. Each trial: compile subset with trial params as LLVM flags, score +5. 
Best params are passed as flags in the final full-suite evaluation + +Example: Optuna suggests `-ae-inline-base-threshold=173`, which is passed +to `opt` (or `llc` for regalloc flags) during compilation. ## Configuration @@ -114,7 +274,7 @@ full-suite evaluation. Tuned values are passed as LLVM command-line flags ```python from mlirAgent.evolve.tasks.llvm_bench import EvalConfig -# From environment variables (backward compatible) +# From environment variables config = EvalConfig.from_env("llvm/lib/Analysis/EvolvedInlineCost.cpp") # Programmatic with overrides @@ -125,23 +285,37 @@ config = EvalConfig.from_env( ) ``` -Key env vars: `LLVM_SRC_PATH`, `EVOLVE_BUILD_DIR`, `EVOLVE_OPT_TIMEOUT`, -`EVOLVE_OPTUNA_TRIALS`. +| Env Var | Default | Description | +|---------|---------|-------------| +| `LLVM_SRC_PATH` | (required) | LLVM source tree root | +| `EVOLVE_BUILD_DIR` | (required) | LLVM ninja build directory | +| `EVOLVE_OPT_TIMEOUT` | 120 | Per-benchmark opt/llc timeout (seconds) | +| `EVOLVE_OPTUNA_TRIALS` | 20 | Optuna trials (0 = disable) | ## Task Structure ``` -tasks/ - llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna - llvm_inlining/ - evaluate.py # Thin wrapper: _score(), evaluate() - initial.cpp # Seed heuristic - task.py # OpenEvolve Task class - benchmarks/testsuite/ # CTMark .bc files + data/ - regalloc_priority/ - evaluate.py # Thin wrapper: _score(), evaluate() - initial.cpp # Seed priority function - baseline_regalloc.json # Separate baseline cache +src/mlirAgent/evolve/ + manual_run.py # Orchestrator: --auto/--wait/--resume + tasks/ + llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna + llvm_inlining/ + evaluate.py # _score(): bin_red% + speedup*10 + initial.cpp # Seed: sums heuristic features - threshold + task.py # OpenEvolve Task class + benchmarks/ + compile_testsuite.sh # Script to build .bc from llvm-test-suite + testsuite/ # .bc files (gitignored, built locally) + data/ # Runtime input data per benchmark + regalloc_priority/ + evaluate.py # _score(): 5*speedup% + bin_red% + initial.cpp # Seed: LLVM default bit-packed priority + baseline_regalloc.json # Separate baseline (uses -use-evolved-* on llc) + README.md # This file +configs/ + frameworks/manual.yaml # OpenEvolve config (pop=10, 1 island, seed=42) +experiments/ # Output (gitignored) + run_YYYYMMDD_HHMMSS/ ``` ### Adding a New Task @@ -150,31 +324,47 @@ tasks/ 2. In `evaluate.py`, define `_score(total_binary, baseline_binary, speedups)` 3. Call shared functions from `llvm_bench.py` with the right evolved flags 4. Add entry to `EXAMPLES` dict in `manual_run.py` +5. 
If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in + `optuna_tune()` and pass flags via `evolved_llc_flags` ## Scoring Formulas **Inlining:** `binary_reduction_pct + (avg_speedup - 1.0) * 10` -- Primary: linked binary size reduction vs baseline (Magellan-comparable) +- Primary signal: linked binary size reduction vs baseline - Secondary: small bonus for runtime improvement +- Comparable to Magellan (ICML 2025) binary reduction metric **RegAlloc:** `5.0 * speedup_pct + 1.0 * binary_reduction_pct` -- Primary: runtime improvement (regalloc most affects execution speed) +- Primary signal: runtime improvement (regalloc most affects execution speed) - Secondary: binary size reduction +- Warning: dominated by measurement noise for short-running benchmarks ## Experiment Results (CTMark, Feb 2026) ### LLVM Inlining -| Experiment | Optuna | Iters | Best Score | Binary Reduction | Time | -|-----------|--------|-------|------------|-----------------|------| -| Exp A | No | 10 | 8.65 | 8.78% | ~50 min | -| Exp C | 5 trials | 10 | 8.66 | 8.41% | ~90 min | - -Both match Magellan's reported range (4.27-8.79%) with only 10 iterations. -Optuna eliminates failures (100% positive scores vs 80%) but doesn't -improve peak performance significantly. Code structure matters more than -hyperparameter values for peak score. - -### Key Insight -Os-level inlining hurts tramp3d-v4 (C++ templates need inlining for -devirtualization). Best heuristics learn to selectively increase inlining -for template-heavy code. +| Experiment | Responder | Optuna | Iters | Best Score | Binary Reduction | +|-----------|-----------|--------|-------|------------|-----------------| +| Exp A | Claude | No | 10 | 8.65 | 8.78% | +| Exp C | Auto | 5 trials | 10 | 8.66 | 8.41% | +| Exp D | Claude | 5 trials | 11 | **8.78** | **9.24%** | + +All match Magellan's reported range (4.27-8.79%) with only 10 iterations. +Claude + Optuna combined is slightly better than either alone. + +### RegAlloc Priority +| Experiment | Measurement | Iters | Best Score | Notes | +|-----------|-------------|-------|------------|-------| +| Exp E | Single run | 8 | 63.39 | **INVALIDATED** (sqlite3 2ms noise) | +| Exp F | Median-of-5 | 11 | 8.82 | Pressure-proportional priority | + +Exp E results were entirely from sqlite3 measurement noise (1.89x "speedup" +was an artifact of 2ms runtime variance). After fixing `run_benchmark()` to +use median-of-5 runs, the only positive innovation was pressure-proportional +priority: boosting global ranges in constrained register classes. 
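+
+A minimal sketch of the idea (the function and field names below are
+illustrative, not the evolved hook's actual interface): boost a global
+range's priority in proportion to how constrained its register class is.
+
+```cpp
+// Illustrative only -- not the evolved hook's real interface.
+struct RangeInfo {
+  unsigned Size;           // live-range length
+  bool IsGlobal;           // spans multiple basic blocks
+  unsigned ClassPressure;  // current pressure in the range's register class
+  unsigned ClassLimit;     // number of allocatable registers in that class
+};
+
+unsigned priorityOf(const RangeInfo &R) {
+  unsigned Prio = R.Size;                           // default: longer ranges first
+  if (R.IsGlobal && 2 * R.ClassPressure > R.ClassLimit)
+    Prio += Prio * R.ClassPressure / R.ClassLimit;  // pressure-proportional boost
+  return Prio;
+}
+```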
+ +### Key Insights +- Os-level inlining hurts tramp3d-v4 (C++ templates need inlining) +- Code structure > hyperparameters for peak inlining score +- Optuna adds robustness (100% positive scores vs 80%) +- RegAlloc priority bit-packed encoding is fragile — structural changes hurt +- Benchmarks under 10ms are unreliable even with median-of-5 runs diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 8b539f7..9371a23 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -239,8 +239,9 @@ def get_text_size(obj_path: str) -> int: # Benchmark execution # --------------------------------------------------------------------------- -def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str): - """Run a benchmark with reference inputs; return wall-clock seconds or None.""" +def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, + num_runs: int = 5): + """Run a benchmark with reference inputs; return median wall-clock seconds or None.""" config = BENCH_RUN_CONFIGS.get(name) if not config: return None @@ -267,31 +268,36 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str): if src.exists(): shutil.copy2(str(src), os.path.join(run_dir, f)) - # Prepare stdin - stdin_fh = None - if config.get("stdin_file") and bench_data.exists(): - stdin_src = bench_data / config["stdin_file"] - if stdin_src.exists(): - stdin_fh = open(str(stdin_src), "r") - cmd = [run_binary] + config.get("args", []) timeout = config.get("timeout", 30) - - try: - start = time.time() - proc = subprocess.run( - cmd, capture_output=True, timeout=timeout, - cwd=run_dir, stdin=stdin_fh, - ) - elapsed = time.time() - start - if proc.returncode == 0: - return elapsed - except subprocess.TimeoutExpired: - pass - finally: - if stdin_fh: - stdin_fh.close() - return None + stdin_file = None + if config.get("stdin_file") and bench_data.exists(): + stdin_file = bench_data / config["stdin_file"] + + timings = [] + for _ in range(num_runs): + stdin_fh = None + try: + if stdin_file and stdin_file.exists(): + stdin_fh = open(str(stdin_file), "r") + start = time.time() + proc = subprocess.run( + cmd, capture_output=True, timeout=timeout, + cwd=run_dir, stdin=stdin_fh, + ) + elapsed = time.time() - start + if proc.returncode == 0: + timings.append(elapsed) + except subprocess.TimeoutExpired: + pass + finally: + if stdin_fh: + stdin_fh.close() + + if not timings: + return None + timings.sort() + return timings[len(timings) // 2] def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh b/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh new file mode 100644 index 0000000..324037a --- /dev/null +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/compile_testsuite.sh @@ -0,0 +1,244 @@ +#!/bin/bash +# Compile LLVM test-suite benchmarks to .bc (bitcode) files +# Using: clang-18 -O1 -Xclang -disable-llvm-optzns -emit-llvm +# This produces unoptimized bitcode suitable for our custom opt pass +set -e + +CLANG="clang-18" +CLANGXX="clang++-18" +LLVM_LINK="llvm-link-18" +TESTSUITE="/scratch/ashvin/llvm-test-suite" +OUTDIR="/scratch/ashvin/merlin/mlirEvolve/src/mlirAgent/evolve/tasks/llvm_inlining/benchmarks/testsuite" +TMPDIR="/tmp/testsuite_build_$$" + +# Flags: -O1 enables optimizations but -disable-llvm-optzns prevents LLVM +# opts from running (only Clang frontend opts). 
This avoids noinline attrs. +COMMON_FLAGS="-O1 -Xclang -disable-llvm-optzns -emit-llvm" +C_FLAGS="$COMMON_FLAGS -std=c17" +CXX_FLAGS="$COMMON_FLAGS" + +mkdir -p "$OUTDIR" "$TMPDIR" + +compile_ok=0 +compile_fail=0 + +echo "=== Compiling LLVM test-suite benchmarks to .bc ===" +echo "" + +#-------------------------------------------------------------------- +# 1. SPASS - Theorem Prover (C) +#-------------------------------------------------------------------- +echo "--- [1/7] SPASS ---" +SPASS_DIR="$TESTSUITE/MultiSource/Applications/SPASS" +SPASS_TMP="$TMPDIR/spass" +mkdir -p "$SPASS_TMP" + +SPASS_SRCS=$(ls "$SPASS_DIR"/*.c 2>/dev/null) +SPASS_OK=1 +for src in $SPASS_SRCS; do + base=$(basename "$src" .c) + $CLANG $C_FLAGS -DCLOCK_NO_TIMING -fno-strict-aliasing \ + -I"$SPASS_DIR" \ + -c "$src" -o "$SPASS_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.c" + SPASS_OK=0 + } +done +if [ "$SPASS_OK" = "1" ]; then + $LLVM_LINK "$SPASS_TMP"/*.bc -o "$OUTDIR/spass.bc" 2>/dev/null && { + echo " OK: spass.bc ($(stat -c%s "$OUTDIR/spass.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for SPASS" + compile_fail=$((compile_fail + 1)) + } +else + # Try linking what we have + bc_count=$(ls "$SPASS_TMP"/*.bc 2>/dev/null | wc -l) + if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$SPASS_TMP"/*.bc -o "$OUTDIR/spass.bc" 2>/dev/null && { + echo " OK (partial): spass.bc ($(stat -c%s "$OUTDIR/spass.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for SPASS" + compile_fail=$((compile_fail + 1)) + } + else + echo " FAIL: No .bc files produced for SPASS" + compile_fail=$((compile_fail + 1)) + fi +fi + +#-------------------------------------------------------------------- +# 2. tramp3d-v4 - C++ Template Metaprogramming Benchmark +#-------------------------------------------------------------------- +echo "--- [2/7] tramp3d-v4 ---" +TRAMP_DIR="$TESTSUITE/MultiSource/Benchmarks/tramp3d-v4" +$CLANGXX $CXX_FLAGS -std=c++14 -fno-exceptions \ + -c "$TRAMP_DIR/tramp3d-v4.cpp" -o "$OUTDIR/tramp3d.bc" 2>/dev/null && { + echo " OK: tramp3d.bc ($(stat -c%s "$OUTDIR/tramp3d.bc") bytes)" + compile_ok=$((compile_ok + 1)) +} || { + echo " FAIL: tramp3d-v4.cpp" + compile_fail=$((compile_fail + 1)) +} + +#-------------------------------------------------------------------- +# 3. Bullet - Physics Engine (C++) +#-------------------------------------------------------------------- +echo "--- [3/7] Bullet ---" +BULLET_DIR="$TESTSUITE/MultiSource/Benchmarks/Bullet" +BULLET_TMP="$TMPDIR/bullet" +mkdir -p "$BULLET_TMP" + +BULLET_SRCS=$(ls "$BULLET_DIR"/*.cpp 2>/dev/null) +BULLET_OK=1 +for src in $BULLET_SRCS; do + base=$(basename "$src" .cpp) + $CLANGXX $CXX_FLAGS -std=c++98 -DNO_TIME \ + -I"$BULLET_DIR/include" -I"$BULLET_DIR" \ + -c "$src" -o "$BULLET_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.cpp" + BULLET_OK=0 + } +done +bc_count=$(ls "$BULLET_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$BULLET_TMP"/*.bc -o "$OUTDIR/bullet.bc" 2>/dev/null && { + echo " OK: bullet.bc ($(stat -c%s "$OUTDIR/bullet.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for Bullet" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for Bullet" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 4. 
ClamAV - Antivirus Engine (C) +#-------------------------------------------------------------------- +echo "--- [4/7] ClamAV ---" +CLAMAV_DIR="$TESTSUITE/MultiSource/Applications/ClamAV" +CLAMAV_TMP="$TMPDIR/clamav" +mkdir -p "$CLAMAV_TMP" + +# ClamAV needs specific defines for Linux +CLAMAV_DEFS="-DHAVE_CONFIG_H -DDONT_LOCK_DBDIRS -DC_LINUX -DWORDS_BIGENDIAN=0 -DFPU_WORDS_BIGENDIAN=0" +CLAMAV_INCLUDES="-I$CLAMAV_DIR -I$CLAMAV_DIR/zlib" + +CLAMAV_SRCS=$(ls "$CLAMAV_DIR"/*.c 2>/dev/null) +CLAMAV_FAIL_COUNT=0 +for src in $CLAMAV_SRCS; do + base=$(basename "$src" .c) + $CLANG $C_FLAGS $CLAMAV_DEFS $CLAMAV_INCLUDES \ + -Wno-incompatible-pointer-types \ + -c "$src" -o "$CLAMAV_TMP/${base}.bc" 2>/dev/null || { + CLAMAV_FAIL_COUNT=$((CLAMAV_FAIL_COUNT + 1)) + } +done +bc_count=$(ls "$CLAMAV_TMP"/*.bc 2>/dev/null | wc -l) +echo " Compiled $bc_count files ($CLAMAV_FAIL_COUNT failures)" +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$CLAMAV_TMP"/*.bc -o "$OUTDIR/clamav.bc" 2>/dev/null && { + echo " OK: clamav.bc ($(stat -c%s "$OUTDIR/clamav.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for ClamAV" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for ClamAV" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 5. hexxagon - C++ Game AI +#-------------------------------------------------------------------- +echo "--- [5/7] hexxagon ---" +HEXX_DIR="$TESTSUITE/MultiSource/Applications/hexxagon" +HEXX_TMP="$TMPDIR/hexxagon" +mkdir -p "$HEXX_TMP" + +HEXX_SRCS=$(ls "$HEXX_DIR"/*.cpp 2>/dev/null) +for src in $HEXX_SRCS; do + base=$(basename "$src" .cpp) + $CLANGXX $CXX_FLAGS -std=c++14 \ + -I"$HEXX_DIR" \ + -c "$src" -o "$HEXX_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $base.cpp" + } +done +bc_count=$(ls "$HEXX_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$HEXX_TMP"/*.bc -o "$OUTDIR/hexxagon.bc" 2>/dev/null && { + echo " OK: hexxagon.bc ($(stat -c%s "$OUTDIR/hexxagon.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for hexxagon" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for hexxagon" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# 6. PAQ8p - Data Compression (single C++ file) +#-------------------------------------------------------------------- +echo "--- [6/7] PAQ8p ---" +PAQ_DIR="$TESTSUITE/MultiSource/Benchmarks/PAQ8p" +$CLANGXX $CXX_FLAGS -DNOASM -DLLVM \ + -c "$PAQ_DIR/paq8p.cpp" -o "$OUTDIR/paq8p.bc" 2>/dev/null && { + echo " OK: paq8p.bc ($(stat -c%s "$OUTDIR/paq8p.bc") bytes)" + compile_ok=$((compile_ok + 1)) +} || { + echo " FAIL: paq8p.cpp" + compile_fail=$((compile_fail + 1)) +} + +#-------------------------------------------------------------------- +# 7. 
Fhourstones - Game Tree Search (C) +#-------------------------------------------------------------------- +echo "--- [7/7] Fhourstones ---" +FHOUR_DIR="$TESTSUITE/MultiSource/Benchmarks/Fhourstones" +FHOUR_TMP="$TMPDIR/fhourstones" +mkdir -p "$FHOUR_TMP" + +for src in "$FHOUR_DIR"/c4.c "$FHOUR_DIR"/play.c "$FHOUR_DIR"/trans.c; do + if [ -f "$src" ]; then + base=$(basename "$src" .c) + $CLANG $C_FLAGS -I"$FHOUR_DIR" \ + -c "$src" -o "$FHOUR_TMP/${base}.bc" 2>/dev/null || { + echo " WARN: Failed to compile $(basename $src)" + } + fi +done +bc_count=$(ls "$FHOUR_TMP"/*.bc 2>/dev/null | wc -l) +if [ "$bc_count" -gt 0 ]; then + $LLVM_LINK "$FHOUR_TMP"/*.bc -o "$OUTDIR/fhourstones.bc" 2>/dev/null && { + echo " OK: fhourstones.bc ($(stat -c%s "$OUTDIR/fhourstones.bc") bytes)" + compile_ok=$((compile_ok + 1)) + } || { + echo " FAIL: llvm-link failed for Fhourstones" + compile_fail=$((compile_fail + 1)) + } +else + echo " FAIL: No .bc files produced for Fhourstones" + compile_fail=$((compile_fail + 1)) +fi + +#-------------------------------------------------------------------- +# Summary +#-------------------------------------------------------------------- +echo "" +echo "=== Summary ===" +echo "Compiled: $compile_ok / 7" +echo "Failed: $compile_fail / 7" +echo "" +echo "Output .bc files:" +ls -lh "$OUTDIR"/*.bc 2>/dev/null || echo " (none)" + +# Cleanup +rm -rf "$TMPDIR" From 52288fa3dc009da910aae2be6a27dd59de2243f8 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 18:38:05 -0800 Subject: [PATCH 2/8] [update] Cookbook submodule: LLVM inlining recipe Co-Authored-By: Claude Opus 4.6 --- data/cookbook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook b/data/cookbook index 94d365c..414c7d7 160000 --- a/data/cookbook +++ b/data/cookbook @@ -1 +1 @@ -Subproject commit 94d365c80639951e4ae92f056789c1475940b077 +Subproject commit 414c7d788c23af6a868295602cee213854fe8f93 From c33092b221de854c2e354ca196ba808c53b40631 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Thu, 19 Feb 2026 21:17:05 -0800 Subject: [PATCH 3/8] [update] Cookbook submodule: LLVM inlining recipe in mlirAgent_recipes Co-Authored-By: Claude Opus 4.6 --- data/cookbook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/cookbook b/data/cookbook index 414c7d7..5b0a6d1 160000 --- a/data/cookbook +++ b/data/cookbook @@ -1 +1 @@ -Subproject commit 414c7d788c23af6a868295602cee213854fe8f93 +Subproject commit 5b0a6d1be77585b5fa709d2753d5550a543a0819 From a0a97277560938eae05b9009526b3f1934054d52 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Fri, 20 Feb 2026 17:47:32 -0800 Subject: [PATCH 4/8] [add] Loop unrolling evolution task Add loop_unrolling task for evolving LLVM's loop unroll heuristic via OpenEvolve. Includes evaluator (5x speedup + 1x binary reduction scoring), seed program with EVOLVE-BLOCK markers, and task metadata. Requires corresponding LLVM hook (EvolvedLoopUnroll.{h,cpp} + LoopUnrollPass.cpp changes) built separately. Exp G results: best score 58.06 at iter 4 (avg_speedup=1.116, ThresholdScale=76). Real signal ~1.3% speedup excluding sqlite3 noise. 
Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/manual_run.py | 6 + .../evolve/tasks/loop_unrolling/__init__.py | 0 .../evolve/tasks/loop_unrolling/evaluate.py | 180 ++++++++++++++++++ .../evolve/tasks/loop_unrolling/initial.cpp | 78 ++++++++ .../evolve/tasks/loop_unrolling/task.yaml | 9 + 5 files changed, 273 insertions(+) create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp create mode 100644 src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml diff --git a/src/mlirAgent/evolve/manual_run.py b/src/mlirAgent/evolve/manual_run.py index 0bbb326..59cb285 100644 --- a/src/mlirAgent/evolve/manual_run.py +++ b/src/mlirAgent/evolve/manual_run.py @@ -51,6 +51,12 @@ "file_suffix": ".cpp", "language": "cpp", }, + "loop_unrolling": { + "initial_program": str(Path(__file__).parent / "tasks/loop_unrolling/initial.cpp"), + "evaluator": str(Path(__file__).parent / "tasks/loop_unrolling/evaluate.py"), + "file_suffix": ".cpp", + "language": "cpp", + }, } diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py b/src/mlirAgent/evolve/tasks/loop_unrolling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py new file mode 100644 index 0000000..d16c628 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -0,0 +1,180 @@ +"""Evaluator for LLVM loop unrolling heuristic evolution. + +Called by OpenEvolve as: python evaluate.py + +Pipeline: +1. Patch evolved C++ heuristic into LLVM source tree +2. Rebuild opt incrementally (ninja) +3. For each CTMark benchmark .bc file: + a. opt -O2 -use-evolved-loop-unroll bench.bc -o bench_opt.bc + b. llc -O2 -filetype=obj -relocation-model=pic bench_opt.bc -o bench.o + c. gcc bench.o -o bench -lm -lpthread -ldl [-lstdc++ for C++] + d. Measure linked binary size + e. Run benchmark with reference inputs and measure wall-clock time +4. Score = 5.0 * speedup_pct + binary_reduction_pct + (loop unrolling is runtime-focused; binary growth expected, penalized at 1/5th) +""" + +import json +import os +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +try: + from ..llvm_bench import ( + EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, + find_benchmarks, load_baseline, optuna_tune, patch_source, + restore_source, + ) +except ImportError: + # Standalone loading by OpenEvolve's importlib (no parent package) + sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + from llvm_bench import ( + EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, + find_benchmarks, load_baseline, optuna_tune, patch_source, + restore_source, + ) + +_EVAL_DIR = Path(__file__).resolve().parent + + +def _score(total_binary, baseline_total_binary, speedups): + """Loop unroll score: 5x speedup + 1x binary reduction.""" + binary_pct = ( + 100.0 * (baseline_total_binary - total_binary) / baseline_total_binary + if baseline_total_binary > 0 else 0.0 + ) + avg_speedup = sum(speedups) / len(speedups) if speedups else 0.0 + speedup_pct = (avg_speedup - 1.0) * 100 if avg_speedup > 0 else 0.0 + return round(5.0 * speedup_pct + binary_pct, 4) + + +def evaluate(program_path: str, config: EvalConfig = None) -> dict: + """Evaluate an evolved LLVM loop unrolling heuristic. 
+ + Score = 5x runtime speedup % + 1x binary size reduction % vs baseline. + Loop unrolling primarily affects runtime performance; binary size may grow. + """ + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + baseline_file=str(_EVAL_DIR / "baseline_unroll.json"), + ) + + if not config.llvm_src or not config.build_dir: + return { + "combined_score": 0.0, + "error": "LLVM_SRC_PATH and EVOLVE_BUILD_DIR must be set", + } + + result = { + "combined_score": 0.0, + "build_success": False, + "build_time": 0.0, + "total_binary_size": 0, + "binary_reduction_pct": 0.0, + "avg_speedup": 0.0, + "benchmark_details": {}, + "error": None, + } + + try: + dest, backup = patch_source(program_path, config) + except OSError as e: + result["error"] = f"Patch failed: {e}" + return result + + try: + ok, build_time, err = build_llvm(config) + result["build_time"] = build_time + result["build_success"] = ok + if not ok: + result["error"] = err + return result + + baseline = load_baseline(config) + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + result["error"] = "No benchmark .bc files found in testsuite/" + return result + + # Extract hyperparams and optionally run Optuna + with open(program_path) as f: + hyperparams = extract_hyperparams(f.read()) + + evolved_opt_flags = ["-use-evolved-loop-unroll"] + + if hyperparams and config.optuna_trials > 0: + print(f" Optuna: tuning {len(hyperparams)} hyperparams " + f"({config.optuna_trials} trials)...") + tune_start = time.time() + best_sub, best_params, extra_flags = optuna_tune( + opt_path, llc_path, benchmarks, baseline, + n_trials=config.optuna_trials, hyperparams=hyperparams, + data_dir=config.data_dir, score_fn=_score, + opt_timeout=config.opt_timeout, + optuna_subset=config.optuna_subset, + base_opt_flags=evolved_opt_flags, flag_target="opt", + ) + result["optuna_trials"] = config.optuna_trials + result["optuna_subset_score"] = best_sub + result["tuned_params"] = best_params + result["tune_time"] = round(time.time() - tune_start, 2) + print(f" Optuna done in {result['tune_time']}s. 
" + f"Subset score={best_sub:.2f}, params={best_params}") + evolved_opt_flags.extend(extra_flags) + elif hyperparams: + result["optuna_trials"] = 0 + result["tuned_params"] = {} + + # Final evaluation on all benchmarks + with tempfile.TemporaryDirectory(prefix="unroll_eval_") as tmp_dir: + score, ev = eval_benchmarks( + benchmarks, opt_path, llc_path, baseline, tmp_dir, + config.data_dir, _score, + evolved_opt_flags=evolved_opt_flags, + opt_timeout=config.opt_timeout, + ) + + result["combined_score"] = score + result["benchmark_details"] = ev["details"] + result["total_binary_size"] = ev["total_binary"] + + if ev["baseline_total_binary"] > 0: + result["binary_reduction_pct"] = round( + 100.0 * (ev["baseline_total_binary"] - ev["total_binary"]) + / ev["baseline_total_binary"], 4 + ) + if ev["speedups"]: + result["avg_speedup"] = round( + sum(ev["speedups"]) / len(ev["speedups"]), 4 + ) + if ev["errors"]: + result["error"] = "; ".join(ev["errors"]) + + except subprocess.TimeoutExpired: + result["error"] = "Build timed out (600s)" + finally: + restore_source(dest, backup) + + return result + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Evaluate LLVM loop unroll heuristic") + parser.add_argument("program_path", help="Path to evolved C++ source") + EvalConfig.add_arguments(parser) + args = parser.parse_args() + config = EvalConfig.from_args( + args, "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + baseline_file=str(_EVAL_DIR / "baseline_unroll.json"), + ) + metrics = evaluate(args.program_path, config=config) + print(json.dumps(metrics, indent=2)) diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp b/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp new file mode 100644 index 0000000..8e44ce3 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/initial.cpp @@ -0,0 +1,78 @@ +//===- EvolvedLoopUnroll.cpp - Evolved loop unroll heuristic ------*- C++ -*-===// +// +// Evolved by OpenEvolve / ShinkaEvolve. +// +// This file is automatically patched by the evaluator during evolution. +// The EVOLVE-BLOCK markers delimit the region that the LLM modifies. +// +// Convention: return an unroll factor >= 1. +// 1 = don't unroll, >1 = unroll by that factor. 
+// +// Available LoopUnrollFeatures fields: +// LoopSize - instruction count of the rolled loop body +// TripCount - exact trip count (0 if unknown) +// MaxTripCount - upper bound on trip count (0 if unknown) +// TripMultiple - trip count is guaranteed a multiple of this +// Depth - loop nesting depth (1 = outermost) +// NumBlocks - number of basic blocks in the loop +// BEInsns - backend edge instructions (~2) +// Threshold - target unroll cost threshold +// PartialThreshold - partial unroll cost threshold +// MaxCount - maximum allowed unroll factor +// NumInlineCandidates - number of inline candidates in loop body +// IsInnermost - true if this is an innermost loop +// HasExactTripCount - true if TripCount > 0 +// MaxOrZero - true if loop runs max trip count or zero times +// AllowPartial - true if partial unrolling is allowed +// AllowRuntime - true if runtime unrolling is allowed +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/EvolvedLoopUnroll.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +// Tunable threshold scale exposed as cl::opt for Optuna inner-loop tuning +// [hyperparam]: ae-unroll-threshold-scale, int, 50, 200 +static cl::opt ThresholdScale("ae-unroll-threshold-scale", cl::init(100), cl::Hidden, + cl::desc("Scale factor for unroll threshold (percent, 100 = default)")); + +// EVOLVE-BLOCK-START loop_unroll_heuristic +unsigned llvm::computeEvolvedLoopUnrollCount(const LoopUnrollFeatures &F) { + unsigned EffThreshold = F.Threshold * ThresholdScale / 100; + + // 1. Full unroll: if exact trip count known and unrolled size fits threshold + if (F.HasExactTripCount && F.TripCount > 1) { + unsigned UnrolledSize = F.LoopSize * F.TripCount; + if (UnrolledSize <= EffThreshold) { + return F.TripCount; + } + } + + // 2. Partial unroll: if loop is small enough and we have trip info + if (F.AllowPartial && F.LoopSize < F.PartialThreshold) { + unsigned MaxUnroll = (F.PartialThreshold - F.BEInsns) / + (F.LoopSize - F.BEInsns); + if (MaxUnroll < 2) + return 1; + + // Clamp to power of 2 for clean remainder handling + unsigned Count = 1; + while (Count * 2 <= MaxUnroll) + Count *= 2; + + // If we know the trip count, align to it + if (F.HasExactTripCount) { + while (Count > 1 && F.TripCount % Count != 0) + Count >>= 1; + } + + if (Count > 1) + return Count; + } + + // 3. Don't unroll + return 1; +} +// EVOLVE-BLOCK-END loop_unroll_heuristic diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml b/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml new file mode 100644 index 0000000..69c9805 --- /dev/null +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/task.yaml @@ -0,0 +1,9 @@ +name: loop_unrolling +description: > + Evolve LLVM's loop unrolling heuristic (computeUnrollCount) to improve + runtime performance on CTMark benchmarks. The evolved function decides + whether and how much to unroll each loop based on 16 extracted features. +evolve_blocks: + - loop_unroll_heuristic +target_file: llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp +language: cpp From a4c852ca88673a14f5e75ed2617b409fade5080d Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Sun, 22 Feb 2026 23:26:31 -0800 Subject: [PATCH 5/8] [add] ASI feedback (GEPA-style text gradients) + GEPA integration Add Actionable Side Information to evaluator output so the LLM receives structured diagnostic feedback alongside raw scores. 
Three always-on tiers: score decomposition with signal classification, compiler stats delta via -stats flag, and runtime variance from all timings. Two optional tiers gated behind config flags: perf stat hardware counters and optimization remarks. Also add GEPA adapter files (ManualLM, evaluator bridge, CLI runner) for comparison experiments. Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/gepa_adapter.py | 83 +++ src/mlirAgent/evolve/gepa_manual_lm.py | 60 +++ src/mlirAgent/evolve/gepa_run.py | 147 +++++ src/mlirAgent/evolve/tasks/llvm_bench.py | 508 +++++++++++++++++- .../evolve/tasks/llvm_inlining/evaluate.py | 35 +- .../evolve/tasks/loop_unrolling/evaluate.py | 35 +- .../tasks/regalloc_priority/evaluate.py | 35 +- 7 files changed, 867 insertions(+), 36 deletions(-) create mode 100644 src/mlirAgent/evolve/gepa_adapter.py create mode 100644 src/mlirAgent/evolve/gepa_manual_lm.py create mode 100644 src/mlirAgent/evolve/gepa_run.py diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py new file mode 100644 index 0000000..b1ff02c --- /dev/null +++ b/src/mlirAgent/evolve/gepa_adapter.py @@ -0,0 +1,83 @@ +"""GEPA adapter for LLVM heuristic evolution. + +Bridges GEPA's ``optimize_anything`` API with our LLVM benchmark evaluator. +Handles EVOLVE-BLOCK extraction, code injection, and score retrieval. +""" + +import os +import re +import sys +import tempfile +from pathlib import Path + +# Ensure tasks package is importable +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from tasks.llvm_bench import EvalConfig + +_EVOLVE_BLOCK_RE = re.compile( + r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", + re.DOTALL, +) + + +def extract_evolve_block(code): + """Extract the EVOLVE-BLOCK content from C++ source code.""" + m = _EVOLVE_BLOCK_RE.search(code) + if m: + return m.group(2) + return code + + +def inject_evolve_block(template, block): + """Replace EVOLVE-BLOCK in *template* with new *block* content.""" + return _EVOLVE_BLOCK_RE.sub( + lambda m: m.group(1) + block + m.group(3), + template, + ) + + +def make_evaluator(task_name, config=None): + """Create an evaluator function for GEPA. + + Returns a callable ``code_str -> float`` that compiles and benchmarks + the given C++ source code, returning the ``combined_score``. 
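+
+    Example (a sketch; assumes ``LLVM_SRC_PATH`` and ``EVOLVE_BUILD_DIR``
+    are set and the task's initial.cpp is in the working directory)::
+
+        evaluator = make_evaluator("llvm_inlining")
+        score = evaluator(open("initial.cpp").read())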
+ """ + if task_name == "llvm_inlining": + from tasks.llvm_inlining.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Analysis/EvolvedInlineCost.cpp" + ) + elif task_name == "loop_unrolling": + from tasks.loop_unrolling.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp" + ) + elif task_name == "regalloc_priority": + from tasks.regalloc_priority.evaluate import evaluate + if config is None: + config = EvalConfig.from_env( + "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp" + ) + else: + raise ValueError(f"Unknown task: {task_name}") + + def evaluator(code_str): + """Write code to temp file, evaluate, return score.""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".cpp", delete=False, prefix="gepa_" + ) as f: + f.write(code_str) + tmp_path = f.name + try: + result = evaluate(tmp_path, config=config) + if isinstance(result, dict): + return result.get("combined_score", 0.0) + # EvaluationResult + return result.metrics.get("combined_score", 0.0) + finally: + os.unlink(tmp_path) + + return evaluator diff --git a/src/mlirAgent/evolve/gepa_manual_lm.py b/src/mlirAgent/evolve/gepa_manual_lm.py new file mode 100644 index 0000000..03bd090 --- /dev/null +++ b/src/mlirAgent/evolve/gepa_manual_lm.py @@ -0,0 +1,60 @@ +"""File-based LLM for GEPA -- writes prompts to disk, polls for responses. + +GEPA's LM interface is a simple synchronous callable: +``__call__(prompt: str | list[dict]) -> str`` + +This class writes each prompt as a Markdown file and waits for the user +(or an agent) to create a corresponding ``.response.md`` file. + +Usage:: + + lm = ManualLM("gepa_prompts") + response = lm("Write improved code...") # blocks until response file exists +""" + +import os +import time + + +class ManualLM: + """File-based LLM for GEPA. + + Writes prompts as ``prompt_NNN.md`` and polls for ``prompt_NNN.response.md``. + """ + + def __init__(self, prompts_dir="gepa_prompts", poll_interval=2.0): + self.prompts_dir = prompts_dir + self.poll_interval = poll_interval + self._counter = 0 + os.makedirs(prompts_dir, exist_ok=True) + + def __call__(self, prompt): + """Send prompt and block until response file appears.""" + self._counter += 1 + prompt_path = os.path.join( + self.prompts_dir, f"prompt_{self._counter:03d}.md" + ) + response_path = os.path.join( + self.prompts_dir, f"prompt_{self._counter:03d}.response.md" + ) + + with open(prompt_path, "w") as f: + if isinstance(prompt, str): + f.write(f"# User\n\n{prompt}\n") + else: + # list[dict] format: [{"role": "system", "content": "..."}, ...] + for msg in prompt: + role = msg.get("role", "user").title() + content = msg.get("content", "") + f.write(f"# {role}\n\n{content}\n\n") + + print(f" [ManualLM] Prompt written to {prompt_path}") + print(f" [ManualLM] Waiting for response at {response_path}...") + + while not os.path.exists(response_path): + time.sleep(self.poll_interval) + + with open(response_path) as f: + response = f.read().strip() + print(f" [ManualLM] Got response ({len(response)} chars)") + return response diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py new file mode 100644 index 0000000..cd38202 --- /dev/null +++ b/src/mlirAgent/evolve/gepa_run.py @@ -0,0 +1,147 @@ +"""CLI runner for GEPA on LLVM evolution tasks. 
+ +Usage:: + + python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] + +Requires ``pip install gepa`` and environment variables: + - LLVM_SRC_PATH: path to LLVM source tree + - EVOLVE_BUILD_DIR: path to LLVM build directory +""" + +import argparse +import json +import os +import sys +from pathlib import Path + +# Ensure local packages are importable +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +# Task → initial source file mapping +_TASK_INITIAL = { + "llvm_inlining": "tasks/llvm_inlining/initial.cpp", + "loop_unrolling": "tasks/loop_unrolling/initial.cpp", + "regalloc_priority": "tasks/regalloc_priority/initial.cpp", +} + + +def main(): + parser = argparse.ArgumentParser( + description="Run GEPA on LLVM heuristic evolution tasks" + ) + parser.add_argument( + "--task", required=True, + choices=list(_TASK_INITIAL.keys()), + help="Task to optimize", + ) + parser.add_argument( + "--initial", default=None, + help="Path to initial C++ source (overrides default)", + ) + parser.add_argument( + "--prompts-dir", default="gepa_prompts", + help="Directory for prompt/response files (default: gepa_prompts)", + ) + parser.add_argument( + "--poll-interval", type=float, default=2.0, + help="Poll interval for response files in seconds (default: 2.0)", + ) + parser.add_argument( + "--max-iterations", type=int, default=10, + help="Maximum GEPA iterations (default: 10)", + ) + parser.add_argument( + "--output", default=None, + help="Path to save best code (default: tasks//gepa_best.cpp)", + ) + args = parser.parse_args() + + # Import GEPA + try: + from gepa import optimize_anything + except ImportError: + print("Error: gepa not installed. Run: pip install gepa") + sys.exit(1) + + from gepa_manual_lm import ManualLM + from gepa_adapter import make_evaluator + + # Find initial program + base_dir = Path(__file__).resolve().parent + if args.initial: + initial_file = Path(args.initial) + else: + initial_file = base_dir / _TASK_INITIAL[args.task] + + if not initial_file.exists(): + print(f"Error: initial source not found at {initial_file}") + sys.exit(1) + + with open(initial_file) as f: + initial_code = f.read() + + # Create LM and evaluator + lm = ManualLM( + prompts_dir=args.prompts_dir, + poll_interval=args.poll_interval, + ) + evaluator = make_evaluator(args.task) + + print(f"{'=' * 60}") + print(f"GEPA Runner") + print(f" Task: {args.task}") + print(f" Initial code: {initial_file}") + print(f" Prompts dir: {args.prompts_dir}") + print(f" Max iterations: {args.max_iterations}") + print(f"{'=' * 60}") + print() + + # Evaluate initial program first + print("Evaluating initial program...") + initial_score = evaluator(initial_code) + print(f" Initial score: {initial_score}") + print() + + # Run GEPA + result = optimize_anything( + initial_code=initial_code, + evaluate_fn=evaluator, + lm=lm, + max_iterations=args.max_iterations, + ) + + print() + print(f"{'=' * 60}") + print(f"GEPA Results:") + print(f" Best score: {result.best_score}") + print(f" Initial score: {initial_score}") + print(f" Improvement: {result.best_score - initial_score:+.4f}") + print(f" Iterations: {result.iterations}") + print(f"{'=' * 60}") + + # Save best code + output_path = args.output or str( + base_dir / "tasks" / args.task / "gepa_best.cpp" + ) + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write(result.best_code) + print(f"Best code saved to: {output_path}") + + # Save summary + summary = { + "task": args.task, + "initial_score": initial_score, + 
"best_score": result.best_score, + "iterations": result.iterations, + "output_path": output_path, + } + summary_path = os.path.join(args.prompts_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + print(f"Summary saved to: {summary_path}") + + +if __name__ == "__main__": + main() diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 9371a23..20957cb 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -6,6 +6,7 @@ """ import json +import math import os import re import shutil @@ -126,6 +127,9 @@ class EvalConfig: optuna_subset: list = field(default_factory=lambda: ["sqlite3", "spass", "tramp3d-v4"]) ninja: str = "" build_targets: str = "bin/opt bin/llc" + enable_stats: bool = True # Tier 2: -stats flag (zero overhead) + enable_perf_counters: bool = False # Tier 4: perf stat (needs permissions) + enable_remarks: bool = False # Tier 5: -pass-remarks-output (adds time) def __post_init__(self): if not self.testsuite_dir: @@ -153,6 +157,15 @@ def from_env(cls, target_file: str, **overrides) -> "EvalConfig": "target_file": os.environ.get("EVOLVE_TARGET_FILE", target_file), "opt_timeout": int(os.environ.get("EVOLVE_OPT_TIMEOUT", "120")), "optuna_trials": int(os.environ.get("EVOLVE_OPTUNA_TRIALS", "20")), + "enable_stats": os.environ.get( + "EVOLVE_ENABLE_STATS", "1" + ).lower() in ("1", "true"), + "enable_perf_counters": os.environ.get( + "EVOLVE_ENABLE_PERF", "0" + ).lower() in ("1", "true"), + "enable_remarks": os.environ.get( + "EVOLVE_ENABLE_REMARKS", "0" + ).lower() in ("1", "true"), } defaults.update(overrides) return cls(**defaults) @@ -235,16 +248,116 @@ def get_text_size(obj_path: str) -> int: return os.path.getsize(obj_path) if os.path.exists(obj_path) else 0 +# --------------------------------------------------------------------------- +# Stats / perf parsing +# --------------------------------------------------------------------------- + +# Matches LLVM -stats output: " 21479 inline - Number of functions inlined" +_STATS_RE = re.compile(r"^\s*(\d+)\s+([\w.-]+)\s+-\s+(.+)$", re.MULTILINE) + + +def parse_stats(stderr_text): + """Parse LLVM ``-stats`` output from stderr. + + Returns dict mapping ``"pass - description"`` to integer count. + """ + stats = {} + for m in _STATS_RE.finditer(stderr_text): + count = int(m.group(1)) + pass_name = m.group(2) + description = m.group(3).strip() + key = f"{pass_name} - {description}" + stats[key] = count + return stats + + +def parse_perf_output(perf_stderr): + """Parse ``perf stat -x,`` CSV output. + + Format per line: ``value,unit,event_name,...`` + """ + counters = {} + for line in perf_stderr.strip().split("\n"): + parts = line.split(",") + if len(parts) >= 3: + try: + value = int(parts[0].strip()) + event = parts[2].strip() + counters[event] = value + except (ValueError, IndexError): + continue + return counters + + +def run_perf_stat(name, binary_path, tmp_dir, data_dir, + counters=None): + """Run a single ``perf stat`` measurement. 
Returns dict of counter values.""" + if counters is None: + counters = ["instructions", "cycles", "cache-misses", "branch-misses"] + config = BENCH_RUN_CONFIGS.get(name) + if not config: + return {} + + run_dir = os.path.join(tmp_dir, f"{name}_perf") + os.makedirs(run_dir, exist_ok=True) + run_binary = os.path.join(run_dir, name) + shutil.copy2(binary_path, run_binary) + os.chmod(run_binary, 0o755) + + bench_data = Path(data_dir) / name + + # Copy data files (same logic as run_benchmark) + if config.get("data_subdir") and bench_data.exists(): + for item in bench_data.iterdir(): + dst = os.path.join(run_dir, item.name) + if item.is_dir(): + shutil.copytree(str(item), dst, dirs_exist_ok=True) + else: + shutil.copy2(str(item), dst) + elif config.get("data_files") and bench_data.exists(): + for f in config["data_files"]: + src = bench_data / f + if src.exists(): + shutil.copy2(str(src), os.path.join(run_dir, f)) + + cmd = ["perf", "stat", "-e", ",".join(counters), "-x", ",", + run_binary] + config.get("args", []) + timeout = config.get("timeout", 30) + stdin_file = None + if config.get("stdin_file") and bench_data.exists(): + stdin_file = bench_data / config["stdin_file"] + + stdin_fh = None + try: + if stdin_file and stdin_file.exists(): + stdin_fh = open(str(stdin_file), "r") + proc = subprocess.run( + cmd, capture_output=True, text=True, timeout=timeout, + cwd=run_dir, stdin=stdin_fh, + ) + return parse_perf_output(proc.stderr) + except (subprocess.TimeoutExpired, OSError): + return {} + finally: + if stdin_fh: + stdin_fh.close() + + # --------------------------------------------------------------------------- # Benchmark execution # --------------------------------------------------------------------------- def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, num_runs: int = 5): - """Run a benchmark with reference inputs; return median wall-clock seconds or None.""" + """Run a benchmark with reference inputs. + + Returns ``(median, all_timings)`` where *median* is the median + wall-clock seconds (or ``None`` on failure) and *all_timings* is the + sorted list of successful run durations. + """ config = BENCH_RUN_CONFIGS.get(name) if not config: - return None + return None, [] run_dir = os.path.join(tmp_dir, f"{name}_run") os.makedirs(run_dir, exist_ok=True) @@ -295,14 +408,15 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, stdin_fh.close() if not timings: - return None + return None, [] timings.sort() - return timings[len(timings) // 2] + return timings[len(timings) // 2], timings def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=None, evolved_llc_flags=None, - opt_timeout=120): + opt_timeout=120, enable_stats=False, + enable_perf=False): """Compile a .bc file through ``opt -> llc -> gcc``. Callers pass evolved flags to *opt*, *llc*, or both: @@ -310,15 +424,23 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, - Inlining: ``evolved_opt_flags=["-use-evolved-inline-cost", ...]`` - RegAlloc: ``evolved_llc_flags=["-use-evolved-regalloc-priority", ...]`` - Returns ``(text_size, binary_size, runtime, error)`` 4-tuple. + Returns a dict with keys: ``text_size``, ``binary_size``, ``runtime``, + ``timings``, ``opt_stats``, ``llc_stats``, ``perf_counters``, ``error``. 
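+
+    Example (illustrative paths; evolved flags depend on the task)::
+
+        r = compile_benchmark(Path("sqlite3.bc"), opt_path, llc_path,
+                              tmp_dir, data_dir,
+                              evolved_opt_flags=["-use-evolved-inline-cost"])
+        if r["error"] is None:
+            print(r["text_size"], r["runtime"], len(r["timings"]))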
""" name = bc_path.stem opt_bc = os.path.join(tmp_dir, f"{name}_opt.bc") obj_file = os.path.join(tmp_dir, f"{name}.o") binary = os.path.join(tmp_dir, name) + def _err(msg): + return {"text_size": None, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": {}, "llc_stats": {}, + "perf_counters": {}, "error": msg} + # opt pass opt_cmd = [str(opt_path), "-O2"] + if enable_stats: + opt_cmd.append("-stats") if evolved_opt_flags: opt_cmd.extend(evolved_opt_flags) opt_cmd += [str(bc_path), "-o", opt_bc] @@ -328,12 +450,16 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, opt_cmd, capture_output=True, text=True, timeout=opt_timeout, ) except subprocess.TimeoutExpired: - return None, None, None, f"opt timed out ({opt_timeout}s)" + return _err(f"opt timed out ({opt_timeout}s)") if proc.returncode != 0: - return None, None, None, proc.stderr[:500] + return _err(proc.stderr[:500]) + + opt_stats = parse_stats(proc.stderr) if enable_stats else {} # llc: bitcode -> object llc_cmd = [str(llc_path), "-O2", "-filetype=obj", "-relocation-model=pic"] + if enable_stats: + llc_cmd.append("-stats") if evolved_llc_flags: llc_cmd.extend(evolved_llc_flags) llc_cmd += [opt_bc, "-o", obj_file] @@ -343,9 +469,11 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, llc_cmd, capture_output=True, text=True, timeout=opt_timeout, ) except subprocess.TimeoutExpired: - return None, None, None, f"llc timed out ({opt_timeout}s)" + return _err(f"llc timed out ({opt_timeout}s)") if proc.returncode != 0: - return None, None, None, proc.stderr[:500] + return _err(proc.stderr[:500]) + + llc_stats = parse_stats(proc.stderr) if enable_stats else {} text_size = get_text_size(obj_file) @@ -357,13 +485,33 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, gcc_cmd, capture_output=True, text=True, timeout=60, ) except subprocess.TimeoutExpired: - return text_size, None, None, "link timed out" + return {"text_size": text_size, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, + "perf_counters": {}, "error": "link timed out"} if proc.returncode != 0: - return text_size, None, None, f"link failed: {proc.stderr[:200]}" + return {"text_size": text_size, "binary_size": None, "runtime": None, + "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, + "perf_counters": {}, + "error": f"link failed: {proc.stderr[:200]}"} binary_size = os.path.getsize(binary) - runtime = run_benchmark(name, binary, tmp_dir, data_dir) - return text_size, binary_size, runtime, None + runtime, timings = run_benchmark(name, binary, tmp_dir, data_dir) + + # Optional perf stat (single run, deterministic counters) + perf_counters = {} + if enable_perf and runtime is not None: + perf_counters = run_perf_stat(name, binary, tmp_dir, data_dir) + + return { + "text_size": text_size, + "binary_size": binary_size, + "runtime": runtime, + "timings": timings, + "opt_stats": opt_stats, + "llc_stats": llc_stats, + "perf_counters": perf_counters, + "error": None, + } # --------------------------------------------------------------------------- @@ -429,10 +577,14 @@ def load_baseline(config: EvalConfig): with tempfile.TemporaryDirectory(prefix="evolve_baseline_") as tmp_dir: for bc in benchmarks: print(f" Baseline: {bc.stem}...", end=" ", flush=True) - text_size, binary_size, runtime, err = compile_benchmark( + r = compile_benchmark( bc, opt_path, llc_path, tmp_dir, config.data_dir, opt_timeout=config.opt_timeout, ) + text_size = r.get("text_size") 
+ binary_size = r.get("binary_size") + runtime = r.get("runtime") + err = r.get("error") if err: print(f"ERROR: {err}") elif text_size is not None: @@ -461,14 +613,15 @@ def load_baseline(config: EvalConfig): def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, data_dir, score_fn, evolved_opt_flags=None, - evolved_llc_flags=None, opt_timeout=120): + evolved_llc_flags=None, opt_timeout=120, + enable_stats=False, enable_perf=False): """Compile and score benchmarks. *score_fn(total_binary, baseline_total_binary, speedups)* computes the task-specific score from aggregate measurements. Returns ``(score, result_dict)`` where *result_dict* contains per-benchmark - details plus aggregate totals. + details (including stats, timings, perf counters) plus aggregate totals. """ total_binary = 0 baseline_total_binary = 0 @@ -479,17 +632,28 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, errors = [] for bc in benchmarks: - text_size, binary_size, runtime, err = compile_benchmark( + r = compile_benchmark( bc, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=evolved_opt_flags, evolved_llc_flags=evolved_llc_flags, opt_timeout=opt_timeout, + enable_stats=enable_stats, + enable_perf=enable_perf, ) bl = baseline.get(bc.name, {}) + text_size = r.get("text_size") + binary_size = r.get("binary_size") + runtime = r.get("runtime") + err = r.get("error") + info = { "text_size": text_size, "binary_size": binary_size, "runtime": runtime, + "timings": r.get("timings", []), + "opt_stats": r.get("opt_stats", {}), + "llc_stats": r.get("llc_stats", {}), + "perf_counters": r.get("perf_counters", {}), } if err: @@ -593,3 +757,311 @@ def objective(trial): best_params = study.best_params best_flags = [f"-{k}={v}" for k, v in best_params.items()] return study.best_value, best_params, best_flags + + +# --------------------------------------------------------------------------- +# ASI — Actionable Side Information (GEPA-style text gradients) +# --------------------------------------------------------------------------- + +@dataclass +class ScoreFormula: + """Describes how a task's score is computed from metrics. + + Used by ``generate_asi()`` to decompose the score into its components so + the LLM can understand what drives the fitness function. + """ + speedup_weight: float = 5.0 # multiplier on speedup_pct + binary_weight: float = 1.0 # multiplier on binary_reduction_pct + description: str = "5 * speedup% + binary_reduction%" + + +def _classify_signal(info, bl): + """Classify a benchmark result's signal reliability. 
+ + Returns a human-readable label: + - ``UNRELIABLE (<10ms)`` — baseline runtime too short for stable measurement + - ``HIGH_VARIANCE (<100ms)`` — baseline runtime marginal + - ``REAL (code changed)`` — text section changed AND meaningful speedup + - ``NOISE (same code)`` — speedup without code change (measurement noise) + - ``MARGINAL`` — small or no change + """ + bl_rt = bl.get("runtime") + speedup = info.get("speedup", 1.0) + text_pct = abs(info.get("text_reduction_pct", 0)) + speedup_delta = abs(speedup - 1.0) * 100 if speedup else 0 + + if bl_rt is not None and bl_rt < 0.01: + return "UNRELIABLE (<10ms)" + if bl_rt is not None and bl_rt < 0.1: + return "HIGH_VARIANCE (<100ms)" + if text_pct > 0.01 and speedup_delta > 1: + return "REAL (code changed)" + if text_pct <= 0.01 and speedup_delta > 1: + return "NOISE (same code)" + return "MARGINAL" + + +def _fmt_runtime(seconds): + """Format a runtime value for display.""" + if seconds is None: + return "N/A" + if seconds < 1.0: + return f"{seconds * 1000:.1f}ms" + return f"{seconds:.1f}s" + + +def generate_asi(score, result_dict, baseline, baseline_stats=None, + formula=None): + """Generate Actionable Side Information markdown narrative. + + Produces structured diagnostic feedback (GEPA-style "text gradients") + with up to four tiers of analysis: + + - **Tier 1** — Score decomposition + per-benchmark signal classification + - **Tier 2** — Compiler statistics delta vs baseline (requires *baseline_stats*) + - **Tier 3** — Runtime variance from individual timings + - **Tier 4** — Hardware perf counters (if collected) + """ + if formula is None: + formula = ScoreFormula() + details = result_dict.get("details", {}) + lines = [] + + # ---- Tier 1: Score Decomposition ---- + speedups = result_dict.get("speedups", []) + avg_speedup = sum(speedups) / len(speedups) if speedups else 0.0 + speedup_pct = (avg_speedup - 1.0) * 100 if avg_speedup > 0 else 0.0 + + total_binary = result_dict.get("total_binary", 0) + bl_total_binary = result_dict.get("baseline_total_binary", 0) + binary_pct = ( + 100.0 * (bl_total_binary - total_binary) / bl_total_binary + if bl_total_binary > 0 else 0.0 + ) + + lines.append(f"## Performance Analysis (Score: {score})") + lines.append("") + lines.append("### Score Decomposition") + lines.append(f"Formula: {formula.description}") + lines.append( + f"- Avg speedup: {avg_speedup:.4f}x ({speedup_pct:+.2f}%) " + f"x {formula.speedup_weight} = {formula.speedup_weight * speedup_pct:.2f}" + ) + lines.append( + f"- Binary reduction: {binary_pct:.2f}% " + f"x {formula.binary_weight} = {formula.binary_weight * binary_pct:.2f}" + ) + lines.append("") + + # Per-benchmark table + lines.append("### Per-Benchmark Results") + lines.append( + "| Benchmark | Speedup | Text D | Binary D | Baseline RT | Signal |" + ) + lines.append( + "|-----------|---------|--------|----------|-------------|--------|" + ) + + score_contributions = {} + for bname in sorted(details.keys()): + info = details[bname] + bl = baseline.get(bname, {}) + + speedup = info.get("speedup") + text_delta = info.get("text_reduction_pct", 0) + binary_delta = info.get("binary_reduction_pct", 0) + bl_rt = bl.get("runtime") + signal = _classify_signal(info, bl) + + sp_str = f"{(speedup - 1) * 100:+.1f}%" if speedup else "N/A" + text_str = f"{text_delta:+.2f}%" + binary_str = f"{binary_delta:+.2f}%" + rt_str = _fmt_runtime(bl_rt) + short_name = bname.replace(".bc", "") + + lines.append( + f"| {short_name} | {sp_str} | {text_str} | {binary_str} " + f"| {rt_str} | {signal} |" + ) + 
+ # Track score contribution per benchmark + if speedup and len(details) > 0: + contrib = ( + (speedup - 1.0) * 100 + * formula.speedup_weight + / len(details) + ) + score_contributions[bname] = contrib + + lines.append("") + + # Key observations + if score_contributions: + total_sp_score = sum(score_contributions.values()) + if total_sp_score != 0: + top = max(score_contributions, key=lambda k: abs(score_contributions[k])) + top_contrib = score_contributions[top] + top_pct = abs(top_contrib / total_sp_score * 100) + top_signal = _classify_signal(details[top], baseline.get(top, {})) + lines.append("### Key Observations") + short = top.replace(".bc", "") + lines.append( + f"- {short} contributes {top_pct:.0f}% of speedup score" + f" -- {top_signal}" + ) + + # Summarize real improvements + real_gains = [ + (n, details[n].get("speedup", 1.0)) + for n in details + if _classify_signal(details[n], baseline.get(n, {})).startswith("REAL") + and details[n].get("speedup", 1.0) > 1.0 + ] + if real_gains: + real_avg = ( + sum(s - 1.0 for _, s in real_gains) / len(real_gains) * 100 + ) + lines.append( + f"- Real avg speedup (code-changed benchmarks): {real_avg:+.1f}%" + ) + lines.append("") + + # ---- Tier 2: Compiler Statistics Delta ---- + if baseline_stats: + has_any_stats = any( + details[b].get("opt_stats") or details[b].get("llc_stats") + for b in details + ) + if has_any_stats: + lines.append("### Compiler Statistics Delta") + for bname in sorted(details.keys()): + info = details[bname] + bl_stats = baseline_stats.get(bname, {}) + + evolved_opt = info.get("opt_stats", {}) + evolved_llc = info.get("llc_stats", {}) + bl_opt = bl_stats.get("opt_stats", {}) + bl_llc = bl_stats.get("llc_stats", {}) + + # Combine and find interesting deltas + deltas = [] + for key in set(list(evolved_opt.keys()) + list(bl_opt.keys())): + ev = evolved_opt.get(key, 0) + bl_v = bl_opt.get(key, 0) + if bl_v != 0 and ev != bl_v: + pct = (ev - bl_v) / bl_v * 100 + deltas.append((key, bl_v, ev, ev - bl_v, pct)) + for key in set(list(evolved_llc.keys()) + list(bl_llc.keys())): + ev = evolved_llc.get(key, 0) + bl_v = bl_llc.get(key, 0) + if bl_v != 0 and ev != bl_v: + pct = (ev - bl_v) / bl_v * 100 + deltas.append((key, bl_v, ev, ev - bl_v, pct)) + + if deltas: + deltas.sort(key=lambda x: abs(x[4]), reverse=True) + short = bname.replace(".bc", "") + lines.append(f"\n**{short}** (top changes):") + lines.append("| Metric | Baseline | Evolved | Delta |") + lines.append("|--------|----------|---------|-------|") + for key, bl_v, ev, delta, pct in deltas[:8]: + lines.append( + f"| {key} | {bl_v:,} | {ev:,} " + f"| {delta:+,} ({pct:+.1f}%) |" + ) + lines.append("") + + # ---- Tier 3: Runtime Variance ---- + has_timings = any( + len(details[b].get("timings", [])) > 1 for b in details + ) + if has_timings: + lines.append("### Runtime Variance") + lines.append("| Benchmark | Timings | CoV | Signal |") + lines.append("|-----------|---------|-----|--------|") + for bname in sorted(details.keys()): + timings = details[bname].get("timings", []) + if len(timings) < 2: + continue + mean = sum(timings) / len(timings) + variance = sum((t - mean) ** 2 for t in timings) / (len(timings) - 1) + stdev = math.sqrt(variance) + cov = (stdev / mean * 100) if mean > 0 else 0 + signal = ( + "STABLE" if cov < 5 else ("MODERATE" if cov < 15 else "NOISY") + ) + timing_strs = ", ".join(f"{t:.4f}" for t in timings[:5]) + short = bname.replace(".bc", "") + lines.append(f"| {short} | {timing_strs} | {cov:.1f}% | {signal} |") + lines.append("") + + # ---- Tier 
4: Hardware Counters ---- + has_perf = any(details[b].get("perf_counters") for b in details) + if has_perf: + lines.append("### Hardware Counters") + for bname in sorted(details.keys()): + perf = details[bname].get("perf_counters", {}) + if not perf: + continue + short = bname.replace(".bc", "") + lines.append(f"\n**{short}**:") + lines.append("| Counter | Value |") + lines.append("|---------|-------|") + for counter, value in sorted(perf.items()): + lines.append(f"| {counter} | {value:,} |") + lines.append("") + + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Baseline stats caching +# --------------------------------------------------------------------------- + +def load_baseline_stats(config): + """Load or compute baseline compiler stats (``opt``/``llc -stats`` output). + + Stats are cached in ``baseline_stats.json`` alongside the baseline file. + Re-generates when the file is missing. + """ + stats_path = Path(config.baseline_file).parent / "baseline_stats.json" + if stats_path.exists(): + with open(stats_path) as f: + return json.load(f) + + # Compile each benchmark with -stats (no evolved flags) to collect baseline + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + return {} + + baseline_stats = {} + with tempfile.TemporaryDirectory(prefix="evolve_blstats_") as tmp_dir: + for bc in benchmarks: + print(f" Baseline stats: {bc.stem}...", end=" ", flush=True) + r = compile_benchmark( + bc, opt_path, llc_path, tmp_dir, config.data_dir, + opt_timeout=config.opt_timeout, enable_stats=True, + ) + if r.get("error"): + print(f"ERROR: {r['error']}") + else: + baseline_stats[bc.name] = { + "opt_stats": r.get("opt_stats", {}), + "llc_stats": r.get("llc_stats", {}), + } + opt_n = len(r.get("opt_stats", {})) + llc_n = len(r.get("llc_stats", {})) + print(f"opt: {opt_n} stats, llc: {llc_n} stats") + + try: + os.makedirs(stats_path.parent, exist_ok=True) + with open(stats_path, "w") as f: + json.dump(baseline_stats, f, indent=2) + print(f" Baseline stats saved to {stats_path}") + except OSError: + pass + + return baseline_stats diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py index 58b4518..dea6b08 100644 --- a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py @@ -24,19 +24,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + def 
_score(total_binary, baseline_total_binary, speedups): """Inlining score: binary reduction % + speedup bonus.""" @@ -138,6 +143,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_opt_flags=evolved_opt_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -162,6 +169,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=0.1, + binary_weight=1.0, + description="binary_reduction% + (avg_speedup - 1) x 10", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py index d16c628..85f91da 100644 --- a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -25,19 +25,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + _EVAL_DIR = Path(__file__).resolve().parent @@ -140,6 +145,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_opt_flags=evolved_opt_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -158,6 +165,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=5.0, + binary_weight=1.0, + description="5 x speedup% + binary_reduction%", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: diff --git a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py index 
7edd176..b5bf452 100644 --- a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py +++ b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py @@ -24,19 +24,24 @@ try: from ..llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from llvm_bench import ( - EvalConfig, build_llvm, eval_benchmarks, extract_hyperparams, - find_benchmarks, load_baseline, optuna_tune, patch_source, - restore_source, + EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, + extract_hyperparams, find_benchmarks, generate_asi, load_baseline, + load_baseline_stats, optuna_tune, patch_source, restore_source, ) +try: + from openevolve.evaluation_result import EvaluationResult +except ImportError: + EvaluationResult = None + _EVAL_DIR = Path(__file__).resolve().parent @@ -138,6 +143,8 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: config.data_dir, _score, evolved_llc_flags=evolved_llc_flags, opt_timeout=config.opt_timeout, + enable_stats=config.enable_stats, + enable_perf=config.enable_perf_counters, ) result["combined_score"] = score @@ -156,6 +163,22 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: if ev["errors"]: result["error"] = "; ".join(ev["errors"]) + # Generate ASI (Actionable Side Information) + baseline_stats = None + if config.enable_stats: + baseline_stats = load_baseline_stats(config) + asi = generate_asi( + score, ev, baseline, baseline_stats=baseline_stats, + formula=ScoreFormula( + speedup_weight=5.0, + binary_weight=1.0, + description="5 x speedup% + binary_reduction%", + ), + ) + + if EvaluationResult is not None: + return EvaluationResult(metrics=result, artifacts={"asi": asi}) + except subprocess.TimeoutExpired: result["error"] = "Build timed out (600s)" finally: From b00304138a31e24abf3455ab1de84c4a038225af Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 00:10:18 -0800 Subject: [PATCH 6/8] Add Tier 5 optimization remarks + fix GEPA adapter for real API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 5: per-decision optimization remarks via -pass-remarks-output. Line-by-line state-machine YAML parser (no PyYAML dependency) extracts inline/loop-unroll !Passed/!Missed documents, compares evolved vs baseline to identify flipped decisions with cost/threshold values. Wired through compile_benchmark → eval_benchmarks → generate_asi. Enabled via EVOLVE_ENABLE_REMARKS=1 (~20% overhead). GEPA: rewrite gepa_adapter.py evaluator to return (score, side_info) tuple per GEPA protocol, passing ASI as native Feedback channel. Rewrite gepa_run.py to use real optimize_anything API with GEPAConfig, EngineConfig, ReflectionConfig. Add --auto-respond flag for smoke testing (background thread auto-creates response files). README: add ASI tiers explainer and GEPA integration guide. 
Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 142 +++++++- src/mlirAgent/evolve/gepa_adapter.py | 20 +- src/mlirAgent/evolve/gepa_run.py | 143 ++++++-- src/mlirAgent/evolve/tasks/llvm_bench.py | 323 +++++++++++++++++- .../evolve/tasks/llvm_inlining/evaluate.py | 11 +- .../evolve/tasks/loop_unrolling/evaluate.py | 11 +- .../tasks/regalloc_priority/evaluate.py | 11 +- 7 files changed, 618 insertions(+), 43 deletions(-) diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index d76a558..25ee3ae 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -210,6 +210,135 @@ Each benchmark is run **5 times** and the **median** wall-clock time is used scheduling and process startup, though very short benchmarks (sqlite3 at 2ms) remain unreliable. +## ASI — Actionable Side Information + +ASI is a structured diagnostic feedback mechanism inspired by GEPA's "text +gradients". Instead of returning only a scalar score to the LLM, the evaluator +generates a multi-tier markdown narrative explaining *why* the code scored as it +did and *what to change*. + +### Tiers + +| Tier | Content | Overhead | Config | +|------|---------|----------|--------| +| **1** | Score decomposition + per-benchmark signal classification | Zero | Always on | +| **2** | Compiler statistics delta (`-stats` output vs baseline) | Zero | `EVOLVE_ENABLE_STATS=1` (default) | +| **3** | Runtime variance (CoV from 5 runs, STABLE/MODERATE/NOISY) | Zero | Always on | +| **4** | Hardware perf counters (instructions, cycles, cache/branch misses) | ~1s | `EVOLVE_ENABLE_PERF=1` | +| **5** | Optimization decision changes (`-pass-remarks-output` YAML diff) | ~20% | `EVOLVE_ENABLE_REMARKS=1` | + +### Tier 1: Score Decomposition + +Breaks the score into its components (speedup vs binary reduction) and +classifies each benchmark's signal reliability: + +- **UNRELIABLE (<10ms)** — baseline runtime too short (e.g., sqlite3 at 2ms) +- **HIGH_VARIANCE (<100ms)** — borderline runtime stability +- **REAL (code changed)** — text section changed AND meaningful speedup +- **NOISE (same code)** — speedup without code change (measurement artifact) +- **MARGINAL** — small or no change + +### Tier 2: Compiler Statistics Delta + +Compares LLVM `-stats` output between evolved and baseline compilations. Shows +which optimization passes changed behavior (e.g., "inline - Number of functions +inlined: 1234 → 1567, +27%"). + +### Tier 5: Optimization Decision Changes + +Compares per-decision optimization remarks (YAML) between evolved and baseline. +Identifies "flipped" decisions — functions that changed from inlined→rejected +or rejected→inlined — with their cost/threshold values. This gives the LLM +precise targets: "function X was rejected because cost=500 exceeds threshold=225; +lower the cost or raise the threshold." + +The remarks parser uses a line-by-line state machine (not PyYAML) for +performance on 62MB files. Only `inline` and `loop-unroll` pass remarks are +extracted. 
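+
+For reference, here is a minimal sketch of the remark shape the parser
+consumes. The field values are illustrative (they mirror the example output
+below, not a real run), and the import path assumes the `tasks/` package
+layout described earlier:
+
+```python
+import tempfile
+import textwrap
+
+from tasks.llvm_bench import parse_remarks  # assumed import path
+
+# One representative !Missed inline remark in -pass-remarks-output format.
+sample = textwrap.dedent("""\
+    --- !Missed
+    Pass:            inline
+    Name:            TooCostly
+    Function:        memory_Free
+    Args:
+      - Callee:          allocBlock
+      - Cost:            '500'
+      - Threshold:       '225'
+    ...
+    """)
+
+with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
+    f.write(sample)
+
+remarks = parse_remarks(f.name)
+# remarks["missed"][0] ->
+#   {"pass": "inline", "name": "TooCostly", "function": "memory_Free",
+#    "args": {"Callee": "allocBlock", "Cost": "500", "Threshold": "225"}}
+```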
+ +### Example ASI Output + +```markdown +## Performance Analysis (Score: 8.78) + +### Score Decomposition +Formula: binary_reduction% + (avg_speedup - 1) x 10 +- Avg speedup: 1.0023x (+0.23%) x 0.1 = 0.02 +- Binary reduction: 9.24% x 1.0 = 9.24 + +### Per-Benchmark Results +| Benchmark | Speedup | Text D | Binary D | Baseline RT | Signal | +|-----------|---------|--------|----------|-------------|--------| +| spass | +0.3% | +12.31%| +10.42% | 8.1s | REAL | +| tramp3d-v4| -1.2% | -3.45% | -2.11% | 0.11s | HIGH_VARIANCE | + +### Optimization Decisions +**spass** (412 decisions changed vs baseline): +- 287 newly passed (were rejected) +- 125 newly rejected (were passed) + +| Function | Callee | Direction | BL Cost/Thresh | Ev Cost/Thresh | +|----------|--------|-----------|----------------|----------------| +| memory_Free | allocBlock | now passed | 500/225 | -15025/225 | +``` + +## GEPA Integration + +[GEPA](https://github.com/google-deepmind/gepa) (Generalist Evolutionary +Prompt Architect) is an optimization framework that uses LLM reflections to +evolve arbitrary text parameters. We integrate GEPA as an alternative to +OpenEvolve for driving LLVM heuristic evolution. + +### Architecture + +``` +GEPA optimize_anything() + │ + ├─ evaluator(code_str) → (score, {"Feedback": ASI_markdown}) + │ └─ Our make_evaluator(): patch LLVM, build, benchmark, generate ASI + │ + └─ reflection_lm(prompt) → str + └─ ManualLM: write prompt to disk, poll for response file +``` + +Key insight: GEPA's evaluator protocol accepts `(score, side_info_dict)` tuples. +We pass our ASI as `{"Feedback": asi_text}`, which GEPA includes in its +reflection prompt alongside the candidate code. This gives the LLM rich +diagnostic context for proposing improvements. + +### Usage + +```bash +# Manual mode: prompts appear as prompt_NNN.md, you write prompt_NNN.response.md +python gepa_run.py --task llvm_inlining --max-evals 10 + +# Auto mode for smoke testing (auto-responds with trivially modified code) +python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond +``` + +### Configuration + +| Flag | Default | Description | +|------|---------|-------------| +| `--task` | (required) | `llvm_inlining`, `loop_unrolling`, or `regalloc_priority` | +| `--max-evals` | 10 | Maximum evaluator calls (seed + proposals) | +| `--prompts-dir` | `gepa_prompts` | Directory for prompt/response files | +| `--output-dir` | `/run` | GEPA state directory (for resume) | +| `--auto-respond` | off | Spawn background thread that auto-creates responses | + +### GEPA vs OpenEvolve + +| Feature | OpenEvolve | GEPA | +|---------|-----------|------| +| Population | MAP-Elites (50 candidates) | Pareto frontier | +| Feedback | Scalar score only → ASI via artifacts | Native side-info channel | +| LLM interface | ManualLLM (file-based) | ManualLM (file-based) | +| Hyperparameter tuning | Optuna inner-loop | Not integrated (future) | +| Resume | Checkpoint directory | `run_dir` state | + +Both frameworks use our same evaluation pipeline (`llvm_bench.py`), so scores +are directly comparable. 
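+
+As noted above, the evaluator protocol itself is just a tuple. A toy sketch
+(names and values are placeholders; the real evaluator is built by
+`make_evaluator()` in `gepa_adapter.py`):
+
+```python
+# Toy illustration of GEPA's (score, side_info) evaluator protocol.
+def toy_evaluator(code_str: str) -> tuple:
+    combined_score = 0.0                      # stand-in for the benchmark score
+    asi_text = "## Performance Analysis ..."  # stand-in for generate_asi() output
+    return combined_score, {"Feedback": asi_text}
+
+# GEPA calls this on each candidate and folds side_info["Feedback"] into
+# the next reflection prompt.
+score, side_info = toy_evaluator("// candidate C++ heuristic")
+```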
+ ## LLVM Hooks ### Inlining (`-use-evolved-inline-cost`) @@ -291,14 +420,20 @@ config = EvalConfig.from_env( | `EVOLVE_BUILD_DIR` | (required) | LLVM ninja build directory | | `EVOLVE_OPT_TIMEOUT` | 120 | Per-benchmark opt/llc timeout (seconds) | | `EVOLVE_OPTUNA_TRIALS` | 20 | Optuna trials (0 = disable) | +| `EVOLVE_ENABLE_STATS` | 1 | Tier 2: collect `-stats` output | +| `EVOLVE_ENABLE_PERF` | 0 | Tier 4: collect perf counters | +| `EVOLVE_ENABLE_REMARKS` | 0 | Tier 5: collect optimization remarks (~20% overhead) | ## Task Structure ``` src/mlirAgent/evolve/ - manual_run.py # Orchestrator: --auto/--wait/--resume + manual_run.py # OpenEvolve orchestrator: --auto/--wait/--resume + gepa_run.py # GEPA orchestrator: --auto-respond + gepa_adapter.py # GEPA evaluator bridge (score, side_info) + gepa_manual_lm.py # File-based LLM for GEPA tasks/ - llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna + llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna, ASI llvm_inlining/ evaluate.py # _score(): bin_red% + speedup*10 initial.cpp # Seed: sums heuristic features - threshold @@ -307,6 +442,9 @@ src/mlirAgent/evolve/ compile_testsuite.sh # Script to build .bc from llvm-test-suite testsuite/ # .bc files (gitignored, built locally) data/ # Runtime input data per benchmark + loop_unrolling/ + evaluate.py # _score(): 5*speedup% + bin_red% + initial.cpp # Seed: LLVM default unroll heuristic regalloc_priority/ evaluate.py # _score(): 5*speedup% + bin_red% initial.cpp # Seed: LLVM default bit-packed priority diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py index b1ff02c..2e9082b 100644 --- a/src/mlirAgent/evolve/gepa_adapter.py +++ b/src/mlirAgent/evolve/gepa_adapter.py @@ -40,8 +40,9 @@ def inject_evolve_block(template, block): def make_evaluator(task_name, config=None): """Create an evaluator function for GEPA. - Returns a callable ``code_str -> float`` that compiles and benchmarks - the given C++ source code, returning the ``combined_score``. + Returns a callable ``code_str -> (score, side_info)`` matching GEPA's + evaluator protocol. *side_info* is a dict that may contain a + ``"Feedback"`` key with ASI markdown text. 
""" if task_name == "llvm_inlining": from tasks.llvm_inlining.evaluate import evaluate @@ -65,7 +66,7 @@ def make_evaluator(task_name, config=None): raise ValueError(f"Unknown task: {task_name}") def evaluator(code_str): - """Write code to temp file, evaluate, return score.""" + """Write code to temp file, evaluate, return (score, side_info).""" with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="gepa_" ) as f: @@ -74,9 +75,16 @@ def evaluator(code_str): try: result = evaluate(tmp_path, config=config) if isinstance(result, dict): - return result.get("combined_score", 0.0) - # EvaluationResult - return result.metrics.get("combined_score", 0.0) + score = result.get("combined_score", 0.0) + side_info = {} + else: + # EvaluationResult from OpenEvolve + score = result.metrics.get("combined_score", 0.0) + if hasattr(result, "artifacts") and "asi" in result.artifacts: + side_info = {"Feedback": result.artifacts["asi"]} + else: + side_info = {} + return score, side_info finally: os.unlink(tmp_path) diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py index cd38202..8e7dfb5 100644 --- a/src/mlirAgent/evolve/gepa_run.py +++ b/src/mlirAgent/evolve/gepa_run.py @@ -3,6 +3,7 @@ Usage:: python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] + python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond Requires ``pip install gepa`` and environment variables: - LLVM_SRC_PATH: path to LLVM source tree @@ -12,7 +13,10 @@ import argparse import json import os +import re import sys +import threading +import time from pathlib import Path # Ensure local packages are importable @@ -25,6 +29,70 @@ "regalloc_priority": "tasks/regalloc_priority/initial.cpp", } +# Task → objective string for GEPA +_TASK_OBJECTIVE = { + "llvm_inlining": ( + "Maximize binary size reduction across CTMark benchmarks " + "by modifying the inlining cost heuristic." + ), + "loop_unrolling": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the loop unrolling heuristic." + ), + "regalloc_priority": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the register allocation priority function." + ), +} + +_TASK_BACKGROUND = ( + "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " + "The function is compiled into the opt/llc tools and evaluated against CTMark " + "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " + "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " + "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " + "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " + "// [hyperparam]: flag-name, type, min, max comments for the autotuner." +) + + +def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): + """Background thread that auto-creates response files for smoke testing. + + Watches for new prompt_NNN.md files and creates prompt_NNN.response.md + with a trivially modified version of the initial code. 
+ """ + seen = set() + prompt_re = re.compile(r"^prompt_(\d+)\.md$") + + while True: + try: + for fname in os.listdir(prompts_dir): + m = prompt_re.match(fname) + if not m: + continue + num = m.group(1) + response_name = f"prompt_{num}.response.md" + if response_name in seen: + continue + response_path = os.path.join(prompts_dir, response_name) + if os.path.exists(response_path): + seen.add(response_name) + continue + + # Create a trivially modified version of the code + modified = initial_code.replace( + "// EVOLVE-BLOCK-START", + f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", + ) + with open(response_path, "w") as f: + f.write(f"```cpp\n{modified}\n```\n") + seen.add(response_name) + print(f" [auto-respond] Created {response_name}") + except OSError: + pass + time.sleep(poll_interval) + def main(): parser = argparse.ArgumentParser( @@ -48,18 +116,28 @@ def main(): help="Poll interval for response files in seconds (default: 2.0)", ) parser.add_argument( - "--max-iterations", type=int, default=10, - help="Maximum GEPA iterations (default: 10)", + "--max-evals", type=int, default=10, + help="Maximum evaluator calls (default: 10)", + ) + parser.add_argument( + "--output-dir", default=None, + help="GEPA run directory for state/resume (default: /run)", ) parser.add_argument( "--output", default=None, help="Path to save best code (default: tasks//gepa_best.cpp)", ) + parser.add_argument( + "--auto-respond", action="store_true", + help="Auto-create response files for smoke testing", + ) args = parser.parse_args() # Import GEPA try: - from gepa import optimize_anything + from gepa.optimize_anything import ( + optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, + ) except ImportError: print("Error: gepa not installed. Run: pip install gepa") sys.exit(1) @@ -88,36 +166,57 @@ def main(): ) evaluator = make_evaluator(args.task) + run_dir = args.output_dir or os.path.join(args.prompts_dir, "run") + os.makedirs(run_dir, exist_ok=True) + print(f"{'=' * 60}") print(f"GEPA Runner") print(f" Task: {args.task}") print(f" Initial code: {initial_file}") print(f" Prompts dir: {args.prompts_dir}") - print(f" Max iterations: {args.max_iterations}") + print(f" Max evals: {args.max_evals}") + print(f" Run dir: {run_dir}") + print(f" Auto-respond: {args.auto_respond}") print(f"{'=' * 60}") print() - # Evaluate initial program first - print("Evaluating initial program...") - initial_score = evaluator(initial_code) - print(f" Initial score: {initial_score}") - print() + # Start auto-responder thread if requested + if args.auto_respond: + t = threading.Thread( + target=_auto_respond_thread, + args=(args.prompts_dir, initial_code, args.poll_interval), + daemon=True, + ) + t.start() + print(" [auto-respond] Background responder started") + + # Run GEPA with real API + objective = _TASK_OBJECTIVE[args.task] + config = GEPAConfig( + engine=EngineConfig( + max_metric_calls=args.max_evals, + parallel=False, + run_dir=run_dir, + ), + reflection=ReflectionConfig( + reflection_lm=lm, + ), + ) - # Run GEPA result = optimize_anything( - initial_code=initial_code, - evaluate_fn=evaluator, - lm=lm, - max_iterations=args.max_iterations, + seed_candidate=initial_code, + evaluator=evaluator, + objective=objective, + background=_TASK_BACKGROUND, + config=config, ) print() print(f"{'=' * 60}") print(f"GEPA Results:") - print(f" Best score: {result.best_score}") - print(f" Initial score: {initial_score}") - print(f" Improvement: {result.best_score - initial_score:+.4f}") - print(f" Iterations: 
{result.iterations}") + print(f" Best candidate: {len(result.best_candidate)} chars") + print(f" Num candidates: {result.num_candidates}") + print(f" Total evals: {result.total_metric_calls}") print(f"{'=' * 60}") # Save best code @@ -126,16 +225,16 @@ def main(): ) os.makedirs(os.path.dirname(output_path), exist_ok=True) with open(output_path, "w") as f: - f.write(result.best_code) + f.write(result.best_candidate) print(f"Best code saved to: {output_path}") # Save summary summary = { "task": args.task, - "initial_score": initial_score, - "best_score": result.best_score, - "iterations": result.iterations, + "num_candidates": result.num_candidates, + "total_metric_calls": result.total_metric_calls, "output_path": output_path, + "run_dir": run_dir, } summary_path = os.path.join(args.prompts_dir, "summary.json") with open(summary_path, "w") as f: diff --git a/src/mlirAgent/evolve/tasks/llvm_bench.py b/src/mlirAgent/evolve/tasks/llvm_bench.py index 20957cb..1df35c7 100644 --- a/src/mlirAgent/evolve/tasks/llvm_bench.py +++ b/src/mlirAgent/evolve/tasks/llvm_bench.py @@ -289,6 +289,195 @@ def parse_perf_output(perf_stderr): return counters +# --------------------------------------------------------------------------- +# Optimization remarks parsing (Tier 5) +# --------------------------------------------------------------------------- + +# Pass names we care about for remarks +_REMARK_PASSES = {"inline", "loop-unroll"} + + +def parse_remarks(remarks_file): + """Parse LLVM optimization remarks YAML using line-by-line state machine. + + Avoids PyYAML for performance (62MB files). Extracts only inline and + loop-unroll related ``!Passed`` / ``!Missed`` documents. + + Returns ``{"passed": [...], "missed": [...]}`` where each entry is + ``{"pass": str, "name": str, "function": str, "args": dict}``. 
+ """ + passed = [] + missed = [] + + doc_type = None # "passed" or "missed" + cur = None # current document dict + in_args = False + last_arg_key = None + + try: + fh = open(remarks_file, "r", errors="replace") + except OSError: + return {"passed": [], "missed": []} + + try: + for line in fh: + stripped = line.rstrip() + + # New document separator + if stripped.startswith("--- !"): + # Flush previous document + if cur and cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + + tag = stripped[5:].strip() + if tag == "Passed": + doc_type = "passed" + cur = {"pass": "", "name": "", "function": "", "args": {}} + in_args = False + elif tag == "Missed": + doc_type = "missed" + cur = {"pass": "", "name": "", "function": "", "args": {}} + in_args = False + else: + doc_type = None + cur = None + in_args = False + continue + + if cur is None: + continue + + # End of document + if stripped == "...": + if cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + cur = None + doc_type = None + in_args = False + continue + + # Top-level fields + if not in_args: + if stripped.startswith("Pass:"): + cur["pass"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Name:"): + cur["name"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Function:"): + cur["function"] = stripped.split(":", 1)[1].strip().strip("'\"") + elif stripped.startswith("Args:"): + in_args = True + last_arg_key = None + else: + # Inside Args list — look for key-value pairs + s = stripped.lstrip() + if s.startswith("- "): + # New arg entry: "- Callee: foo" + kv = s[2:] + colon = kv.find(":") + if colon > 0: + key = kv[:colon].strip() + val = kv[colon + 1:].strip().strip("'\"") + cur["args"][key] = val + last_arg_key = key + elif ":" in s and not s.startswith("#"): + # Continuation key on same arg: " Cost: '15'" + colon = s.find(":") + key = s[:colon].strip() + val = s[colon + 1:].strip().strip("'\"") + if key: + cur["args"][key] = val + finally: + fh.close() + + # Flush last document + if cur and cur.get("pass") in _REMARK_PASSES: + if doc_type == "passed": + passed.append(cur) + elif doc_type == "missed": + missed.append(cur) + + return {"passed": passed, "missed": missed} + + +def summarize_remarks(evolved_remarks, baseline_remarks): + """Compare evolved vs baseline remarks to find flipped decisions. + + Returns a compact summary dict with counts and top flipped decisions. 
+ """ + summary = { + "evolved_passed": len(evolved_remarks.get("passed", [])), + "evolved_missed": len(evolved_remarks.get("missed", [])), + "baseline_passed": len(baseline_remarks.get("passed", [])), + "baseline_missed": len(baseline_remarks.get("missed", [])), + "flipped": [], + } + + # Build lookup: (function, callee) -> doc for baseline + def _key(doc): + callee = doc["args"].get("Callee", "") + return (doc["function"], callee) + + bl_passed = {} + for doc in baseline_remarks.get("passed", []): + k = _key(doc) + bl_passed[k] = doc + + bl_missed = {} + for doc in baseline_remarks.get("missed", []): + k = _key(doc) + bl_missed[k] = doc + + # Find newly passed (were missed in baseline) + for doc in evolved_remarks.get("passed", []): + k = _key(doc) + if k in bl_missed: + bl_doc = bl_missed[k] + flip = { + "function": doc["function"], + "callee": doc["args"].get("Callee", ""), + "pass": doc["pass"], + "direction": "newly_passed", + "evolved_cost": doc["args"].get("Cost", ""), + "evolved_threshold": doc["args"].get("Threshold", ""), + "baseline_cost": bl_doc["args"].get("Cost", ""), + "baseline_threshold": bl_doc["args"].get("Threshold", ""), + } + summary["flipped"].append(flip) + + # Find newly missed (were passed in baseline) + for doc in evolved_remarks.get("missed", []): + k = _key(doc) + if k in bl_passed: + bl_doc = bl_passed[k] + flip = { + "function": doc["function"], + "callee": doc["args"].get("Callee", ""), + "pass": doc["pass"], + "direction": "newly_missed", + "evolved_cost": doc["args"].get("Cost", ""), + "evolved_threshold": doc["args"].get("Threshold", ""), + "baseline_cost": bl_doc["args"].get("Cost", ""), + "baseline_threshold": bl_doc["args"].get("Threshold", ""), + } + summary["flipped"].append(flip) + + # Sort flipped by absolute cost difference (most impactful first) + def _sort_key(f): + try: + return abs(int(f["evolved_cost"]) - int(f["baseline_cost"])) + except (ValueError, TypeError): + return 0 + summary["flipped"].sort(key=_sort_key, reverse=True) + + return summary + + def run_perf_stat(name, binary_path, tmp_dir, data_dir, counters=None): """Run a single ``perf stat`` measurement. Returns dict of counter values.""" @@ -416,7 +605,7 @@ def run_benchmark(name: str, binary_path: str, tmp_dir: str, data_dir: str, def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, evolved_opt_flags=None, evolved_llc_flags=None, opt_timeout=120, enable_stats=False, - enable_perf=False): + enable_perf=False, enable_remarks=False): """Compile a .bc file through ``opt -> llc -> gcc``. 
Callers pass evolved flags to *opt*, *llc*, or both: @@ -435,12 +624,16 @@ def compile_benchmark(bc_path, opt_path, llc_path, tmp_dir, data_dir, def _err(msg): return {"text_size": None, "binary_size": None, "runtime": None, "timings": [], "opt_stats": {}, "llc_stats": {}, - "perf_counters": {}, "error": msg} + "perf_counters": {}, "opt_remarks": {}, "error": msg} # opt pass opt_cmd = [str(opt_path), "-O2"] if enable_stats: opt_cmd.append("-stats") + remarks_file = None + if enable_remarks: + remarks_file = os.path.join(tmp_dir, f"{name}_remarks.yaml") + opt_cmd.append(f"-pass-remarks-output={remarks_file}") if evolved_opt_flags: opt_cmd.extend(evolved_opt_flags) opt_cmd += [str(bc_path), "-o", opt_bc] @@ -455,6 +648,7 @@ def _err(msg): return _err(proc.stderr[:500]) opt_stats = parse_stats(proc.stderr) if enable_stats else {} + opt_remarks = parse_remarks(remarks_file) if remarks_file else {} # llc: bitcode -> object llc_cmd = [str(llc_path), "-O2", "-filetype=obj", "-relocation-model=pic"] @@ -487,11 +681,12 @@ def _err(msg): except subprocess.TimeoutExpired: return {"text_size": text_size, "binary_size": None, "runtime": None, "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, - "perf_counters": {}, "error": "link timed out"} + "perf_counters": {}, "opt_remarks": opt_remarks, + "error": "link timed out"} if proc.returncode != 0: return {"text_size": text_size, "binary_size": None, "runtime": None, "timings": [], "opt_stats": opt_stats, "llc_stats": llc_stats, - "perf_counters": {}, + "perf_counters": {}, "opt_remarks": opt_remarks, "error": f"link failed: {proc.stderr[:200]}"} binary_size = os.path.getsize(binary) @@ -510,6 +705,7 @@ def _err(msg): "opt_stats": opt_stats, "llc_stats": llc_stats, "perf_counters": perf_counters, + "opt_remarks": opt_remarks, "error": None, } @@ -614,7 +810,8 @@ def load_baseline(config: EvalConfig): def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, data_dir, score_fn, evolved_opt_flags=None, evolved_llc_flags=None, opt_timeout=120, - enable_stats=False, enable_perf=False): + enable_stats=False, enable_perf=False, + enable_remarks=False): """Compile and score benchmarks. *score_fn(total_binary, baseline_total_binary, speedups)* computes the @@ -639,6 +836,7 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, opt_timeout=opt_timeout, enable_stats=enable_stats, enable_perf=enable_perf, + enable_remarks=enable_remarks, ) bl = baseline.get(bc.name, {}) text_size = r.get("text_size") @@ -654,6 +852,7 @@ def eval_benchmarks(benchmarks, opt_path, llc_path, baseline, tmp_dir, "opt_stats": r.get("opt_stats", {}), "llc_stats": r.get("llc_stats", {}), "perf_counters": r.get("perf_counters", {}), + "opt_remarks": r.get("opt_remarks", {}), } if err: @@ -811,16 +1010,17 @@ def _fmt_runtime(seconds): def generate_asi(score, result_dict, baseline, baseline_stats=None, - formula=None): + formula=None, baseline_remarks=None): """Generate Actionable Side Information markdown narrative. 
Produces structured diagnostic feedback (GEPA-style "text gradients") - with up to four tiers of analysis: + with up to five tiers of analysis: - **Tier 1** — Score decomposition + per-benchmark signal classification - **Tier 2** — Compiler statistics delta vs baseline (requires *baseline_stats*) - **Tier 3** — Runtime variance from individual timings - **Tier 4** — Hardware perf counters (if collected) + - **Tier 5** — Optimization decision changes (requires *baseline_remarks*) """ if formula is None: formula = ScoreFormula() @@ -1011,6 +1211,71 @@ def generate_asi(score, result_dict, baseline, baseline_stats=None, lines.append(f"| {counter} | {value:,} |") lines.append("") + # ---- Tier 5: Optimization Decision Changes ---- + if baseline_remarks: + has_remarks = any(details[b].get("opt_remarks") for b in details) + if has_remarks: + lines.append("### Optimization Decisions") + for bname in sorted(details.keys()): + evolved_rm = details[bname].get("opt_remarks", {}) + bl_rm = baseline_remarks.get(bname, {}) + if not evolved_rm and not bl_rm: + continue + + summary = summarize_remarks(evolved_rm, bl_rm) + flipped = summary.get("flipped", []) + if not flipped and summary["evolved_passed"] == summary["baseline_passed"]: + continue + + short = bname.replace(".bc", "") + n_flipped = len(flipped) + newly_passed = sum( + 1 for f in flipped if f["direction"] == "newly_passed" + ) + newly_missed = sum( + 1 for f in flipped if f["direction"] == "newly_missed" + ) + lines.append( + f"\n**{short}** ({n_flipped} decisions changed vs baseline):" + ) + if newly_passed: + lines.append(f"- {newly_passed} newly passed (were rejected)") + if newly_missed: + lines.append(f"- {newly_missed} newly rejected (were passed)") + + # Show top flipped decisions with cost/threshold info + top_flips = flipped[:5] + if top_flips: + lines.append("") + lines.append( + "| Function | Callee | Direction | " + "BL Cost/Thresh | Ev Cost/Thresh |" + ) + lines.append( + "|----------|--------|-----------|" + "----------------|----------------|" + ) + for f in top_flips: + direction = ( + "now passed" if f["direction"] == "newly_passed" + else "now rejected" + ) + bl_ct = ( + f"{f['baseline_cost']}/{f['baseline_threshold']}" + if f["baseline_cost"] else "N/A" + ) + ev_ct = ( + f"{f['evolved_cost']}/{f['evolved_threshold']}" + if f["evolved_cost"] else "N/A" + ) + func = f["function"][:30] + callee = f["callee"][:20] + lines.append( + f"| {func} | {callee} | {direction} " + f"| {bl_ct} | {ev_ct} |" + ) + lines.append("") + return "\n".join(lines) @@ -1065,3 +1330,47 @@ def load_baseline_stats(config): pass return baseline_stats + + +def load_baseline_remarks(config): + """Load or compute baseline optimization remarks. + + Remarks are cached in ``baseline_remarks.json`` alongside the baseline + file. Re-generates when the file is missing. Only called when + ``config.enable_remarks`` is True. 
+ """ + remarks_path = Path(config.baseline_file).parent / "baseline_remarks.json" + if remarks_path.exists(): + with open(remarks_path) as f: + return json.load(f) + + opt_path = os.path.join(config.build_dir, "bin", "opt") + llc_path = os.path.join(config.build_dir, "bin", "llc") + benchmarks = find_benchmarks(Path(config.testsuite_dir)) + + if not benchmarks: + return {} + + baseline_remarks = {} + with tempfile.TemporaryDirectory(prefix="evolve_blremarks_") as tmp_dir: + for bc in benchmarks: + print(f" Baseline remarks: {bc.stem}...", end=" ", flush=True) + r = compile_benchmark( + bc, opt_path, llc_path, tmp_dir, config.data_dir, + opt_timeout=config.opt_timeout, enable_remarks=True, + ) + remarks = r.get("opt_remarks", {}) + n_passed = len(remarks.get("passed", [])) + n_missed = len(remarks.get("missed", [])) + baseline_remarks[bc.name] = remarks + print(f"passed={n_passed}, missed={n_missed}") + + try: + os.makedirs(remarks_path.parent, exist_ok=True) + with open(remarks_path, "w") as f: + json.dump(baseline_remarks, f) + print(f" Baseline remarks saved to {remarks_path}") + except OSError: + pass + + return baseline_remarks diff --git a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py index dea6b08..25162f3 100644 --- a/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py +++ b/src/mlirAgent/evolve/tasks/llvm_inlining/evaluate.py @@ -26,7 +26,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -34,7 +35,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -145,6 +147,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -173,6 +176,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -180,6 +186,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="binary_reduction% + (avg_speedup - 1) x 10", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: diff --git a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py index 85f91da..e6563a5 100644 --- a/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py +++ b/src/mlirAgent/evolve/tasks/loop_unrolling/evaluate.py @@ -27,7 +27,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + 
load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -35,7 +36,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -147,6 +149,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -169,6 +172,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -176,6 +182,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="5 x speedup% + binary_reduction%", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: diff --git a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py index b5bf452..a09acf2 100644 --- a/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py +++ b/src/mlirAgent/evolve/tasks/regalloc_priority/evaluate.py @@ -26,7 +26,8 @@ from ..llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) except ImportError: # Standalone loading by OpenEvolve's importlib (no parent package) @@ -34,7 +35,8 @@ from llvm_bench import ( EvalConfig, ScoreFormula, build_llvm, eval_benchmarks, extract_hyperparams, find_benchmarks, generate_asi, load_baseline, - load_baseline_stats, optuna_tune, patch_source, restore_source, + load_baseline_remarks, load_baseline_stats, optuna_tune, + patch_source, restore_source, ) try: @@ -145,6 +147,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: opt_timeout=config.opt_timeout, enable_stats=config.enable_stats, enable_perf=config.enable_perf_counters, + enable_remarks=config.enable_remarks, ) result["combined_score"] = score @@ -167,6 +170,9 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: baseline_stats = None if config.enable_stats: baseline_stats = load_baseline_stats(config) + bl_remarks = None + if config.enable_remarks: + bl_remarks = load_baseline_remarks(config) asi = generate_asi( score, ev, baseline, baseline_stats=baseline_stats, formula=ScoreFormula( @@ -174,6 +180,7 @@ def evaluate(program_path: str, config: EvalConfig = None) -> dict: binary_weight=1.0, description="5 x speedup% + binary_reduction%", ), + baseline_remarks=bl_remarks, ) if EvaluationResult is not None: From c0cf65b47ba708e47704cfcdcd49d163fdd6ea8d Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 09:07:50 -0800 Subject: [PATCH 7/8] Fix GEPA runner: disable cloudpickle, tolerate eval exceptions Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/gepa_run.py | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py index 8e7dfb5..e2f10f9 100644 --- a/src/mlirAgent/evolve/gepa_run.py +++ b/src/mlirAgent/evolve/gepa_run.py @@ -197,6 +197,8 @@ def main(): max_metric_calls=args.max_evals, parallel=False, run_dir=run_dir, + use_cloudpickle=False, + raise_on_exception=False, ), reflection=ReflectionConfig( reflection_lm=lm, From f5d18b58a193e59e7044a99dcc8cba0e46d52aa0 Mon Sep 17 00:00:00 2001 From: Ashvin Verma Date: Mon, 23 Feb 2026 09:40:59 -0800 Subject: [PATCH 8/8] Unify GEPA + OpenEvolve into single run.py entry point Consolidate manual_run.py, gepa_run.py, gepa_adapter.py, and providers.py into three clean modules: run.py (CLI), adapters.py (framework adapters), evaluator.py (shared eval bridge). Both frameworks share the same evaluator pipeline and prompt/response file-based LLM interface. Co-Authored-By: Claude Opus 4.6 --- src/mlirAgent/evolve/README.md | 109 ++++--- src/mlirAgent/evolve/adapters.py | 424 ++++++++++++++++++------- src/mlirAgent/evolve/evaluator.py | 189 +++++++----- src/mlirAgent/evolve/gepa_adapter.py | 91 ------ src/mlirAgent/evolve/gepa_run.py | 248 --------------- src/mlirAgent/evolve/manual_run.py | 444 --------------------------- src/mlirAgent/evolve/providers.py | 46 --- src/mlirAgent/evolve/run.py | 232 ++++++++------ 8 files changed, 633 insertions(+), 1150 deletions(-) delete mode 100644 src/mlirAgent/evolve/gepa_adapter.py delete mode 100644 src/mlirAgent/evolve/gepa_run.py delete mode 100644 src/mlirAgent/evolve/manual_run.py delete mode 100644 src/mlirAgent/evolve/providers.py diff --git a/src/mlirAgent/evolve/README.md b/src/mlirAgent/evolve/README.md index 25ee3ae..3b1d0ac 100644 --- a/src/mlirAgent/evolve/README.md +++ b/src/mlirAgent/evolve/README.md @@ -83,40 +83,58 @@ export LLVM_SRC_PATH=/scratch/ashvin/llvm-project export EVOLVE_BUILD_DIR=/scratch/ashvin/llvm-build export EVOLVE_OPTUNA_TRIALS=5 # 0 to disable Optuna -# Launch (--wait mode: you respond to prompts manually or via Claude Code) -python -m mlirAgent.evolve.manual_run --example llvm_inlining -n 10 --wait +# GEPA (default) — manual mode: write prompt_NNN.response.md when prompted +python run.py --task llvm_inlining --max-evals 10 -# Or auto mode (built-in heuristic strategies respond automatically) -python -m mlirAgent.evolve.manual_run --example regalloc_priority -n 10 --auto +# GEPA — auto mode for smoke testing +python run.py --task llvm_inlining --max-evals 2 --auto + +# OpenEvolve — manual mode +python run.py --framework openevolve --task llvm_inlining --max-evals 10 + +# OpenEvolve — auto mode +python run.py --framework openevolve --task regalloc_priority -n 10 --auto + +# Override Optuna trials +python run.py --task llvm_inlining --max-evals 10 --optuna-trials 5 ``` This creates an experiment directory: ``` experiments/run_20260219_132604/ - scores.jsonl # One JSON line per iteration with all metrics + summary.json # Framework, task, output paths + best.cpp # Best evolved code prompts/ - prompt_001.md # OpenEvolve prompt (parent code + history) + prompt_001.md # LLM prompt (GEPA reflection or OpenEvolve parent code) prompt_001.response.md # LLM/agent response (new code) prompt_002.md ... 
- openevolve_output/ - checkpoints/checkpoint_N/ # Population state for --resume - best/best_program.cpp # Best evolved program - logs/openevolve_*.log # Detailed log + gepa_state/ # GEPA only: optimizer state (for resume) + openevolve_output/ # OpenEvolve only: + checkpoints/checkpoint_N/ # Population state for --resume + best/best_program.cpp # Best evolved program + logs/openevolve_*.log # Detailed log + scores.jsonl # OpenEvolve only: per-iteration metrics ``` ### What Happens Each Iteration ``` + run.py --framework {gepa, openevolve} + │ + ┌───────────┴───────────┐ + ▼ ▼ + ┌───────────────┐ ┌──────────────────────┐ + │ GEPAAdapter │ │ OpenEvolveAdapter │ + │ (Pareto │ │ (MAP-Elites │ + │ frontier) │ │ population) │ + └───────┬───────┘ └──────────┬───────────┘ + │ │ + └───────────┬────────────┘ + │ 1. Select/reflect on parent + ▼ ┌─────────────────────────────────┐ - │ OpenEvolve Controller │ - │ (population, MAP-Elites, etc.) │ - └────────────┬────────────────────┘ - │ 1. Sample parent program - │ from population - ▼ - ┌─────────────────────────────────┐ - │ ManualLLM Bridge │ + │ ManualLM / ManualLLM │ │ Write prompt_NNN.md to disk │ │ Poll for prompt_NNN.response.md │ └────────────┬────────────────────┘ @@ -124,7 +142,7 @@ experiments/run_20260219_132604/ │ writes response file ▼ ┌─────────────────────────────────┐ - │ Task Evaluator (evaluate.py)│ + │ evaluator.py → evaluate.py │ └────────────┬────────────────────┘ │ ┌──────────────────────┼──────────────────────┐ @@ -308,36 +326,31 @@ diagnostic context for proposing improvements. ### Usage -```bash -# Manual mode: prompts appear as prompt_NNN.md, you write prompt_NNN.response.md -python gepa_run.py --task llvm_inlining --max-evals 10 +Both frameworks are accessed through the unified `run.py`: -# Auto mode for smoke testing (auto-responds with trivially modified code) -python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond -``` +```bash +# GEPA — manual mode +python run.py --task llvm_inlining --max-evals 10 -### Configuration +# GEPA — auto mode for smoke testing +python run.py --task llvm_inlining --max-evals 2 --auto -| Flag | Default | Description | -|------|---------|-------------| -| `--task` | (required) | `llvm_inlining`, `loop_unrolling`, or `regalloc_priority` | -| `--max-evals` | 10 | Maximum evaluator calls (seed + proposals) | -| `--prompts-dir` | `gepa_prompts` | Directory for prompt/response files | -| `--output-dir` | `/run` | GEPA state directory (for resume) | -| `--auto-respond` | off | Spawn background thread that auto-creates responses | +# OpenEvolve +python run.py --framework openevolve --task llvm_inlining --max-evals 10 +``` ### GEPA vs OpenEvolve -| Feature | OpenEvolve | GEPA | -|---------|-----------|------| -| Population | MAP-Elites (50 candidates) | Pareto frontier | -| Feedback | Scalar score only → ASI via artifacts | Native side-info channel | -| LLM interface | ManualLLM (file-based) | ManualLM (file-based) | -| Hyperparameter tuning | Optuna inner-loop | Not integrated (future) | -| Resume | Checkpoint directory | `run_dir` state | +| Feature | GEPA (default) | OpenEvolve | +|---------|----------------|-----------| +| Population | Pareto frontier | MAP-Elites (10 candidates) | +| Feedback | Native side-info `(score, {"Feedback": ASI})` | ASI via `artifacts["asi"]` | +| LLM interface | ManualLM (`gepa_manual_lm.py`) | ManualLLM (`third_party/openevolve/`) | +| Hyperparameter tuning | Optuna inner-loop | Optuna inner-loop | +| Resume | `gepa_state/` directory | Checkpoint directory | 
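
To make the Feedback row above concrete, here is a minimal sketch of the two result shapes the adapters consume. Values are illustrative; the field names follow the `evaluator.py` bridge added later in this patch (`metrics["combined_score"]`, `artifacts["asi"]`, and GEPA's `(score, {"Feedback": ...})` tuple).

```python
# Sketch: the same ASI feedback as each framework consumes it.
asi_text = "## Per-benchmark feedback\nsqlite3: binary -0.8%, runtime +0.3% ..."

# OpenEvolve: the per-task evaluate() returns an EvaluationResult whose
# metrics carry the score and whose artifacts carry the ASI markdown.
openevolve_shape = {
    "metrics": {"combined_score": 1.7},
    "artifacts": {"asi": asi_text},
}

# GEPA: make_evaluator() repackages that into a (score, side_info) tuple;
# the "Feedback" entry is surfaced in the next reflection prompt.
gepa_shape = (
    openevolve_shape["metrics"]["combined_score"],
    {"Feedback": openevolve_shape["artifacts"]["asi"]},
)
```
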
-Both frameworks use our same evaluation pipeline (`llvm_bench.py`), so scores -are directly comparable. +Both frameworks share the same evaluation pipeline (`evaluator.py` → `llvm_bench.py`), +so scores are directly comparable. ## LLVM Hooks @@ -428,9 +441,9 @@ config = EvalConfig.from_env( ``` src/mlirAgent/evolve/ - manual_run.py # OpenEvolve orchestrator: --auto/--wait/--resume - gepa_run.py # GEPA orchestrator: --auto-respond - gepa_adapter.py # GEPA evaluator bridge (score, side_info) + run.py # Unified CLI: --framework {gepa,openevolve} + adapters.py # GEPAAdapter + OpenEvolveAdapter + evaluator.py # Framework-agnostic evaluator bridge gepa_manual_lm.py # File-based LLM for GEPA tasks/ llvm_bench.py # Shared: EvalConfig, compile, baseline, Optuna, ASI @@ -461,8 +474,10 @@ experiments/ # Output (gitignored) 1. Create `tasks/my_task/` with `initial.cpp` and `evaluate.py` 2. In `evaluate.py`, define `_score(total_binary, baseline_binary, speedups)` 3. Call shared functions from `llvm_bench.py` with the right evolved flags -4. Add entry to `EXAMPLES` dict in `manual_run.py` -5. If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in +4. Add task name to `_TASKS` and `_TASK_INITIAL` in `run.py` +5. Add task config to `_TASK_CONFIG` in `evaluator.py` +6. Add objective string to `_TASK_OBJECTIVE` in `adapters.py` +7. If the evolved code affects `llc` (not `opt`), use `flag_target="llc"` in `optuna_tune()` and pass flags via `evolved_llc_flags` ## Scoring Formulas diff --git a/src/mlirAgent/evolve/adapters.py b/src/mlirAgent/evolve/adapters.py index 0b17df0..666797b 100644 --- a/src/mlirAgent/evolve/adapters.py +++ b/src/mlirAgent/evolve/adapters.py @@ -1,144 +1,362 @@ -"""Framework adapters for evolutionary optimization. +"""Framework adapters for LLVM heuristic evolution. -Translates our unified config (task + agent + framework YAML) into the -specific format each evolution framework expects, then launches it. +Each adapter wraps a specific evolution framework (GEPA, OpenEvolve) with a +common interface so ``run.py`` can dispatch to either one. """ +import asyncio +import json import os +import re import sys -import tempfile +import threading +import time from abc import ABC, abstractmethod +from datetime import datetime from pathlib import Path -from typing import Dict, Any, Optional -import yaml +_BASE_DIR = Path(__file__).resolve().parent -from ..config import Config -from .providers import load_agent_config +# Ensure local packages are importable +if str(_BASE_DIR) not in sys.path: + sys.path.insert(0, str(_BASE_DIR)) +# Task → initial source file (relative to _BASE_DIR) +_TASK_INITIAL = { + "llvm_inlining": "tasks/llvm_inlining/initial.cpp", + "loop_unrolling": "tasks/loop_unrolling/initial.cpp", + "regalloc_priority": "tasks/regalloc_priority/initial.cpp", +} -class FrameworkAdapter(ABC): - """Abstract adapter that bridges our config to a specific evo framework.""" +# Task → GEPA objective string +_TASK_OBJECTIVE = { + "llvm_inlining": ( + "Maximize binary size reduction across CTMark benchmarks " + "by modifying the inlining cost heuristic." + ), + "loop_unrolling": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the loop unrolling heuristic." + ), + "regalloc_priority": ( + "Maximize runtime speedup across CTMark benchmarks " + "by modifying the register allocation priority function." 
+ ), +} - def __init__(self): - self.task = None - self.agent_config = None - self.framework_config = None +_GEPA_BACKGROUND = ( + "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " + "The function is compiled into the opt/llc tools and evaluated against CTMark " + "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " + "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " + "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " + "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " + "// [hyperparam]: flag-name, type, min, max comments for the autotuner." +) - def configure(self, task, agent_config: Dict[str, Any], framework_config: Dict[str, Any]): - """Store task, agent, and framework configs.""" - self.task = task - self.agent_config = agent_config - self.framework_config = framework_config - @abstractmethod - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: - """Start the evolution run. Returns result dict.""" - ... +class FrameworkAdapter(ABC): + """Common interface for evolution framework adapters.""" @abstractmethod - def get_results(self) -> Dict[str, Any]: - """Return results from the most recent run.""" + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval, output, exp_dir, **kwargs): + """Run the evolution loop. Returns a result dict.""" ... +# --------------------------------------------------------------------------- +# GEPA +# --------------------------------------------------------------------------- + +def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): + """Background thread that auto-creates response files for smoke testing.""" + seen = set() + prompt_re = re.compile(r"^prompt_(\d+)\.md$") + + while True: + try: + for fname in os.listdir(prompts_dir): + m = prompt_re.match(fname) + if not m: + continue + num = m.group(1) + response_name = f"prompt_{num}.response.md" + if response_name in seen: + continue + response_path = os.path.join(prompts_dir, response_name) + if os.path.exists(response_path): + seen.add(response_name) + continue + + modified = initial_code.replace( + "// EVOLVE-BLOCK-START", + f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", + ) + with open(response_path, "w") as f: + f.write(f"```cpp\n{modified}\n```\n") + seen.add(response_name) + print(f" [auto-respond] Created {response_name}") + except OSError: + pass + time.sleep(poll_interval) + + +class GEPAAdapter(FrameworkAdapter): + """Runs GEPA ``optimize_anything()`` with ManualLM and ASI feedback. + + GEPA's evaluator receives ``(score, {"Feedback": ASI_text})`` so it + can embed rich diagnostic feedback into its reflection prompts. + Optuna hyperparameter tuning runs inside each evaluation automatically + when ``[hyperparam]`` annotations are present in the C++ code. 
+ """ + + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval=2.0, output=None, exp_dir=None, + **kwargs): + from gepa.optimize_anything import ( + optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, + ) + from gepa_manual_lm import ManualLM + from evaluator import make_evaluator + + with open(initial_file) as f: + initial_code = f.read() + + lm = ManualLM(prompts_dir=prompts_dir, poll_interval=poll_interval) + evaluator = make_evaluator(task) + + run_dir = os.path.join(exp_dir, "gepa_state") if exp_dir else None + if run_dir: + os.makedirs(run_dir, exist_ok=True) + + if auto_respond: + t = threading.Thread( + target=_auto_respond_thread, + args=(prompts_dir, initial_code, poll_interval), + daemon=True, + ) + t.start() + print(" [auto-respond] Background responder started") + + config = GEPAConfig( + engine=EngineConfig( + max_metric_calls=max_evals, + parallel=False, + run_dir=run_dir, + use_cloudpickle=False, + raise_on_exception=False, + ), + reflection=ReflectionConfig( + reflection_lm=lm, + ), + ) + + result = optimize_anything( + seed_candidate=initial_code, + evaluator=evaluator, + objective=_TASK_OBJECTIVE[task], + background=_GEPA_BACKGROUND, + config=config, + ) + + # Save best code + output_path = output or os.path.join(exp_dir or ".", "best.cpp") + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "w") as f: + f.write(result.best_candidate) + print(f"Best code saved to: {output_path}") + + summary = { + "framework": "gepa", + "task": task, + "num_candidates": result.num_candidates, + "total_metric_calls": result.total_metric_calls, + "output_path": output_path, + } + summary_path = os.path.join(exp_dir or prompts_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + print(f"Summary saved to: {summary_path}") + + return summary + + +# --------------------------------------------------------------------------- +# OpenEvolve +# --------------------------------------------------------------------------- + class OpenEvolveAdapter(FrameworkAdapter): - """Adapter for the OpenEvolve framework (third_party/openevolve).""" + """Runs OpenEvolve with ManualLLM (file-based prompt/response). + + OpenEvolve uses MAP-Elites population management. ASI feedback is + passed through ``EvaluationResult.artifacts["asi"]``. Optuna runs + inside each evaluation automatically. 
+ """ - def __init__(self): - super().__init__() - self._result = None + def run(self, *, task, initial_file, prompts_dir, max_evals, + auto_respond, poll_interval=2.0, output=None, exp_dir=None, + resume=None, **kwargs): + return asyncio.run(self._run_async( + task=task, initial_file=initial_file, prompts_dir=prompts_dir, + max_evals=max_evals, auto_respond=auto_respond, + output=output, exp_dir=exp_dir, resume=resume, + )) - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: + async def _run_async(self, *, task, initial_file, prompts_dir, + max_evals, auto_respond, output, exp_dir, resume): # Ensure openevolve is importable - oe_path = Config.OPENEVOLVE_PATH + mlirevolve_root = _BASE_DIR.parent.parent.parent + oe_path = str(mlirevolve_root / "third_party" / "openevolve") if oe_path not in sys.path: sys.path.insert(0, oe_path) from openevolve.config import Config as OEConfig, LLMModelConfig + from openevolve.controller import OpenEvolve + from openevolve.llm.manual import create_manual_llm - # Build OpenEvolve config from our YAML configs - oe_cfg = OEConfig() - - # Framework settings - fw = self.framework_config or {} - oe_cfg.max_iterations = max_iterations or fw.get("max_iterations", 100) - oe_cfg.database.num_islands = fw.get("islands", 4) - oe_cfg.database.population_size = fw.get("population_size", 50) - oe_cfg.database.migration_interval = fw.get("migration_interval", 10) - if fw.get("random_seed") is not None: - oe_cfg.random_seed = fw["random_seed"] - - # File suffix for C++ evolution - oe_cfg.language = "cpp" - oe_cfg.file_suffix = ".cpp" - - # LLM settings from agent config - agent = self.agent_config or {} - model = LLMModelConfig( - name=agent.get("model", "claude-opus-4-6"), - api_base=agent.get("api_base", "https://api.anthropic.com/v1"), - api_key=agent.get("api_key", ""), - temperature=agent.get("temperature", 0.7), - max_tokens=agent.get("max_tokens", 4096), + os.environ["MANUAL_LLM_PROMPTS_DIR"] = prompts_dir + + # Build config + configs_dir = str(mlirevolve_root / "configs") + fw_yaml = os.path.join(configs_dir, "frameworks", "manual.yaml") + cfg = OEConfig.from_yaml(fw_yaml) if os.path.exists(fw_yaml) else OEConfig() + + cfg.max_iterations = max_evals + cfg.file_suffix = ".cpp" + cfg.language = "cpp" + + manual_model = LLMModelConfig( + name="manual", + init_client=create_manual_llm, + weight=1.0, ) - oe_cfg.llm.models = [model] - oe_cfg.llm.evaluator_models = [model] - - # Paths - initial_program = str(self.task.get_initial_program()) - evaluator = str(self.task.get_evaluator()) - - if dry_run: - return { - "dry_run": True, - "initial_program": initial_program, - "evaluator": evaluator, - "config": { - "max_iterations": oe_cfg.max_iterations, - "population_size": oe_cfg.database.population_size, - "islands": oe_cfg.database.num_islands, - "model": agent.get("model"), - "language": oe_cfg.language, - }, - } + cfg.llm.models = [manual_model] + cfg.llm.evaluator_models = [manual_model] - # Launch via OpenEvolve API - from openevolve.api import run_evolution + cfg.database.population_size = 10 + cfg.database.archive_size = 10 + cfg.database.num_islands = 1 + cfg.database.migration_interval = 999 + cfg.checkpoint_interval = 1 + cfg.diff_based_evolution = False - result = run_evolution( - initial_program=initial_program, - evaluator=evaluator, - config=oe_cfg, - iterations=oe_cfg.max_iterations, - cleanup=False, + # Evaluator path (the task's evaluate.py) + evaluator_path = str(_BASE_DIR / "tasks" / task / "evaluate.py") + + 
oe_output_dir = os.path.join(exp_dir, "openevolve_output") + scores_path = os.path.join(exp_dir, "scores.jsonl") + os.makedirs(oe_output_dir, exist_ok=True) + + openevolve = OpenEvolve( + initial_program_path=initial_file, + evaluation_file=evaluator_path, + config=cfg, + output_dir=oe_output_dir, ) - self._result = { - "best_score": result.best_score, - "best_code": result.best_code, - "metrics": result.metrics, - "output_dir": result.output_dir, + + if resume: + if os.path.exists(resume): + print(f"Resuming from checkpoint: {resume}") + openevolve.database.load(resume) + else: + print(f"Warning: Checkpoint not found: {resume}") + + # Auto-respond thread + stop_event = asyncio.Event() + responder_task = None + if auto_respond: + loop = asyncio.get_event_loop() + responder_task = loop.run_in_executor( + None, _oe_auto_respond, prompts_dir, stop_event, + ) + + # Score logging hook + _original_add = openevolve.database.add + + def _logging_add(program, *a, **kw): + result = _original_add(program, *a, **kw) + entry = { + "timestamp": time.time(), + "iteration": program.iteration_found, + "program_id": program.id, + "metrics": program.metrics, + } + best = openevolve.database.get_best_program() + if best: + entry["best_score"] = best.metrics.get("combined_score", 0) + with open(scores_path, "a") as f: + f.write(json.dumps(entry, default=str) + "\n") + return result + + openevolve.database.add = _logging_add + + try: + print(f"Starting OpenEvolve ({cfg.max_iterations} iterations)...") + best = await openevolve.run( + iterations=cfg.max_iterations, + checkpoint_path=resume, + ) + if best: + print(f"\nBest metrics:") + for k, v in best.metrics.items(): + if isinstance(v, float): + print(f" {k}: {v:.4f}") + else: + print(f" {k}: {v}") + + # Save best code + if output and hasattr(best, "code"): + os.makedirs(os.path.dirname(output), exist_ok=True) + with open(output, "w") as f: + f.write(best.code) + print(f"Best code saved to: {output}") + finally: + stop_event.set() + if responder_task: + await asyncio.sleep(2) + + summary = { + "framework": "openevolve", + "task": task, + "output_dir": oe_output_dir, } - return self._result + summary_path = os.path.join(exp_dir, "summary.json") + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + + return summary - def get_results(self) -> Dict[str, Any]: - return self._result or {} +def _oe_auto_respond(prompts_dir, stop_event): + """Auto-responder for OpenEvolve (trivial code modification).""" + import glob -class ShinkaAdapter(FrameworkAdapter): - """Adapter for ShinkaEvolve (stub — not yet implemented).""" + responded = set() + pattern = re.compile(r"prompt_\d+\.md$") - def launch(self, dry_run: bool = False, max_iterations: Optional[int] = None) -> Dict[str, Any]: - raise NotImplementedError( - "ShinkaEvolve adapter is not yet implemented. " - "Use --framework openevolve for now." 
+ while not stop_event.is_set(): + prompt_files = sorted( + f for f in glob.glob(os.path.join(prompts_dir, "prompt_*.md")) + if pattern.search(f) ) + for pf in prompt_files: + if pf in responded: + continue + resp_path = pf.replace(".md", ".response.md") + if os.path.exists(resp_path): + responded.add(pf) + continue - def get_results(self) -> Dict[str, Any]: - raise NotImplementedError("ShinkaEvolve adapter is not yet implemented.") + with open(pf) as f: + prompt_text = f.read() + # Trivial modification: add a comment to the code + num = Path(pf).stem.split("_")[-1] + response = f"// Auto-response iteration {num}\n{prompt_text[:500]}" + with open(resp_path, "w") as f: + f.write(response) + responded.add(pf) + print(f" [auto] Responded to {os.path.basename(pf)}") -ADAPTERS = { - "openevolve": OpenEvolveAdapter, - "shinkaevolve": ShinkaAdapter, -} + time.sleep(1) diff --git a/src/mlirAgent/evolve/evaluator.py b/src/mlirAgent/evolve/evaluator.py index f990917..a40e2bd 100644 --- a/src/mlirAgent/evolve/evaluator.py +++ b/src/mlirAgent/evolve/evaluator.py @@ -1,72 +1,121 @@ +"""Evaluator bridges for LLVM heuristic evolution. + +Creates framework-agnostic evaluator callables that: +1. Write candidate C++ to a temp file +2. Call the task-specific evaluate() (patch LLVM, build, benchmark) +3. Return (score, side_info) for GEPA or EvaluationResult for OpenEvolve + +The actual compilation/benchmark logic lives in ``tasks/llvm_bench.py`` +and per-task ``evaluate.py`` files. +""" + import os import re -import subprocess -import optuna -from typing import Dict, Any -from openevolve.evaluation_result import EvaluationResult - -class MagellanEvaluator: - def __init__(self, llvm_build_dir, benchmark_script): - self.build_dir = llvm_build_dir - self.benchmark_script = benchmark_script - - def evaluate(self, code: str) -> EvaluationResult: - # 1. Inject Code into LLVM Source - self._inject_code(code) - - # 2. Compile LLVM (Incremental) - # We only rebuild the relevant library to save time - build_cmd = ["ninja", "-C", self.build_dir, "lib/Analysis/AEInlineAdvisor.o"] - if subprocess.run(build_cmd).returncode != 0: - return EvaluationResult(score=float('-inf'), error="Compilation Failed") - - # Link the final tool (e.g., opt or clang) - subprocess.run(["ninja", "-C", self.build_dir, "bin/opt"]) - - # 3. 
Inner Loop: Hyperparameter Tuning (The Magellan "Secret Sauce") - # Extract params defined in the C++ comments - params = self._extract_hyperparams(code) - - if not params: - # No params to tune, just run once - score = self._run_benchmark({}) - return EvaluationResult(score=score) - - # Use Optuna to tune the exposed flags - study = optuna.create_study(direction="maximize") - study.optimize(lambda trial: self._objective(trial, params), n_trials=20) - - best_score = study.best_value - best_params = study.best_params - - return EvaluationResult( - score=best_score, - metadata={"tuned_params": best_params} - ) - - def _objective(self, trial, params_schema): - # Map trial suggestions to LLVM flags - # e.g., -ae-inline-base-threshold=255 - flags = [] - for name, type_, min_v, max_v in params_schema: - val = trial.suggest_int(name, int(min_v), int(max_v)) - flags.append(f"-{name}={val}") - - return self._run_benchmark(flags) - - def _run_benchmark(self, flags): - # Execute the benchmark script with the tuned flags - cmd = [self.benchmark_script] + flags - result = subprocess.run(cmd, capture_output=True, text=True) - # Parse output for binary size reduction or execution speed - return self._parse_score(result.stdout) - - def _extract_hyperparams(self, code): - # Regex to find lines like: // [hyperparam]: name, type, min, max - pattern = r"//\s*\[hyperparam\]:\s*([\w-]+),\s*(\w+),\s*(\d+),\s*(\d+)" - return re.findall(pattern, code) - - def _inject_code(self, code): - target_path = "llvm-project/llvm/lib/Analysis/AEInlineAdvisor.cpp" - with open(target_path, "w") as f: - f.write(code) \ No newline at end of file +import sys +import tempfile +from pathlib import Path + +# Ensure tasks package is importable when run standalone +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from tasks.llvm_bench import EvalConfig + +_EVOLVE_BLOCK_RE = re.compile( + r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", + re.DOTALL, +) + + +def extract_evolve_block(code): + """Extract the EVOLVE-BLOCK content from C++ source code.""" + m = _EVOLVE_BLOCK_RE.search(code) + if m: + return m.group(2) + return code + + +def inject_evolve_block(template, block): + """Replace EVOLVE-BLOCK in *template* with new *block* content.""" + return _EVOLVE_BLOCK_RE.sub( + lambda m: m.group(1) + block + m.group(3), + template, + ) + + +# Task → (target_file, default baseline overrides) +_TASK_CONFIG = { + "llvm_inlining": { + "target_file": "llvm/lib/Analysis/EvolvedInlineCost.cpp", + }, + "loop_unrolling": { + "target_file": "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp", + "baseline_file": str( + Path(__file__).resolve().parent + / "tasks" / "loop_unrolling" / "baseline_unroll.json" + ), + }, + "regalloc_priority": { + "target_file": "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp", + "baseline_file": str( + Path(__file__).resolve().parent + / "tasks" / "regalloc_priority" / "baseline_regalloc.json" + ), + }, +} + + +def _import_evaluate(task_name): + """Import the task-specific evaluate function.""" + if task_name == "llvm_inlining": + from tasks.llvm_inlining.evaluate import evaluate + elif task_name == "loop_unrolling": + from tasks.loop_unrolling.evaluate import evaluate + elif task_name == "regalloc_priority": + from tasks.regalloc_priority.evaluate import evaluate + else: + raise ValueError(f"Unknown task: {task_name}") + return evaluate + + +def make_evaluator(task_name, config=None): + """Create an evaluator function for a given task. 
+ + Returns a callable ``code_str -> (score, side_info)`` matching GEPA's + evaluator protocol. *side_info* is a dict that may contain a + ``"Feedback"`` key with ASI markdown text. + + The same evaluator works with OpenEvolve — the score is extracted from + the returned tuple's first element. + """ + evaluate = _import_evaluate(task_name) + + if config is None: + tc = _TASK_CONFIG[task_name] + config = EvalConfig.from_env(tc["target_file"], **{ + k: v for k, v in tc.items() if k != "target_file" + }) + + def evaluator(code_str): + """Write code to temp file, evaluate, return (score, side_info).""" + with tempfile.NamedTemporaryFile( + mode="w", suffix=".cpp", delete=False, prefix="evolve_" + ) as f: + f.write(code_str) + tmp_path = f.name + try: + result = evaluate(tmp_path, config=config) + if isinstance(result, dict): + score = result.get("combined_score", 0.0) + side_info = {} + else: + # EvaluationResult from OpenEvolve + score = result.metrics.get("combined_score", 0.0) + if hasattr(result, "artifacts") and "asi" in result.artifacts: + side_info = {"Feedback": result.artifacts["asi"]} + else: + side_info = {} + return score, side_info + finally: + os.unlink(tmp_path) + + return evaluator diff --git a/src/mlirAgent/evolve/gepa_adapter.py b/src/mlirAgent/evolve/gepa_adapter.py deleted file mode 100644 index 2e9082b..0000000 --- a/src/mlirAgent/evolve/gepa_adapter.py +++ /dev/null @@ -1,91 +0,0 @@ -"""GEPA adapter for LLVM heuristic evolution. - -Bridges GEPA's ``optimize_anything`` API with our LLVM benchmark evaluator. -Handles EVOLVE-BLOCK extraction, code injection, and score retrieval. -""" - -import os -import re -import sys -import tempfile -from pathlib import Path - -# Ensure tasks package is importable -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -from tasks.llvm_bench import EvalConfig - -_EVOLVE_BLOCK_RE = re.compile( - r"(// EVOLVE-BLOCK-START\n)(.*?)(// EVOLVE-BLOCK-END)", - re.DOTALL, -) - - -def extract_evolve_block(code): - """Extract the EVOLVE-BLOCK content from C++ source code.""" - m = _EVOLVE_BLOCK_RE.search(code) - if m: - return m.group(2) - return code - - -def inject_evolve_block(template, block): - """Replace EVOLVE-BLOCK in *template* with new *block* content.""" - return _EVOLVE_BLOCK_RE.sub( - lambda m: m.group(1) + block + m.group(3), - template, - ) - - -def make_evaluator(task_name, config=None): - """Create an evaluator function for GEPA. - - Returns a callable ``code_str -> (score, side_info)`` matching GEPA's - evaluator protocol. *side_info* is a dict that may contain a - ``"Feedback"`` key with ASI markdown text. 
- """ - if task_name == "llvm_inlining": - from tasks.llvm_inlining.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/Analysis/EvolvedInlineCost.cpp" - ) - elif task_name == "loop_unrolling": - from tasks.loop_unrolling.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/Transforms/Scalar/EvolvedLoopUnroll.cpp" - ) - elif task_name == "regalloc_priority": - from tasks.regalloc_priority.evaluate import evaluate - if config is None: - config = EvalConfig.from_env( - "llvm/lib/CodeGen/EvolvedRegAllocPriority.cpp" - ) - else: - raise ValueError(f"Unknown task: {task_name}") - - def evaluator(code_str): - """Write code to temp file, evaluate, return (score, side_info).""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".cpp", delete=False, prefix="gepa_" - ) as f: - f.write(code_str) - tmp_path = f.name - try: - result = evaluate(tmp_path, config=config) - if isinstance(result, dict): - score = result.get("combined_score", 0.0) - side_info = {} - else: - # EvaluationResult from OpenEvolve - score = result.metrics.get("combined_score", 0.0) - if hasattr(result, "artifacts") and "asi" in result.artifacts: - side_info = {"Feedback": result.artifacts["asi"]} - else: - side_info = {} - return score, side_info - finally: - os.unlink(tmp_path) - - return evaluator diff --git a/src/mlirAgent/evolve/gepa_run.py b/src/mlirAgent/evolve/gepa_run.py deleted file mode 100644 index e2f10f9..0000000 --- a/src/mlirAgent/evolve/gepa_run.py +++ /dev/null @@ -1,248 +0,0 @@ -"""CLI runner for GEPA on LLVM evolution tasks. - -Usage:: - - python gepa_run.py --task llvm_inlining [--prompts-dir gepa_prompts] - python gepa_run.py --task llvm_inlining --max-evals 2 --auto-respond - -Requires ``pip install gepa`` and environment variables: - - LLVM_SRC_PATH: path to LLVM source tree - - EVOLVE_BUILD_DIR: path to LLVM build directory -""" - -import argparse -import json -import os -import re -import sys -import threading -import time -from pathlib import Path - -# Ensure local packages are importable -sys.path.insert(0, str(Path(__file__).resolve().parent)) - -# Task → initial source file mapping -_TASK_INITIAL = { - "llvm_inlining": "tasks/llvm_inlining/initial.cpp", - "loop_unrolling": "tasks/loop_unrolling/initial.cpp", - "regalloc_priority": "tasks/regalloc_priority/initial.cpp", -} - -# Task → objective string for GEPA -_TASK_OBJECTIVE = { - "llvm_inlining": ( - "Maximize binary size reduction across CTMark benchmarks " - "by modifying the inlining cost heuristic." - ), - "loop_unrolling": ( - "Maximize runtime speedup across CTMark benchmarks " - "by modifying the loop unrolling heuristic." - ), - "regalloc_priority": ( - "Maximize runtime speedup across CTMark benchmarks " - "by modifying the register allocation priority function." - ), -} - -_TASK_BACKGROUND = ( - "You are modifying a C++ heuristic function in LLVM's optimization pipeline. " - "The function is compiled into the opt/llc tools and evaluated against CTMark " - "benchmarks (real-world C/C++ programs). The evaluator returns a score based on " - "binary size reduction and/or runtime speedup vs the default LLVM heuristic. " - "Higher scores are better. The source code uses LLVM APIs (cl::opt for flags, " - "InlineCost, LoopUnrollResult, etc.). Expose tunable constants as " - "// [hyperparam]: flag-name, type, min, max comments for the autotuner." 
-) - - -def _auto_respond_thread(prompts_dir, initial_code, poll_interval=1.0): - """Background thread that auto-creates response files for smoke testing. - - Watches for new prompt_NNN.md files and creates prompt_NNN.response.md - with a trivially modified version of the initial code. - """ - seen = set() - prompt_re = re.compile(r"^prompt_(\d+)\.md$") - - while True: - try: - for fname in os.listdir(prompts_dir): - m = prompt_re.match(fname) - if not m: - continue - num = m.group(1) - response_name = f"prompt_{num}.response.md" - if response_name in seen: - continue - response_path = os.path.join(prompts_dir, response_name) - if os.path.exists(response_path): - seen.add(response_name) - continue - - # Create a trivially modified version of the code - modified = initial_code.replace( - "// EVOLVE-BLOCK-START", - f"// EVOLVE-BLOCK-START\n// Auto-response iteration {num}", - ) - with open(response_path, "w") as f: - f.write(f"```cpp\n{modified}\n```\n") - seen.add(response_name) - print(f" [auto-respond] Created {response_name}") - except OSError: - pass - time.sleep(poll_interval) - - -def main(): - parser = argparse.ArgumentParser( - description="Run GEPA on LLVM heuristic evolution tasks" - ) - parser.add_argument( - "--task", required=True, - choices=list(_TASK_INITIAL.keys()), - help="Task to optimize", - ) - parser.add_argument( - "--initial", default=None, - help="Path to initial C++ source (overrides default)", - ) - parser.add_argument( - "--prompts-dir", default="gepa_prompts", - help="Directory for prompt/response files (default: gepa_prompts)", - ) - parser.add_argument( - "--poll-interval", type=float, default=2.0, - help="Poll interval for response files in seconds (default: 2.0)", - ) - parser.add_argument( - "--max-evals", type=int, default=10, - help="Maximum evaluator calls (default: 10)", - ) - parser.add_argument( - "--output-dir", default=None, - help="GEPA run directory for state/resume (default: /run)", - ) - parser.add_argument( - "--output", default=None, - help="Path to save best code (default: tasks//gepa_best.cpp)", - ) - parser.add_argument( - "--auto-respond", action="store_true", - help="Auto-create response files for smoke testing", - ) - args = parser.parse_args() - - # Import GEPA - try: - from gepa.optimize_anything import ( - optimize_anything, GEPAConfig, EngineConfig, ReflectionConfig, - ) - except ImportError: - print("Error: gepa not installed. 
Run: pip install gepa") - sys.exit(1) - - from gepa_manual_lm import ManualLM - from gepa_adapter import make_evaluator - - # Find initial program - base_dir = Path(__file__).resolve().parent - if args.initial: - initial_file = Path(args.initial) - else: - initial_file = base_dir / _TASK_INITIAL[args.task] - - if not initial_file.exists(): - print(f"Error: initial source not found at {initial_file}") - sys.exit(1) - - with open(initial_file) as f: - initial_code = f.read() - - # Create LM and evaluator - lm = ManualLM( - prompts_dir=args.prompts_dir, - poll_interval=args.poll_interval, - ) - evaluator = make_evaluator(args.task) - - run_dir = args.output_dir or os.path.join(args.prompts_dir, "run") - os.makedirs(run_dir, exist_ok=True) - - print(f"{'=' * 60}") - print(f"GEPA Runner") - print(f" Task: {args.task}") - print(f" Initial code: {initial_file}") - print(f" Prompts dir: {args.prompts_dir}") - print(f" Max evals: {args.max_evals}") - print(f" Run dir: {run_dir}") - print(f" Auto-respond: {args.auto_respond}") - print(f"{'=' * 60}") - print() - - # Start auto-responder thread if requested - if args.auto_respond: - t = threading.Thread( - target=_auto_respond_thread, - args=(args.prompts_dir, initial_code, args.poll_interval), - daemon=True, - ) - t.start() - print(" [auto-respond] Background responder started") - - # Run GEPA with real API - objective = _TASK_OBJECTIVE[args.task] - config = GEPAConfig( - engine=EngineConfig( - max_metric_calls=args.max_evals, - parallel=False, - run_dir=run_dir, - use_cloudpickle=False, - raise_on_exception=False, - ), - reflection=ReflectionConfig( - reflection_lm=lm, - ), - ) - - result = optimize_anything( - seed_candidate=initial_code, - evaluator=evaluator, - objective=objective, - background=_TASK_BACKGROUND, - config=config, - ) - - print() - print(f"{'=' * 60}") - print(f"GEPA Results:") - print(f" Best candidate: {len(result.best_candidate)} chars") - print(f" Num candidates: {result.num_candidates}") - print(f" Total evals: {result.total_metric_calls}") - print(f"{'=' * 60}") - - # Save best code - output_path = args.output or str( - base_dir / "tasks" / args.task / "gepa_best.cpp" - ) - os.makedirs(os.path.dirname(output_path), exist_ok=True) - with open(output_path, "w") as f: - f.write(result.best_candidate) - print(f"Best code saved to: {output_path}") - - # Save summary - summary = { - "task": args.task, - "num_candidates": result.num_candidates, - "total_metric_calls": result.total_metric_calls, - "output_path": output_path, - "run_dir": run_dir, - } - summary_path = os.path.join(args.prompts_dir, "summary.json") - with open(summary_path, "w") as f: - json.dump(summary, f, indent=2) - print(f"Summary saved to: {summary_path}") - - -if __name__ == "__main__": - main() diff --git a/src/mlirAgent/evolve/manual_run.py b/src/mlirAgent/evolve/manual_run.py deleted file mode 100644 index 59cb285..0000000 --- a/src/mlirAgent/evolve/manual_run.py +++ /dev/null @@ -1,444 +0,0 @@ -"""Orchestrator for running OpenEvolve with ManualLLM + Claude Code as the responder. 
- -Usage: - # Auto mode: Claude Code sub-agent responds to each prompt - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --auto - - # Wait mode: user manually creates response files - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --wait - - # Resume from checkpoint - python -m mlirAgent.evolve.manual_run --example function_minimization --iterations 10 --auto \ - --resume experiments/run_YYYYMMDD_HHMMSS/openevolve_output/checkpoints/checkpoint_5 -""" - -import argparse -import asyncio -import json -import os -import sys -import time -from datetime import datetime -from pathlib import Path - -# Ensure openevolve is importable -_MLIREVOLVE_ROOT = Path(__file__).resolve().parent.parent.parent.parent -_OE_PATH = str(_MLIREVOLVE_ROOT / "third_party" / "openevolve") -if _OE_PATH not in sys.path: - sys.path.insert(0, _OE_PATH) - -from openevolve.config import Config as OEConfig, LLMModelConfig, load_config -from openevolve.controller import OpenEvolve -from openevolve.llm.manual import create_manual_llm - - -EXAMPLES = { - "function_minimization": { - "initial_program": _OE_PATH + "/examples/function_minimization/initial_program.py", - "evaluator": _OE_PATH + "/examples/function_minimization/evaluator.py", - "file_suffix": ".py", - "language": "python", - }, - "llvm_inlining": { - "initial_program": str(Path(__file__).parent / "tasks/llvm_inlining/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/llvm_inlining/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, - "regalloc_priority": { - "initial_program": str(Path(__file__).parent / "tasks/regalloc_priority/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/regalloc_priority/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, - "loop_unrolling": { - "initial_program": str(Path(__file__).parent / "tasks/loop_unrolling/initial.cpp"), - "evaluator": str(Path(__file__).parent / "tasks/loop_unrolling/evaluate.py"), - "file_suffix": ".cpp", - "language": "cpp", - }, -} - - -def _build_config(args, prompts_dir: str) -> OEConfig: - """Build OpenEvolve config with ManualLLM injected.""" - # Set env var so ManualLLM instances (including in worker processes) find the prompts dir - os.environ["MANUAL_LLM_PROMPTS_DIR"] = prompts_dir - - # Load framework YAML as base - configs_dir = str(_MLIREVOLVE_ROOT / "configs") - fw_yaml = os.path.join(configs_dir, "frameworks", "manual.yaml") - if os.path.exists(fw_yaml): - cfg = OEConfig.from_yaml(fw_yaml) - else: - cfg = OEConfig() - - # Override iterations - if args.iterations: - cfg.max_iterations = args.iterations - - # Set file suffix / language from example - if args.example and args.example in EXAMPLES: - ex = EXAMPLES[args.example] - cfg.file_suffix = ex["file_suffix"] - cfg.language = ex["language"] - - # Inject ManualLLM via init_client (module-level function, picklable) - manual_model = LLMModelConfig( - name="manual", - init_client=create_manual_llm, - weight=1.0, - ) - - cfg.llm.models = [manual_model] - cfg.llm.evaluator_models = [manual_model] - - # Small population for manual speed - cfg.database.population_size = 10 - cfg.database.archive_size = 10 - cfg.database.num_islands = 1 - cfg.database.migration_interval = 999 - cfg.checkpoint_interval = 1 - cfg.diff_based_evolution = False - - return cfg - - -def _auto_respond(prompts_dir: str, stop_event: asyncio.Event): - """Watch prompts_dir for new prompt files and auto-respond using a simple heuristic improver.""" - import 
glob - import re - - responded = set() - # Match only prompt_NNN.md (not .response.md files) - pattern = re.compile(r"prompt_\d+\.md$") - while not stop_event.is_set(): - prompt_files = sorted( - f for f in glob.glob(os.path.join(prompts_dir, "prompt_*.md")) - if pattern.search(f) - ) - for pf in prompt_files: - if pf in responded: - continue - resp_path = pf.replace(".md", ".response.md") - if os.path.exists(resp_path): - responded.add(pf) - continue - - # Read the prompt - with open(pf) as f: - prompt_text = f.read() - - # Generate a response: extract parent code and propose improvement - response = _generate_improvement(prompt_text) - with open(resp_path, "w") as f: - f.write(response) - responded.add(pf) - print(f" [auto] Responded to {os.path.basename(pf)}") - - time.sleep(1) - - -def _generate_improvement(prompt_text: str) -> str: - """Generate an improved version of the code from the prompt. - - Produces a diff-style response with a concrete improvement to the search algorithm. - """ - import random - - # Choose a random improvement strategy - strategies = [ - _strategy_simulated_annealing, - _strategy_adaptive_step, - _strategy_multi_restart, - _strategy_gradient_estimate, - ] - strategy = random.choice(strategies) - return strategy() - - -def _strategy_simulated_annealing() -> str: - return '''Here's an improved search algorithm using simulated annealing: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - temperature = 2.0 - cooling_rate = 0.995 - step_size = 1.0 - for i in range(iterations): - # Simulated annealing with adaptive step size - x = best_x + np.random.normal(0, step_size) - y = best_y + np.random.normal(0, step_size) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - delta = value - best_value - if delta < 0 or np.random.random() < np.exp(-delta / max(temperature, 1e-10)): - best_value = value - best_x, best_y = x, y - - temperature *= cooling_rate - step_size = max(0.01, step_size * 0.999) ->>>>>>> REPLACE -''' - - -def _strategy_adaptive_step() -> str: - return '''Here's an improved search algorithm with adaptive step sizes and local refinement: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - step = (bounds[1] - bounds[0]) / 4.0 - no_improve = 0 - for i in range(iterations): - if no_improve > 50: - # Random restart - best_x = np.random.uniform(bounds[0], bounds[1]) - best_y = np.random.uniform(bounds[0], bounds[1]) - best_value = evaluate_function(best_x, best_y) - step = (bounds[1] - bounds[0]) / 4.0 - no_improve = 0 - - x = best_x + np.random.uniform(-step, step) - y = best_y + np.random.uniform(-step, step) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y - no_improve = 0 - else: - no_improve += 1 - if no_improve % 20 == 0: - step *= 0.8 ->>>>>>> REPLACE -''' - - -def _strategy_multi_restart() -> str: - return '''Here's an improved search algorithm with multiple restarts and basin hopping: - -<<<<<<< SEARCH - 
for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - num_restarts = 5 - iters_per_restart = iterations // num_restarts - for restart in range(num_restarts): - # Random restart point - cx = np.random.uniform(bounds[0], bounds[1]) - cy = np.random.uniform(bounds[0], bounds[1]) - cv = evaluate_function(cx, cy) - step = 1.0 - - for i in range(iters_per_restart): - x = cx + np.random.normal(0, step) - y = cy + np.random.normal(0, step) - x = np.clip(x, bounds[0], bounds[1]) - y = np.clip(y, bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < cv: - cv = value - cx, cy = x, y - step = max(0.01, step * 0.998) - - if cv < best_value: - best_value = cv - best_x, best_y = cx, cy ->>>>>>> REPLACE -''' - - -def _strategy_gradient_estimate() -> str: - return '''Here's an improved search algorithm using numerical gradient estimation: - -<<<<<<< SEARCH - for _ in range(iterations): - # Simple random search - x = np.random.uniform(bounds[0], bounds[1]) - y = np.random.uniform(bounds[0], bounds[1]) - value = evaluate_function(x, y) - - if value < best_value: - best_value = value - best_x, best_y = x, y -======= - lr = 0.1 - eps = 1e-4 - for i in range(iterations): - # Estimate gradient via finite differences - fx = evaluate_function(best_x + eps, best_y) - fy = evaluate_function(best_x, best_y + eps) - f0 = evaluate_function(best_x, best_y) - gx = (fx - f0) / eps - gy = (fy - f0) / eps - - # Gradient descent step with noise for exploration - noise_scale = max(0.01, 0.5 * (1 - i / iterations)) - nx = best_x - lr * gx + np.random.normal(0, noise_scale) - ny = best_y - lr * gy + np.random.normal(0, noise_scale) - nx = np.clip(nx, bounds[0], bounds[1]) - ny = np.clip(ny, bounds[0], bounds[1]) - nv = evaluate_function(nx, ny) - - if nv < best_value: - best_value = nv - best_x, best_y = nx, ny - - lr = max(0.001, lr * 0.999) ->>>>>>> REPLACE -''' - - -async def _run(args): - """Main async entry point.""" - # Set up experiment directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - exp_dir = os.path.join(_MLIREVOLVE_ROOT, "experiments", f"run_{timestamp}") - prompts_dir = os.path.join(exp_dir, "prompts") - oe_output_dir = os.path.join(exp_dir, "openevolve_output") - scores_path = os.path.join(exp_dir, "scores.jsonl") - os.makedirs(prompts_dir, exist_ok=True) - os.makedirs(oe_output_dir, exist_ok=True) - - print(f"Experiment directory: {exp_dir}") - print(f"Prompts directory: {prompts_dir}") - print(f"Scores log: {scores_path}") - - # Build config - cfg = _build_config(args, prompts_dir) - - # Resolve example paths - if args.example: - if args.example not in EXAMPLES: - print(f"Unknown example: {args.example}. 
Available: {list(EXAMPLES.keys())}") - return 1 - ex = EXAMPLES[args.example] - initial_program = ex["initial_program"] - evaluator = ex["evaluator"] - else: - print("Error: --example is required for now") - return 1 - - # Initialize OpenEvolve - openevolve = OpenEvolve( - initial_program_path=initial_program, - evaluation_file=evaluator, - config=cfg, - output_dir=oe_output_dir, - ) - - # Load checkpoint if resuming - if args.resume: - if not os.path.exists(args.resume): - print(f"Error: Checkpoint not found: {args.resume}") - return 1 - print(f"Resuming from checkpoint: {args.resume}") - openevolve.database.load(args.resume) - - # Start auto-responder if --auto - stop_event = asyncio.Event() - responder_task = None - if args.auto: - print("Auto mode: built-in heuristic strategies will respond to prompts") - loop = asyncio.get_event_loop() - responder_task = loop.run_in_executor(None, _auto_respond, prompts_dir, stop_event) - - # Hook into the database to log scores - _original_add = openevolve.database.add - - def _logging_add(program, *a, **kw): - result = _original_add(program, *a, **kw) - score_entry = { - "timestamp": time.time(), - "iteration": program.iteration_found, - "program_id": program.id, - "metrics": program.metrics, - "generation": program.generation, - } - best = openevolve.database.get_best_program() - if best: - score_entry["best_score"] = best.metrics.get("combined_score", 0) - score_entry["best_id"] = best.id - with open(scores_path, "a") as f: - f.write(json.dumps(score_entry, default=str) + "\n") - return result - - openevolve.database.add = _logging_add - - # Run evolution - try: - print(f"\nStarting OpenEvolve with ManualLLM ({cfg.max_iterations} iterations)...") - best = await openevolve.run( - iterations=cfg.max_iterations, - checkpoint_path=args.resume, - ) - if best: - print(f"\nEvolution complete! Best metrics:") - for k, v in best.metrics.items(): - if isinstance(v, float): - print(f" {k}: {v:.4f}") - else: - print(f" {k}: {v}") - else: - print("\nNo valid programs found.") - finally: - stop_event.set() - if responder_task: - # Give responder time to notice the stop event - await asyncio.sleep(2) - - return 0 - - -def main(): - parser = argparse.ArgumentParser(description="Run OpenEvolve with ManualLLM") - parser.add_argument("--example", "-e", choices=list(EXAMPLES.keys()), - help="Built-in example to run") - parser.add_argument("--iterations", "-n", type=int, default=10, - help="Number of iterations (default: 10)") - parser.add_argument("--auto", action="store_true", - help="Auto-respond with built-in heuristic strategies") - parser.add_argument("--wait", action="store_true", - help="Wait for manual response files (human or external tool)") - parser.add_argument("--resume", help="Path to checkpoint directory to resume from") - - args = parser.parse_args() - - if not args.auto and not args.wait: - args.auto = True # Default to auto mode - - return asyncio.run(_run(args)) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/mlirAgent/evolve/providers.py b/src/mlirAgent/evolve/providers.py deleted file mode 100644 index 6e8b8de..0000000 --- a/src/mlirAgent/evolve/providers.py +++ /dev/null @@ -1,46 +0,0 @@ -"""LLM provider configuration loader. - -Reads YAML agent configs from configs/agents/ and returns dicts -consumable by framework adapters (OpenEvolve, ShinkaEvolve). 
-""" - -import os -from pathlib import Path -from typing import Dict, Any, Optional - -import yaml - -from ..config import Config - - -def load_agent_config(agent_name: str, configs_dir: Optional[str] = None) -> Dict[str, Any]: - """Load an agent YAML config by name. - - Args: - agent_name: Name without extension, e.g. "claude_opus". - configs_dir: Override for configs directory. - - Returns: - Dict with keys: api_base, model, api_key_env, temperature, max_tokens. - The api_key is resolved from the environment variable named in api_key_env. - """ - base = Path(configs_dir or Config.EVOLVE_CONFIGS_DIR) / "agents" - path = base / f"{agent_name}.yaml" - if not path.exists(): - raise FileNotFoundError(f"Agent config not found: {path}") - - with open(path) as f: - cfg = yaml.safe_load(f) - - # Resolve API key from environment - key_env = cfg.get("api_key_env", "") - cfg["api_key"] = os.environ.get(key_env, "") - return cfg - - -def list_agents(configs_dir: Optional[str] = None) -> list: - """List available agent config names.""" - base = Path(configs_dir or Config.EVOLVE_CONFIGS_DIR) / "agents" - if not base.exists(): - return [] - return sorted(p.stem for p in base.glob("*.yaml")) diff --git a/src/mlirAgent/evolve/run.py b/src/mlirAgent/evolve/run.py index 7e2cbbe..1826f41 100644 --- a/src/mlirAgent/evolve/run.py +++ b/src/mlirAgent/evolve/run.py @@ -1,124 +1,154 @@ -"""CLI entry point for the evolve harness. - -Usage: - python -m mlirAgent.evolve.run --task llvm_inlining --framework openevolve --agent claude_opus - python -m mlirAgent.evolve.run --list - python -m mlirAgent.evolve.run --task llvm_inlining --framework openevolve --agent claude_opus --dry-run -""" - -import argparse -import json -import sys -from pathlib import Path -from typing import Dict, Any, Optional - -import yaml - -from ..config import Config -from .adapters import ADAPTERS -from .providers import load_agent_config, list_agents - - -# Registry of available tasks -TASKS = { - "llvm_inlining": "mlirAgent.evolve.tasks.llvm_inlining.task.LLVMInliningTask", -} +"""Unified entry point for LLVM heuristic evolution. +Supports two evolution frameworks dispatched via ``--framework``: + - **gepa** (default): GEPA optimize_anything() with ASI as native side-info + - **openevolve**: MAP-Elites with ManualLLM -def _load_task(task_name: str, configs_dir: str) -> Any: - """Instantiate a task by name, loading its YAML config if present.""" - if task_name not in TASKS: - raise ValueError(f"Unknown task: {task_name}. Available: {list(TASKS.keys())}") +Both frameworks share the same evaluator pipeline (``tasks/llvm_bench.py``), +which includes Optuna hyperparameter tuning when ``[hyperparam]`` annotations +are present in the evolved C++ code. 
- # Import task class - module_path, class_name = TASKS[task_name].rsplit(".", 1) - import importlib - mod = importlib.import_module(module_path) - task_cls = getattr(mod, class_name) +Usage:: - # Load task YAML config - task_yaml = Path(configs_dir) / "tasks" / f"{task_name}.yaml" - task_config = {} - if task_yaml.exists(): - with open(task_yaml) as f: - task_config = yaml.safe_load(f) or {} + # GEPA with auto-respond (smoke test) + python run.py --task llvm_inlining --max-evals 2 --auto - return task_cls(task_config) + # GEPA manual mode (Claude Code or human writes response files) + python run.py --task llvm_inlining --max-evals 10 + # OpenEvolve + python run.py --framework openevolve --task llvm_inlining --max-evals 10 --auto -def _load_framework_config(framework_name: str, configs_dir: str) -> Dict[str, Any]: - """Load framework YAML config.""" - fw_yaml = Path(configs_dir) / "frameworks" / f"{framework_name}.yaml" - if not fw_yaml.exists(): - raise FileNotFoundError(f"Framework config not found: {fw_yaml}") - with open(fw_yaml) as f: - return yaml.safe_load(f) or {} + # Override Optuna trials + python run.py --task llvm_inlining --max-evals 10 --optuna-trials 5 +""" +import argparse +import os +import sys +from datetime import datetime +from pathlib import Path -def _list_available(configs_dir: str): - """Print available agents, frameworks, and tasks.""" - print("Available configurations:\n") +_BASE_DIR = Path(__file__).resolve().parent - print(" Agents:") - agents_dir = Path(configs_dir) / "agents" - if agents_dir.exists(): - for p in sorted(agents_dir.glob("*.yaml")): - print(f" - {p.stem}") - else: - print(" (none)") +# Ensure local packages are importable +if str(_BASE_DIR) not in sys.path: + sys.path.insert(0, str(_BASE_DIR)) - print("\n Frameworks:") - fw_dir = Path(configs_dir) / "frameworks" - if fw_dir.exists(): - for p in sorted(fw_dir.glob("*.yaml")): - print(f" - {p.stem}") - else: - print(" (none)") +_TASKS = ["llvm_inlining", "loop_unrolling", "regalloc_priority"] - print("\n Tasks:") - for name in sorted(TASKS.keys()): - print(f" - {name}") +_TASK_INITIAL = { + "llvm_inlining": _BASE_DIR / "tasks" / "llvm_inlining" / "initial.cpp", + "loop_unrolling": _BASE_DIR / "tasks" / "loop_unrolling" / "initial.cpp", + "regalloc_priority": _BASE_DIR / "tasks" / "regalloc_priority" / "initial.cpp", +} def main(): parser = argparse.ArgumentParser( - description="Evolve harness: evolutionary compiler optimization" + description="Evolve LLVM heuristics via LLM-guided search", + ) + parser.add_argument( + "--framework", "-f", default="gepa", + choices=["gepa", "openevolve"], + help="Evolution framework (default: gepa)", + ) + parser.add_argument( + "--task", "-t", required=True, + choices=_TASKS, + help="LLVM task to optimize", + ) + parser.add_argument( + "--initial", default=None, + help="Override initial C++ source path", + ) + parser.add_argument( + "--max-evals", "-n", type=int, default=10, + help="Max evaluator calls / iterations (default: 10)", + ) + parser.add_argument( + "--auto", action="store_true", + help="Auto-respond to prompts (for smoke testing)", + ) + parser.add_argument( + "--prompts-dir", default=None, + help="Prompt/response directory (default: auto)", + ) + parser.add_argument( + "--output", default=None, + help="Save best code to this path", + ) + parser.add_argument( + "--resume", default=None, + help="Resume from checkpoint (OpenEvolve only)", + ) + parser.add_argument( + "--poll-interval", type=float, default=2.0, + help="ManualLM poll interval in seconds 
(default: 2.0)", + ) + parser.add_argument( + "--optuna-trials", type=int, default=None, + help="Optuna inner-loop trials (overrides EVOLVE_OPTUNA_TRIALS env)", ) - parser.add_argument("--task", "-t", help="Task name (e.g. llvm_inlining)") - parser.add_argument("--framework", "-f", help="Framework name (e.g. openevolve)") - parser.add_argument("--agent", "-a", help="Agent config name (e.g. claude_opus)") - parser.add_argument("--list", action="store_true", help="List available configs") - parser.add_argument("--dry-run", action="store_true", help="Print config without running") - parser.add_argument("--max-iterations", type=int, help="Override max iterations") - parser.add_argument("--configs-dir", default=Config.EVOLVE_CONFIGS_DIR, - help="Path to configs directory") - args = parser.parse_args() - configs_dir = args.configs_dir - - if args.list: - _list_available(configs_dir) - return 0 - - if not all([args.task, args.framework, args.agent]): - parser.error("--task, --framework, and --agent are required (or use --list)") - # Load everything - task = _load_task(args.task, configs_dir) - agent_config = load_agent_config(args.agent, configs_dir) - framework_config = _load_framework_config(args.framework, configs_dir) - - # Get adapter - if args.framework not in ADAPTERS: - print(f"Error: Unknown framework '{args.framework}'. Available: {list(ADAPTERS.keys())}") - return 1 - - adapter = ADAPTERS[args.framework]() - adapter.configure(task, agent_config, framework_config) + # Set Optuna env var if specified + if args.optuna_trials is not None: + os.environ["EVOLVE_OPTUNA_TRIALS"] = str(args.optuna_trials) + + # Set up experiment directory + mlirevolve_root = _BASE_DIR.parent.parent.parent + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + exp_dir = str(mlirevolve_root / "experiments" / f"run_{timestamp}") + prompts_dir = args.prompts_dir or os.path.join(exp_dir, "prompts") + os.makedirs(prompts_dir, exist_ok=True) + os.makedirs(exp_dir, exist_ok=True) + + # Resolve initial program + initial_file = args.initial or str(_TASK_INITIAL[args.task]) + if not os.path.exists(initial_file): + print(f"Error: Initial source not found: {initial_file}") + sys.exit(1) + + # Print header + optuna_trials = args.optuna_trials or os.environ.get("EVOLVE_OPTUNA_TRIALS", "20") + print(f"{'=' * 60}") + print(f"Evolve LLVM Heuristics") + print(f" Framework: {args.framework}") + print(f" Task: {args.task}") + print(f" Initial: {initial_file}") + print(f" Max evals: {args.max_evals}") + print(f" Optuna trials: {optuna_trials}") + print(f" Auto-respond: {args.auto}") + print(f" Prompts: {prompts_dir}") + print(f" Experiment: {exp_dir}") + print(f"{'=' * 60}") + print() + + # Dispatch to framework adapter + if args.framework == "gepa": + from adapters import GEPAAdapter + adapter = GEPAAdapter() + else: + from adapters import OpenEvolveAdapter + adapter = OpenEvolveAdapter() + + result = adapter.run( + task=args.task, + initial_file=initial_file, + prompts_dir=prompts_dir, + max_evals=args.max_evals, + auto_respond=args.auto, + poll_interval=args.poll_interval, + output=args.output or os.path.join(exp_dir, "best.cpp"), + exp_dir=exp_dir, + resume=args.resume, + ) - # Run - result = adapter.launch(dry_run=args.dry_run, max_iterations=args.max_iterations) - print(json.dumps(result, indent=2, default=str)) + print() + print(f"{'=' * 60}") + print(f"Done. Results in: {exp_dir}") + print(f"{'=' * 60}") return 0
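
Since both entry points lean on the Optuna inner loop whenever `[hyperparam]` annotations appear in the evolved C++, here is a small sketch of that annotation format and one way it maps to `opt`/`llc` flags. The comment format comes from the `_GEPA_BACKGROUND` text above; the regex mirrors the one in the old `evaluate.py` deleted by this series, and the flag name is illustrative — the real `extract_hyperparams()` in `llvm_bench.py` may differ in detail.

```python
import re

# Illustrative evolved-C++ fragment carrying a tunable constant.
cpp_snippet = """
// [hyperparam]: evolved-inline-base-threshold, int, 25, 500
static cl::opt<int> BaseThreshold("evolved-inline-base-threshold",
                                  cl::init(225));
"""

# Extract (name, type, min, max) tuples and turn each into a command-line
# flag, the way the autotuner's inner loop would for one Optuna trial.
pattern = r"//\s*\[hyperparam\]:\s*([\w-]+),\s*(\w+),\s*(\d+),\s*(\d+)"
for name, type_, lo, hi in re.findall(pattern, cpp_snippet):
    midpoint = (int(lo) + int(hi)) // 2  # a trial would suggest a value in [lo, hi]
    print(f"-{name}={midpoint}")         # e.g. -evolved-inline-base-threshold=262
```
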