Skip to content

Commit 4fa4162

Browse files
committed
skeleton of gepa trainer
1 parent 42e0b08 commit 4fa4162

File tree

6 files changed

+227
-2
lines changed

6 files changed

+227
-2
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
SingleTurnRolloutProcessor,
1313
)
1414
from eval_protocol.pytest.evaluation_test import evaluation_test
15+
from eval_protocol.training import GEPATrainer
16+
from eval_protocol.training.gepa_utils import build_reflection_lm
1517

1618
SYSTEM_PROMPT = (
1719
"You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}."
@@ -131,3 +133,17 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
131133
metrics=metrics,
132134
)
133135
return row
136+
137+
138+
if __name__ == "__main__":
    # Smoke-test entrypoint: run GEPA prompt optimization over this eval.
    # Wrap the @evaluation_test-decorated eval in a GEPATrainer.
    trainer = GEPATrainer(test_aime25_pointwise)
    # Reflection model GEPA uses to propose improved prompts; the name must
    # be a key of REFLECTION_LM_CONFIGS (see gepa_utils).
    reflection_lm = build_reflection_lm("gpt-5")

    optimized_program = trainer.train(
        num_threads=32,  # parallelism for rollouts/metric calls
        track_stats=True,  # keep per-candidate statistics on the result
        reflection_minibatch_size=3,  # examples per reflection step
        reflection_lm=reflection_lm,
    )

    # NOTE(review): evaluate() is still a skeleton in GEPATrainer — confirm
    # what it returns before relying on this output.
    print(trainer.evaluate(optimized_program))

eval_protocol/trainable_gepa_design.md

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
- Specifies what is tunable (e.g., the system prompt) and how to adapt rows using a candidate.
2020
- Invokes a train routine (GEPA-based or otherwise).
2121

22-
- **training core**
22+
- **Training core**
2323
- Provides a single central abstraction:
2424
- **`EPParameters`**: Encapsulates everything `evaluation_test` knows about the eval in a structured form:
2525
- One field for every parameter that `evaluation_test` accepts (dataset sources, adapters, completion params, rollout processor, aggregation, thresholds, etc.), after parsing/env overrides.
@@ -68,7 +68,7 @@ setattr(dual_mode_wrapper, "__ep_params__", ep_params)
6868
- Rollout and mode information (processor, kwargs, concurrency limits, mode).
6969
- The training core can then **directly convert `__ep_params__` into an `EPParameters` instance** without maintaining a separate training-only config.
7070

71-
- training core will expose:
71+
- Training core will expose:
7272
- A factory like:
7373

7474
```python
@@ -199,3 +199,38 @@ class GEPATrainer:
199199
- After GEPA integration works for AIME:
200200
- Decide on the canonical way to treat GEPA’s `run_dir` and/or additional artifacts for tuned prompts.
201201
- Optionally add a small helper that knows how to “run evaluation once with best GEPA candidate” for CI workflows.
202+
203+
204+
Future work:
205+
206+
This is how GEPA (via DSPy) defines its evaluation metric:
207+
208+
def metric(
209+
gold: Example,
210+
pred: Prediction,
211+
trace: Optional[DSPyTrace] = None,
212+
pred_name: Optional[str] = None,
213+
pred_trace: Optional[DSPyTrace] = None,
214+
) -> float | ScoreWithFeedback:
215+
"""
216+
This function is called with the following arguments:
217+
- gold: The gold example.
218+
- pred: The predicted output.
219+
- trace: Optional. The trace of the program's execution.
220+
- pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which
221+
the feedback is being requested.
222+
- pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for.
223+
224+
Note the `pred_name` and `pred_trace` arguments. During optimization, GEPA will call the metric to obtain
225+
feedback for individual predictors being optimized. GEPA provides the name of the predictor in `pred_name`
226+
and the sub-trace (of the trace) corresponding to the predictor in `pred_trace`.
227+
If available at the predictor level, the metric should return {'score': float, 'feedback': str} corresponding
228+
to the predictor.
229+
If not available at the predictor level, the metric can also return a text feedback at the program level
230+
(using just the gold, pred and trace).
231+
If no feedback is returned, GEPA will use a simple text feedback consisting of just the score:
232+
f"This trajectory got a score of {score}."
233+
"""
234+
...
235+
236+
Ideally we would provide a generic way to turn an `@evaluation_test`-decorated function into a metric with this signature.

eval_protocol/training/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Package-relative import: a bare `from gepa_adapter import ...` is an
# absolute import in Python 3 and fails unless the package directory itself
# happens to be on sys.path.
from .gepa_adapter import GEPATrainer

__all__ = ["GEPATrainer"]
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
from typing import Any, Dict, Literal
2+
3+
import dspy
4+
from dspy.clients.lm import LM
5+
from dspy.primitives import Module
6+
from dspy.teleprompt.gepa.gepa import GEPA
7+
from gepa.core.adapter import ProposalFn
8+
from gepa.proposer.reflective_mutation.base import ReflectionComponentSelector
9+
10+
from eval_protocol.models import EPParameters, EvaluationRow
11+
from eval_protocol.pytest.types import TestFunction
12+
from eval_protocol.training.gepa_utils import REFLECTION_LM_CONFIGS
13+
from eval_protocol.training.utils import build_ep_parameters_from_test
14+
15+
16+
class GEPATrainer:
    """
    High-level entrypoint for running GEPA-style training against an existing
    `@evaluation_test`-decorated function.

    This class is intentionally minimal for now:
    - It captures `EPParameters` from the provided test function via
      `build_ep_parameters_from_test`.
    - It forwards GEPA configuration from `train` to `dspy.teleprompt.GEPA`.
    - Conversion between the EP eval and GEPA's program/metric/dataset
      abstractions is still a TODO (see the inline notes in `__init__`).
    """

    def __init__(self, test_fn: TestFunction) -> None:
        """
        Args:
            test_fn: The `@evaluation_test`-decorated function defining the eval.
        """
        self.test_fn = test_fn
        # Structured snapshot of everything `evaluation_test` knows about the
        # eval (datasets, completion params, rollout processor, ...).
        self.ep_params: EPParameters = build_ep_parameters_from_test(test_fn)

        # TODO: convert the EP test_fn into a GEPA metric; the metric must also
        # surface feedback text alongside the numeric score.
        self.metric = test_fn

        # TODO: converting between a program (dspy.Module) and an
        # @evaluation_test is a bit tricky.
        self.program = ...

        # TODO: derive train/val/test splits from the eval's input_dataset.
        self.train_set, self.val_set, self.test_set = (..., ..., ...)

    def train(
        self,
        auto: Literal["light", "medium", "heavy"] | None = None,
        max_full_evals: int | None = None,
        max_metric_calls: int | None = None,
        reflection_minibatch_size: int = 3,
        candidate_selection_strategy: Literal["pareto", "current_best"] = "pareto",
        reflection_lm: LM | None = None,
        skip_perfect_score: bool = True,
        add_format_failure_as_feedback: bool = False,
        instruction_proposer: ProposalFn | None = None,
        component_selector: ReflectionComponentSelector | str = "round_robin",
        use_merge: bool = True,
        max_merge_invocations: int | None = 5,
        num_threads: int | None = None,
        failure_score: float = 0.0,
        perfect_score: float = 1.0,
        log_dir: str | None = None,
        track_stats: bool = False,
        use_wandb: bool = False,
        wandb_api_key: str | None = None,
        wandb_init_kwargs: dict[str, Any] | None = None,
        track_best_outputs: bool = False,
        warn_on_score_mismatch: bool = True,
        enable_tool_optimization: bool = False,
        use_mlflow: bool = False,
        seed: int | None = 0,
        gepa_kwargs: dict[str, Any] | None = None,
    ) -> Module:
        """
        Run GEPA to optimize over candidates.

        All named parameters mirror the corresponding `dspy.teleprompt.GEPA`
        constructor arguments and are forwarded unchanged. Entries in
        `gepa_kwargs` override the named arguments, which also allows passing
        GEPA options not surfaced here.

        Returns:
            The optimized program produced by `GEPA.compile`.
        """
        gepa_args: dict[str, Any] = {
            "auto": auto,
            "max_full_evals": max_full_evals,
            "max_metric_calls": max_metric_calls,
            "reflection_minibatch_size": reflection_minibatch_size,
            "candidate_selection_strategy": candidate_selection_strategy,
            "reflection_lm": reflection_lm,
            "skip_perfect_score": skip_perfect_score,
            "add_format_failure_as_feedback": add_format_failure_as_feedback,
            "instruction_proposer": instruction_proposer,
            "component_selector": component_selector,
            "use_merge": use_merge,
            "max_merge_invocations": max_merge_invocations,
            "num_threads": num_threads,
            "failure_score": failure_score,
            "perfect_score": perfect_score,
            "log_dir": log_dir,
            "track_stats": track_stats,
            "use_wandb": use_wandb,
            "wandb_api_key": wandb_api_key,
            "wandb_init_kwargs": wandb_init_kwargs,
            "track_best_outputs": track_best_outputs,
            "warn_on_score_mismatch": warn_on_score_mismatch,
            "enable_tool_optimization": enable_tool_optimization,
            "use_mlflow": use_mlflow,
            "seed": seed,
        }
        # Explicit gepa_kwargs win over the named arguments above.
        gepa_args.update(gepa_kwargs or {})

        optimizer = GEPA(
            metric=self.metric,
            **gepa_args,
        )

        optimized_program = optimizer.compile(
            self.program,
            trainset=self.train_set,
            valset=self.val_set,
        )

        return optimized_program

    def evaluate(self, optimized_program: Module) -> list[EvaluationRow]:
        """
        Evaluate an optimized program against the held-out test set.

        Not implemented yet. Two candidate designs under consideration:
        - Convert the optimized program back to EP and re-run the
          `evaluation_test` function on it.
        - Use `dspy.Evaluate`, e.g.:
              evaluate = dspy.Evaluate(
                  devset=self.test_set,
                  metric=self.metric,
                  num_threads=32,
                  display_table=True,
                  display_progress=True,
              )
              return evaluate(optimized_program)
        """
        # Fail loudly instead of silently returning None: the original body was
        # a bare `...`, which violated the declared `list[EvaluationRow]`
        # return type and would surface only as a confusing `None` downstream.
        raise NotImplementedError("GEPATrainer.evaluate is not implemented yet")
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
3+
import dspy
4+
from dspy.clients.lm import LM
5+
6+
# Named presets for the reflection LM GEPA uses to propose prompt mutations.
# NOTE(review): os.getenv is evaluated at import time, so the API key is
# frozen to whatever the environment held when this module was first imported.
REFLECTION_LM_CONFIGS = {
    "gpt-5": {
        "model": "gpt-5",
        "temperature": 1.0,
        "max_tokens": 32000,
        "api_key": os.getenv("OPENAI_API_KEY"),
        "base_url": "https://api.openai.com/v1",
    },
    "kimi-k2-instruct-0905": {
        "model": "accounts/fireworks/models/kimi-k2-instruct-0905",
        "temperature": 0.6,  # Kimi recommended temperature
        "max_tokens": 131000,
        "api_key": os.getenv("FIREWORKS_API_KEY"),
        "base_url": "https://api.fireworks.ai/inference/v1",
    },
}


def build_reflection_lm(reflection_lm_name: str) -> LM:
    """Construct a `dspy.LM` for GEPA reflection from a named preset.

    Args:
        reflection_lm_name: A key of `REFLECTION_LM_CONFIGS` (e.g. "gpt-5").

    Returns:
        A `dspy.LM` configured with the preset's model, temperature,
        max_tokens, api_key, and base_url.

    Raises:
        ValueError: If `reflection_lm_name` is not a known preset (instead of
            the bare, unhelpful KeyError a direct dict lookup would raise).
    """
    try:
        reflection_lm_config = REFLECTION_LM_CONFIGS[reflection_lm_name]
    except KeyError:
        available = ", ".join(sorted(REFLECTION_LM_CONFIGS))
        raise ValueError(
            f"Unknown reflection LM {reflection_lm_name!r}; available presets: {available}"
        ) from None
    # Preset keys match dspy.LM's keyword arguments exactly, so unpack directly.
    return dspy.LM(**reflection_lm_config)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ dependencies = [
4747
"deepdiff>=6.0.0",
4848
"websockets>=15.0.1",
4949
"fastapi>=0.116.1",
50+
"dspy>=3.0.0",
5051
]
5152

5253
[project.urls]

0 commit comments

Comments
 (0)