Merged · Changes from 2 commits
80 changes: 80 additions & 0 deletions development/RUNNING_EVALUATIONS.md
@@ -0,0 +1,80 @@
# Running AIME/GPQA Evaluations in CI and Locally

This guide explains how to run the AIME2025 and GPQA evaluations using the
pytest-based `evaluation_test` decorator, how to control dataset size and
concurrency, how to select effort presets, and how to print/persist results
for CI dashboards/artifacts.

## Objectives
- Simple pass/fail: ensure evaluation configs don’t regress.
- Comparable metrics: capture aggregated accuracy across runs/rows.
- CI-friendly outputs: print summary lines to logs and save JSON artifacts.

## Prerequisites
- `FIREWORKS_API_KEY` set in the environment
- Install SDK: `pip install -e .[dev]`
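
For example (the key value is a placeholder):

```bash
export FIREWORKS_API_KEY=fw-...   # your Fireworks API key
pip install -e .[dev]
```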

## Controls
- Row limit
  - Default `max_dataset_rows=2` in each test decorator for quick CI runs.
  - Override centrally: `pytest --ep-max-rows=all` or `--ep-max-rows=50`.
- Concurrency
  - Set `max_concurrent_rollouts` in the decorator (4 is recommended against production Fireworks endpoints).
- Repeats
  - Set `num_runs` in the decorator (e.g., 4).
- Effort (Fireworks reasoning)
  - Provide `{"reasoning": {"effort": "low|medium|high"}}` in the test's `rollout_input_params`.
  - The default rollout forwards it to the API via LiteLLM's `extra_body`; see the sketch after this list.
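
A minimal decorator sketch tying these knobs together. The argument names are the ones described above; the import path, test body, and any dataset/model wiring are illustrative, not the exact signature:

```python
# Sketch only — argument names come from this guide; everything else is a placeholder.
from eval_protocol.pytest import evaluation_test  # assumed import path

@evaluation_test(
    max_dataset_rows=2,            # quick-CI default; override at the CLI with --ep-max-rows
    max_concurrent_rollouts=4,     # keep conservative to avoid rate limiting
    num_runs=4,                    # repeat each row for a more stable aggregate score
    rollout_input_params={"reasoning": {"effort": "low"}},  # forwarded via LiteLLM extra_body
    threshold_of_success=0.5,      # optional: fail the test if the aggregated score drops below this
)
def test_aime2025_pointwise(row):
    ...
```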

## Printing & Persisting Results
- Flags:
  - `--ep-print-summary`: print concise summary lines at the end of each eval.
  - `--ep-summary-json=PATH`: write JSON with suite/model/agg_score/runs/rows/timestamp (sample after the snippet below).
- Example GitHub Actions snippet:
```yaml
- name: Run AIME low effort (full)
run: |
cd python-sdk
pytest --ep-max-rows=all --ep-print-summary \
--ep-summary-json=outputs/aime_low.json \
          -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise
- name: Upload AIME results
uses: actions/upload-artifact@v4
with:
name: aime2025-low-summary
path: python-sdk/outputs/aime_low.json
```
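
The JSON artifact written by `--ep-summary-json` might look like this (field names are from the flag description above; all values are illustrative):

```json
{
  "suite": "test_aime2025_pointwise",
  "model": "accounts/fireworks/models/gpt-oss-120b",
  "agg_score": 0.53,
  "runs": 4,
  "rows": 30,
  "timestamp": "2025-08-01T12:00:00Z"
}
```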

## Examples
### AIME (Low Effort, Full, Repeats=4, Concurrency=4)
```bash
cd python-sdk
pytest --ep-max-rows=all --ep-print-summary \
--ep-summary-json=outputs/aime_low.json \
  -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise
```
Expected:
- Terminal summary: `EP Summary | suite=test_aime2025_pointwise model=... agg=0.530 runs=4 rows=...`
- JSON artifact at `outputs/aime_low.json`
- For `.../gpt-oss-120b`, the low-effort pass rate should be at least ~0.50 when repeated

For medium/high effort, add `{"reasoning": {"effort": "medium|high"}}` to
`rollout_input_params` in the test decorator and rerun with a different JSON path.
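
For example, a medium-effort rerun writing to a separate artifact (same command as above, new JSON path):

```bash
cd python-sdk
pytest --ep-max-rows=all --ep-print-summary \
  --ep-summary-json=outputs/aime_medium.json \
  -q examples/aime2025_chat_completion/tests/test_evaluation.py::test_aime2025_pointwise
```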

### GPQA (Diamond, Low Effort)
```bash
cd python-sdk
pytest --ep-max-rows=all --ep-print-summary \
--ep-summary-json=outputs/gpqa_low.json \
  -q examples/gpqa/tests/test_evaluation.py
```
Adjust repeats/concurrency/effort in the test decorator similarly to AIME.

## Pass/Fail Signals
- If `threshold_of_success` is set in a test, the test fails when the aggregated score falls below that threshold.
- Otherwise, the run prints summaries and writes artifacts but always succeeds in CI.

## Tips
- Use `--ep-max-rows` to toggle between quick checks and full evaluations without editing tests.
- Upload JSON artifacts for dashboards and historical comparisons.
- Keep concurrency conservative (e.g., 4) to avoid rate limiting.
41 changes: 33 additions & 8 deletions eval_protocol/common_utils.py
@@ -2,6 +2,8 @@
import re
from typing import Any, Dict, List

+import requests
+

def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
"""
@@ -15,16 +17,39 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
Returns an empty list if the file is not found or if errors occur during parsing.
    (Review suggestion on the docstring: mention that both HTTP URLs and local file paths are supported.)
"""
data: List[Dict[str, Any]] = []
-    with open(file_path, "r", encoding="utf-8") as f:
-        for line_number, line in enumerate(f):
-            try:
-                data.append(json.loads(line.strip()))
-            except json.JSONDecodeError as e:
-                print(f"Error parsing JSON line for file {file_path} at line {line_number}")
-                # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
-                row_id_index = line.find("row_id")
-                if row_id_index != -1:
-                    row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
-                    raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
-                raise e
+    if file_path.startswith("http://") or file_path.startswith("https://"):
+        resp = requests.get(file_path, stream=True, timeout=30)
+        resp.raise_for_status()
+        for line_number, raw in enumerate(resp.iter_lines(decode_unicode=True), start=1):
+            if raw is None:
+                continue
+            stripped = raw.strip()
+            if not stripped:
+                continue
+            try:
+                data.append(json.loads(stripped))
+            except json.JSONDecodeError as e:
+                print(f"Error parsing JSON line for URL {file_path} at line {line_number}")
+                row_id_index = stripped.find("row_id")
+                if row_id_index != -1:
+                    row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
+                    raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})")
+                raise e
+    else:
+        with open(file_path, "r", encoding="utf-8") as f:
+            for line_number, line in enumerate(f, start=1):
+                # Skip entirely blank or whitespace-only lines to be robust to trailing newlines
+                stripped = line.strip()
+                if not stripped:
+                    continue
+                try:
+                    data.append(json.loads(stripped))
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing JSON line for file {file_path} at line {line_number}")
+                    # attempt to find "row_id" in the line by finding index of "row_id" and performing regex of `"row_id": (.*),`
+                    row_id_index = line.find("row_id")
+                    if row_id_index != -1:
+                        row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
+                        raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
+                    raise e
return data
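
A quick usage sketch of the updated loader (the module path matches this file; the local path and URL are hypothetical):

```python
from eval_protocol.common_utils import load_jsonl

# Local file path — existing behavior (hypothetical path)
rows = load_jsonl("data/eval_rows.jsonl")

# HTTP(S) URL — newly supported by this change (hypothetical URL)
rows = load_jsonl("https://example.com/datasets/eval_rows.jsonl")
```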
5 changes: 4 additions & 1 deletion eval_protocol/generation/clients.py
@@ -11,7 +11,7 @@

import aiohttp
from omegaconf import DictConfig
-from pydantic import BaseModel, Field  # Added for new models
+from pydantic import BaseModel  # Added for new models

logger = logging.getLogger(__name__)

@@ -83,6 +83,9 @@ async def generate(
}
if self.top_p is not None:
payload["top_p"] = self.top_p
+        # Include reasoning settings if configured (for reasoning-capable models)
+        if self.reasoning_effort:
+            payload["reasoning_effort"] = self.reasoning_effort
if tools:
payload["tools"] = tools
26 changes: 24 additions & 2 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -2,6 +2,7 @@
from typing import List

from litellm import acompletion
+import litellm
from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall

from eval_protocol.dataset_logger import default_logger
@@ -14,6 +15,15 @@ async def default_single_turn_rollout_processor(
) -> List[EvaluationRow]:
"""Generate a single response from any supported model provider using LiteLLM."""

+    # Explicitly disable LiteLLM caching to avoid reused responses across runs
+    try:
+        litellm.cache = None
+        # Some versions expose a helper; ignore if unavailable
+        if hasattr(litellm, "disable_cache"):
+            litellm.disable_cache()  # type: ignore[call-arg]
+    except Exception:
+        pass
+
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row asynchronously."""
if len(row.messages) == 0:
@@ -22,6 +32,11 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]

request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
+        # Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
+        # Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
+        if "reasoning" in config.input_params:
+            request_params.setdefault("extra_body", {})
+            request_params["extra_body"]["reasoning"] = config.input_params["reasoning"]

if row.tools is not None:
request_params["tools"] = row.tools
@@ -57,8 +72,15 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
default_logger.log(row)
return row

-    # Process all rows concurrently
-    tasks = [process_row(row) for row in rows]
+    # Process rows with bounded concurrency if configured
+    max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
+    semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+        async with semaphore:
+            return await process_row(r)
+
+    tasks = [_sem_wrapper(row) for row in rows]
dataset = list(await asyncio.gather(*tasks))

return dataset
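
For reference, the bounded-concurrency pattern used above, in isolation — a generic, self-contained sketch, not part of this diff:

```python
import asyncio
from typing import Awaitable, Callable, Iterable, List, TypeVar

T = TypeVar("T")
R = TypeVar("R")

async def run_bounded(
    items: Iterable[T],
    worker: Callable[[T], Awaitable[R]],
    max_concurrent: int = 8,
) -> List[R]:
    # The semaphore caps how many workers run at once; gather preserves input order.
    semaphore = asyncio.Semaphore(max_concurrent)

    async def _sem_wrapper(item: T) -> R:
        async with semaphore:
            return await worker(item)

    return list(await asyncio.gather(*(_sem_wrapper(i) for i in items)))
```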