Skip to content

Commit 668652d

Browse files
authored
Update LLM Judge Example and Adapter (#175)
* fix langfuse rate limit issue * to revert later, get 50 random traces to query * don't skip * make judgment async * bump limit up * lower concurrency for gemini * small limit to see if we get the error still * test * test * try this * fix * fix * no split * ok wtf * try something else * test * 1 run * same as aime now * try osmething else * remove gpt * gpt * try to mute and see what happens * monkey patch * try * broken still * how about 2 and 4 * fix single turn rollout acompletion * add back * test repro * add * undo weird changes i made * big run with kimi judge * lol * add timing filter * unique traces * update adapter
1 parent 908d14a commit 668652d

File tree

4 files changed

+241
-170
lines changed

4 files changed

+241
-170
lines changed

eval_protocol/adapters/langfuse.py

Lines changed: 128 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from langfuse.api.resources.commons.types.observations_view import ObservationsView
88
import logging
9+
import random
10+
import time
911
from datetime import datetime, timedelta
1012
from typing import Any, Dict, Iterator, List, Optional, cast
1113

@@ -59,54 +61,154 @@ def __init__(self):
5961
def get_evaluation_rows(
    self,
    limit: int = 100,
    sample_size: int = 50,
    tags: Optional[List[str]] = None,
    user_id: Optional[str] = None,
    session_id: Optional[str] = None,
    hours_back: Optional[int] = None,
    from_timestamp: Optional[datetime] = None,
    to_timestamp: Optional[datetime] = None,
    include_tool_calls: bool = True,
    sleep_between_gets: float = 2.5,
    max_retries: int = 3,
) -> List[EvaluationRow]:
    """Pull traces from Langfuse and convert to EvaluationRow format.

    Two-phase fetch: (1) page through trace *summaries* until ``limit`` are
    collected, then (2) randomly sample ``sample_size`` of them and fetch
    full details one-by-one with sleeps, to stay under Langfuse rate limits.

    Args:
        limit: Max number of trace summaries to collect via pagination (pre-sampling)
        sample_size: Number of traces to fetch full details for (sampled from collected summaries)
        tags: Filter by specific tags
        user_id: Filter by user ID
        session_id: Filter by session ID
        hours_back: Filter traces from this many hours ago
        from_timestamp: Explicit start time (overrides hours_back)
        to_timestamp: Explicit end time (overrides hours_back)
        include_tool_calls: Whether to include tool calling traces
        sleep_between_gets: Sleep time between individual trace.get() calls (2.5s for 30 req/min limit)
        max_retries: Maximum retries for rate limit errors

    Returns:
        List[EvaluationRow]: Converted evaluation rows
    """
    eval_rows = []

    # Determine time window: explicit from/to takes precedence over hours_back
    if from_timestamp is None and to_timestamp is None and hours_back:
        # NOTE(review): datetime.now() is timezone-naive here — confirm the
        # Langfuse API expects naive local time rather than aware UTC.
        to_timestamp = datetime.now()
        from_timestamp = to_timestamp - timedelta(hours=hours_back)

    # Collect trace summaries via pagination (up to limit)
    all_traces = []
    page = 1
    collected = 0

    while collected < limit:
        current_page_limit = min(100, limit - collected)  # Langfuse API max is 100

        logger.debug(
            "Fetching page %d with limit %d (collected: %d/%d)", page, current_page_limit, collected, limit
        )

        # Fetch trace list with retry logic
        traces = None
        list_retries = 0
        while list_retries < max_retries:
            try:
                traces = self.client.api.trace.list(
                    page=page,
                    limit=current_page_limit,
                    tags=tags,
                    user_id=user_id,
                    session_id=session_id,
                    from_timestamp=from_timestamp,
                    to_timestamp=to_timestamp,
                    order_by="timestamp.desc",
                )
                break
            except Exception as e:
                # Rate-limit detection is string-based ("429" in the message)
                # because the SDK surfaces a generic exception type here.
                list_retries += 1
                if "429" in str(e) and list_retries < max_retries:
                    sleep_time = 2**list_retries  # Exponential backoff
                    logger.warning(
                        "Rate limit hit on trace.list(), retrying in %ds (attempt %d/%d)",
                        sleep_time,
                        list_retries,
                        max_retries,
                    )
                    time.sleep(sleep_time)
                else:
                    # Non-429 error, or retries exhausted: stop listing but
                    # keep whatever rows were already converted.
                    logger.error("Failed to fetch trace list after %d retries: %s", max_retries, e)
                    return eval_rows  # Return what we have so far

        if not traces or not traces.data:
            logger.debug("No more traces found on page %d", page)
            break

        logger.debug("Collected %d traces from page %d", len(traces.data), page)

        all_traces.extend(traces.data)
        collected += len(traces.data)

        # Check if we have more pages
        if hasattr(traces.meta, "page") and hasattr(traces.meta, "total_pages"):
            if traces.meta.page >= traces.meta.total_pages:
                break
        elif len(traces.data) < current_page_limit:
            # No pagination metadata and the page came back short:
            # assume we've reached the end of the result set.
            break

        page += 1

    if not all_traces:
        logger.debug("No traces found")
        return eval_rows

    # Randomly sample traces to fetch full details (respect rate limits)
    # NOTE(review): random.sample() is unseeded, so the selection is
    # nondeterministic across runs.
    actual_sample_size = min(sample_size, len(all_traces))
    selected_traces = random.sample(all_traces, actual_sample_size)

    logger.debug("Randomly selected %d traces from %d collected", actual_sample_size, len(all_traces))

    # Process each selected trace with sleep and retry logic
    for trace_info in selected_traces:
        # Sleep between gets to avoid rate limits
        if sleep_between_gets > 0:
            time.sleep(sleep_between_gets)

        # Fetch full trace details with retry logic
        trace_full = None
        detail_retries = 0
        while detail_retries < max_retries:
            try:
                trace_full = self.client.api.trace.get(trace_info.id)
                break
            except Exception as e:
                detail_retries += 1
                if "429" in str(e) and detail_retries < max_retries:
                    sleep_time = 2**detail_retries  # Exponential backoff
                    logger.warning(
                        "Rate limit hit on trace.get(%s), retrying in %ds (attempt %d/%d)",
                        trace_info.id,
                        sleep_time,
                        detail_retries,
                        max_retries,
                    )
                    time.sleep(sleep_time)
                else:
                    # A failed detail fetch skips only this trace; the
                    # remaining sampled traces are still processed.
                    logger.warning("Failed to fetch trace %s after %d retries: %s", trace_info.id, max_retries, e)
                    break  # Skip this trace

        if trace_full:
            try:
                eval_row = self._convert_trace_to_evaluation_row(trace_full, include_tool_calls)
                if eval_row:
                    eval_rows.append(eval_row)
            except (AttributeError, ValueError, KeyError) as e:
                logger.warning("Failed to convert trace %s: %s", trace_info.id, e)
                continue

    logger.info(
        "Successfully processed %d selected traces into %d evaluation rows", len(selected_traces), len(eval_rows)
    )
    return eval_rows
111213

112214
def get_evaluation_rows_by_ids(

eval_protocol/mcp/execution/policy.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,12 @@ async def _make_llm_call(self, messages: List[Dict[str, Any]], tools: List[Dict[
194194
request_params["tools"] = tools
195195

196196
try:
197-
response = await acompletion(model=self.model_id, **request_params)
197+
response = await acompletion(
198+
model=self.model_id,
199+
**request_params,
200+
# api_base="https://litellm-cloud-proxy-prod-zfdbl7ykrq-uc.a.run.app/v1",
201+
# extra_body={"tags": ["kimi-k2-tau-bench"]},
202+
)
198203

199204
# Log cache hit/miss for monitoring
200205
hidden = getattr(response, "_hidden_params", {})

eval_protocol/quickstart/llm_judge.py

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
import os
6+
from datetime import datetime
67
from typing import List, Dict, Any, Optional
78
from tqdm import tqdm
89

@@ -14,32 +15,44 @@
1415
from eval_protocol.quickstart.utils import (
1516
split_multi_turn_rows,
1617
JUDGE_CONFIGS,
17-
fetch_langfuse_traces_as_evaluation_rows,
1818
calculate_bootstrap_scores,
1919
push_scores_to_langfuse,
20-
run_judgment,
20+
run_judgment_async,
2121
)
22+
import asyncio
23+
from openai import AsyncOpenAI
24+
from eval_protocol.adapters.langfuse import create_langfuse_adapter
2225

23-
import concurrent.futures
24-
from concurrent.futures import ThreadPoolExecutor
26+
adapter = create_langfuse_adapter()
2527

2628

27-
@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
2829
@pytest.mark.asyncio
2930
@evaluation_test(
30-
input_rows=[fetch_langfuse_traces_as_evaluation_rows()],
31+
input_rows=[
32+
adapter.get_evaluation_rows(
33+
to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
34+
limit=711,
35+
sample_size=50,
36+
sleep_between_gets=3.0,
37+
max_retries=5,
38+
)
39+
],
3140
completion_params=[
41+
{"model": "gpt-4.1"},
3242
{
33-
"model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
43+
"max_tokens": 131000,
44+
"extra_body": {"reasoning_effort": "medium"},
45+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
3446
},
3547
{
3648
"max_tokens": 131000,
3749
"extra_body": {"reasoning_effort": "low"},
38-
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
50+
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
3951
},
4052
],
4153
rollout_processor=SingleTurnRolloutProcessor(),
4254
preprocess_fn=split_multi_turn_rows,
55+
max_concurrent_rollouts=64,
4356
mode="all",
4457
)
4558
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
@@ -73,11 +86,21 @@ async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
7386
judgments = []
7487
max_concurrency = JUDGE_CONFIGS[judge_name]["max_concurrency"]
7588

76-
with ThreadPoolExecutor(max_workers=max_concurrency) as executor:
77-
futures = [executor.submit(run_judgment, row, model_name, judge_name) for row in rows]
89+
judge_config = JUDGE_CONFIGS[judge_name]
90+
91+
async with AsyncOpenAI(
92+
api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")
93+
) as shared_client:
94+
semaphore = asyncio.Semaphore(max_concurrency)
95+
96+
async def run_judgment(row):
97+
async with semaphore:
98+
return await run_judgment_async(row, model_name, judge_name, shared_client)
99+
100+
tasks = [run_judgment(row) for row in rows]
78101

79-
for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Generating judgments"):
80-
result = future.result()
102+
for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Generating judgments"):
103+
result = await coro
81104
if result and result["games"][0] and result["games"][1]:
82105
judgments.append(result)
83106

0 commit comments

Comments
 (0)