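Default RemoteRolloutProcessor to Fireworks tracing; drop /langfuse from proxy trace URLs; add local quickstart test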

xzrderek · xzrderek · commit 91f237824690 · 2025-10-06T10:39:28.000-07:00
diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py
@@ -25,12 +25,9 @@
 except ImportError:
     pass
 
-try:
-    from .fireworks_tracing import FireworksTracingAdapter, create_fireworks_tracing_adapter
+from .fireworks_tracing import FireworksTracingAdapter, create_fireworks_tracing_adapter
 
-    __all__.extend(["FireworksTracingAdapter", "create_fireworks_tracing_adapter"])
-except ImportError:
-    pass
+__all__.extend(["FireworksTracingAdapter", "create_fireworks_tracing_adapter"])
 
 try:
     from .huggingface import (
diff --git a/eval_protocol/adapters/fireworks_tracing.py b/eval_protocol/adapters/fireworks_tracing.py
@@ -347,9 +347,9 @@ def get_evaluation_rows(
 
         # Make request to proxy
         if self.project_id:
-            url = f"{self.base_url}/v1/project_id/{self.project_id}/langfuse/traces"
+            url = f"{self.base_url}/v1/project_id/{self.project_id}/traces"
         else:
-            url = f"{self.base_url}/v1/langfuse/traces"
+            url = f"{self.base_url}/v1/traces"
 
         try:
             response = requests.post(url, json=payload, timeout=self.timeout)
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
@@ -8,6 +8,8 @@
 from eval_protocol.models import EvaluationRow, Status
 from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
 from eval_protocol.types.remote_rollout_processor import ElasticsearchConfig, InitRequest, RolloutMetadata
+from eval_protocol.adapters.fireworks_tracing import create_fireworks_tracing_adapter
+from eval_protocol.quickstart.utils import filter_longest_conversation
 from .rollout_processor import RolloutProcessor
 from .types import RolloutProcessorConfig
 from .elasticsearch_setup import ElasticsearchSetup
@@ -18,10 +20,30 @@
 logger = logging.getLogger(__name__)
 
 
+def _default_output_data_loader(rollout_id: str, base_url: str) -> DynamicDataLoader:
+    """Default output data loader that fetches traces from the Fireworks tracing proxy at ``base_url``.
+
+    Args:
+        rollout_id: The rollout ID to filter traces by
+
+    Returns:
+        DynamicDataLoader configured to fetch and process traces
+    """
+
+    def fetch_traces() -> List[EvaluationRow]:
+        adapter = create_fireworks_tracing_adapter(base_url=base_url)
+        return adapter.get_evaluation_rows(tags=[f"rollout_id:{rollout_id}"], max_retries=5)
+
+    return DynamicDataLoader(generators=[fetch_traces], preprocess_fn=filter_longest_conversation)
+
+
 class RemoteRolloutProcessor(RolloutProcessor):
     """
     Rollout processor that triggers a remote HTTP server to perform the rollout.
 
+    By default, fetches traces from the Fireworks tracing proxy using rollout_id tags.
+    You can provide a custom output_data_loader(rollout_id, base_url) callable for different tracing backends.
+
     See https://evalprotocol.io/tutorial/remote-rollout-processor for documentation.
     """
 
@@ -32,7 +54,7 @@ def __init__(
         model_base_url: str = "https://tracing.fireworks.ai",
         poll_interval: float = 1.0,
         timeout_seconds: float = 120.0,
-        output_data_loader: Callable[[str], DynamicDataLoader],
+        output_data_loader: Optional[Callable[[str, str], DynamicDataLoader]] = None,
         disable_elastic_search: bool = False,
         elastic_search_config: Optional[ElasticsearchConfig] = None,
     ):
@@ -44,7 +66,7 @@ def __init__(
             self._remote_base_url = os.getenv("EP_REMOTE_ROLLOUT_PROCESSOR_BASE_URL")
         self._poll_interval = poll_interval
         self._timeout_seconds = timeout_seconds
-        self._output_data_loader = output_data_loader
+        self._output_data_loader = output_data_loader or _default_output_data_loader
         self._disable_elastic_search = disable_elastic_search
         self._elastic_search_config = elastic_search_config
 
@@ -242,7 +264,7 @@ def _get_status() -> Dict[str, Any]:
             if row.execution_metadata.rollout_id is None:
                 raise ValueError("Rollout ID is required in RemoteRolloutProcessor")
 
-            data_loader = self._output_data_loader(row.execution_metadata.rollout_id)
+            data_loader = self._output_data_loader(row.execution_metadata.rollout_id, model_base_url)
 
             def _load_data():
                 return data_loader.load()
diff --git a/tests/remote_server/quickstart.py b/tests/remote_server/quickstart.py
@@ -0,0 +1,53 @@
+# MANUAL SERVER STARTUP REQUIRED:
+#
+# For Python server testing, start:
+# python -m tests.remote_server.remote_server (runs on http://127.0.0.1:3000)
+#
+# For TypeScript server testing, start:
+# cd tests/remote_server/typescript-server
+# npm install
+# npm start
+#
+# The TypeScript server should be running on http://127.0.0.1:3000
+# You only need to start one of the servers!
+
+import os
+from typing import List
+
+import pytest
+
+from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
+from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
+
+
+def rows() -> List[EvaluationRow]:
+    row = EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])
+    return [row, row, row]
+
+
+@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Only run this test locally (skipped in CI)")
+@pytest.mark.parametrize("completion_params", [{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}])
+@evaluation_test(
+    data_loaders=DynamicDataLoader(
+        generators=[rows],
+    ),
+    rollout_processor=RemoteRolloutProcessor(
+        remote_base_url="http://127.0.0.1:3000",
+        timeout_seconds=30,
+    ),
+)
+async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> EvaluationRow:
+    """
+    End-to-end test:
+    - REQUIRES MANUAL SERVER STARTUP: python -m tests.remote_server.remote_server
+    - trigger remote rollout via RemoteRolloutProcessor (calls init/status)
+    - fetch traces from Langfuse via Fireworks tracing proxy (uses default FireworksTracingAdapter)
+    - FAIL if no traces found or rollout_id missing
+    """
+    assert row.messages[0].content == "What is the capital of France?", "Row should have correct message content"
+    assert len(row.messages) > 1, "Row should have a response. If this fails, we fellback to the original row."
+    assert row.execution_metadata.rollout_id, "Row should have a rollout_id from the remote rollout"
+
+    return row
diff --git a/tests/remote_server/test_remote_fireworks.py b/tests/remote_server/test_remote_fireworks.py
@@ -36,17 +36,17 @@ def check_rollout_coverage():
     assert len(ROLLOUT_IDS) == 3, f"Expected to see 3 rollout_ids, but only saw {ROLLOUT_IDS}"
 
 
-def fetch_fireworks_traces(rollout_id: str) -> List[EvaluationRow]:
+def fetch_fireworks_traces(rollout_id: str, base_url: str) -> List[EvaluationRow]:
     global ROLLOUT_IDS  # Track all rollout_ids we've seen
     ROLLOUT_IDS.add(rollout_id)
 
-    adapter = create_fireworks_tracing_adapter()
+    adapter = create_fireworks_tracing_adapter(base_url=base_url)
     return adapter.get_evaluation_rows(tags=[f"rollout_id:{rollout_id}"], max_retries=5)
 
 
-def fireworks_output_data_loader(rollout_id: str) -> DynamicDataLoader:
+def fireworks_output_data_loader(rollout_id: str, base_url: str) -> DynamicDataLoader:
     return DynamicDataLoader(
-        generators=[lambda: fetch_fireworks_traces(rollout_id)], preprocess_fn=filter_longest_conversation
+        generators=[lambda: fetch_fireworks_traces(rollout_id, base_url)], preprocess_fn=filter_longest_conversation
     )