1- # MANUAL SERVER STARTUP REQUIRED:
2- #
3- # For Python server testing, start:
4- # python -m tests.remote_server.remote_server (runs on http://127.0.0.1:3000)
5- #
6- # For TypeScript server testing, start:
7- # cd tests/remote_server/typescript-server
8- # npm install
9- # npm start
10- #
11- # The TypeScript server should be running on http://127.0.0.1:3000
12- # You only need to start one of the servers!
13-
141import os
152from typing import List
163
207from eval_protocol .models import EvaluationRow , Message
218from eval_protocol .pytest import evaluation_test
229from eval_protocol .pytest .remote_rollout_processor import RemoteRolloutProcessor
23- from eval_protocol .adapters .langfuse import create_langfuse_adapter
24- from eval_protocol .utils .evaluation_row_utils import filter_longest_conversation
25- from eval_protocol .types .remote_rollout_processor import DataLoaderConfig
26-
27- ROLLOUT_IDS = set ()
28-
29-
@pytest.fixture(autouse=True)
def check_rollout_coverage():
    """Reset rollout tracking before a test and verify coverage once it finishes.

    Declared with ``autouse=True``, so it is applied without being requested.
    Before yielding, the module-level ``ROLLOUT_IDS`` set is emptied; after the
    test body has run, the fixture asserts that exactly three distinct
    rollout_ids were recorded in it.

    Raises:
        AssertionError: if the number of recorded rollout_ids is not 3.
    """
    expected_count = 3

    # The set is only mutated in place, so no ``global`` declaration is needed.
    ROLLOUT_IDS.clear()
    yield

    # Report the expected count and the observed count separately so a failure
    # is actionable, and include the ids themselves for debugging.
    assert len(ROLLOUT_IDS) == expected_count, (
        f"Expected to see {expected_count} rollout_ids, "
        f"but saw {len(ROLLOUT_IDS)}: {ROLLOUT_IDS}"
    )
38-
39-
def fetch_langfuse_traces(config: DataLoaderConfig) -> List[EvaluationRow]:
    """Record the rollout_id carried by ``config`` and fetch its evaluation rows.

    The rollout_id is added to the module-level ``ROLLOUT_IDS`` set, then a
    Langfuse adapter is created and queried for rows tagged
    ``rollout_id:<id>``, passing ``max_retries=5``.

    Args:
        config: data-loader configuration; only ``config.rollout_id`` is read.

    Returns:
        Whatever ``get_evaluation_rows`` returns for that tag.
    """
    rollout_id = config.rollout_id
    ROLLOUT_IDS.add(rollout_id)  # remember every rollout_id this loader was asked for

    rollout_tag = f"rollout_id:{rollout_id}"
    langfuse = create_langfuse_adapter()
    return langfuse.get_evaluation_rows(tags=[rollout_tag], max_retries=5)
46-
47-
def langfuse_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:
    """Build a ``DynamicDataLoader`` that pulls Langfuse traces for ``config``.

    The loader gets a single zero-argument generator that defers to
    ``fetch_langfuse_traces(config)``, and uses ``filter_longest_conversation``
    as its ``preprocess_fn``.

    Args:
        config: data-loader configuration forwarded to ``fetch_langfuse_traces``.

    Returns:
        The configured ``DynamicDataLoader`` instance.
    """

    def _load_rows():
        # Closure over ``config`` so the loader can call this with no arguments.
        return fetch_langfuse_traces(config)

    return DynamicDataLoader(
        generators=[_load_rows],
        preprocess_fn=filter_longest_conversation,
    )
5210
5311
5412def rows () -> List [EvaluationRow ]:
@@ -62,25 +20,14 @@ def rows() -> List[EvaluationRow]:
6220 data_loaders = DynamicDataLoader (
6321 generators = [rows ],
6422 ),
65- rollout_processor = RemoteRolloutProcessor (
66- remote_base_url = "http://127.0.0.1:3000" ,
67- timeout_seconds = 30 ,
68- output_data_loader = langfuse_output_data_loader ,
69- model_base_url = "https://tracing.fireworks.ai/project_id/cmg5fd57b0006y107kuxkcrhk" ,
70- ),
23+ rollout_processor = RemoteRolloutProcessor (remote_base_url = "http://127.0.0.1:3000" , timeout_seconds = 30 ),
7124)
7225async def test_remote_rollout_and_fetch_langfuse (row : EvaluationRow ) -> EvaluationRow :
7326 """
7427 End-to-end test:
75- - REQUIRES MANUAL SERVER STARTUP: python -m tests.remote_server.remote_server
7628 - trigger remote rollout via RemoteRolloutProcessor (calls init/status)
77- - fetch traces from Langfuse filtered by metadata via output_data_loader; FAIL if none found
7829 """
7930 assert row .messages [0 ].content == "What is the capital of France?" , "Row should have correct message content"
8031 assert len (row .messages ) > 1 , "Row should have a response. If this fails, we fellback to the original row."
8132
82- assert row .execution_metadata .rollout_id in ROLLOUT_IDS , (
83- f"Row rollout_id { row .execution_metadata .rollout_id } should be in tracked rollout_ids: { ROLLOUT_IDS } "
84- )
85-
8633 return row
0 commit comments