Skip to content

Commit fc82767

Browse files
committed
test
1 parent 7c0d149 commit fc82767

File tree

2 files changed

+18
-9
lines changed

2 files changed

+18
-9
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,9 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
7979

8080
@evaluation_test(
8181
input_dataset=[
82-
# _get_aime_dataset_path(),
83-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84-
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
82+
_get_aime_dataset_path(),
83+
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
84+
# "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
8585
],
8686
dataset_adapter=aime2025_dataset_adapter,
8787
completion_params=[
@@ -91,6 +91,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:
9191
"model": "fireworks_ai/accounts/pyroworks/deployedModels/glm-4p6-qpwrimne",
9292
# "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
9393
"stream": True,
94+
# "timeout": 2400,
9495
}
9596
],
9697
rollout_processor=SingleTurnRolloutProcessor(),

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import litellm
88
from litellm import acompletion
9+
from litellm.types.utils import ModelResponse, Choices
10+
from litellm.litellm_core_utils.streaming_handler import CustomStreamWrapper
911

1012
from eval_protocol.dataset_logger import default_logger
1113
from eval_protocol.models import EvaluationRow, Message
@@ -65,14 +67,18 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
6567
if request_params.get("stream") is True:
6668
chunks = []
6769
stream = await acompletion(**request_params)
70+
71+
assert isinstance(stream, CustomStreamWrapper), "Stream should be a CustomStreamWrapper"
72+
6873
async for chunk in stream: # pyright: ignore[reportGeneralTypeIssues]
6974
chunks.append(chunk)
7075
response = litellm.stream_chunk_builder(chunks, messages_payload)
7176
else:
7277
response = await acompletion(**request_params)
7378

74-
if response is None:
75-
raise ValueError("Response is None")
79+
assert response is not None, "Response is None"
80+
assert isinstance(response, ModelResponse), "Response should be ModelResponse"
81+
assert isinstance(response.choices[0], Choices), "Response choice should be a Choices"
7682

7783
assistant_content = response.choices[0].message.content or ""
7884
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
@@ -115,10 +121,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
115121
tool_calls=converted_tool_calls,
116122
)
117123
]
118-
row.execution_metadata.usage = CompletionUsage(
119-
prompt_tokens=response.usage.prompt_tokens,
120-
completion_tokens=response.usage.completion_tokens,
121-
total_tokens=response.usage.total_tokens,
124+
row.execution_metadata.usage = (
125+
CompletionUsage( # Note: LiteLLM sets usage dynamically via setattr(), not as a typed field
126+
prompt_tokens=response.usage.prompt_tokens, # pyright: ignore[reportAttributeAccessIssue]
127+
completion_tokens=response.usage.completion_tokens, # pyright: ignore[reportAttributeAccessIssue]
128+
total_tokens=response.usage.total_tokens, # pyright: ignore[reportAttributeAccessIssue]
129+
)
122130
)
123131

124132
row.messages = messages

0 commit comments

Comments (0)