Skip to content

Commit 613d8d1

Browse files
committed
fixed per comments
1 parent 4aa9e5c commit 613d8d1

File tree

10 files changed

+355
-148
lines changed

10 files changed

+355
-148
lines changed

development/RUNNING_EVALUATIONS.md

Lines changed: 0 additions & 80 deletions
This file was deleted.

eval_protocol/common_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
1414
1515
Returns:
1616
A list of dictionaries, where each dictionary is a parsed JSON object from a line.
17-
Returns an empty list if the file is not found or if errors occur during parsing.
17+
Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP URLs and local file paths.
1818
"""
1919
data: List[Dict[str, Any]] = []
2020
if file_path.startswith("http://") or file_path.startswith("https://"):
@@ -33,7 +33,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
3333
row_id_index = stripped.find("row_id")
3434
if row_id_index != -1:
3535
row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
36-
raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})")
36+
raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
3737
raise e
3838
else:
3939
with open(file_path, "r", encoding="utf-8") as f:
@@ -50,6 +50,6 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
5050
row_id_index = line.find("row_id")
5151
if row_id_index != -1:
5252
row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
53-
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
53+
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
5454
raise e
5555
return data

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import asyncio
22
from typing import List
33

4-
from litellm import acompletion
5-
import litellm
6-
from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
4+
import logging
5+
import os
76

87
from eval_protocol.dataset_logger import default_logger
9-
from eval_protocol.models import EvaluationRow, Message
8+
from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
109
from eval_protocol.pytest.types import RolloutProcessorConfig
1110

1211

@@ -15,15 +14,20 @@ async def default_single_turn_rollout_processor(
1514
) -> List[EvaluationRow]:
1615
"""Generate a single response from any supported model provider using LiteLLM."""
1716

18-
# Explicitly disable LiteLLM caching to avoid reused responses across runs
17+
# Quiet LiteLLM logs in test runs unless user overrode
1918
try:
20-
litellm.cache = None
21-
# Some versions expose a helper; ignore if unavailable
22-
if hasattr(litellm, "disable_cache"):
23-
litellm.disable_cache() # type: ignore[call-arg]
19+
if os.environ.get("LITELLM_LOG") is None:
20+
os.environ["LITELLM_LOG"] = "ERROR"
21+
_llog = logging.getLogger("LiteLLM")
22+
_llog.setLevel(logging.CRITICAL)
23+
_llog.propagate = False
24+
for _h in list(_llog.handlers):
25+
_llog.removeHandler(_h)
2426
except Exception:
2527
pass
2628

29+
# Do not modify global LiteLLM cache. Disable caching per-request instead.
30+
2731
async def process_row(row: EvaluationRow) -> EvaluationRow:
2832
"""Process a single row asynchronously."""
2933
if len(row.messages) == 0:
@@ -32,6 +36,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3236
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
3337

3438
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
39+
# Ensure caching is disabled only for this request (review feedback)
40+
request_params["cache"] = {"no-cache": True}
3541
# Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
3642
# Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
3743
if "reasoning" in config.input_params:
@@ -41,6 +47,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
4147
if row.tools is not None:
4248
request_params["tools"] = row.tools
4349

50+
# Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
51+
import importlib
52+
_litellm = importlib.import_module("litellm")
53+
acompletion = getattr(_litellm, "acompletion")
4454
response = await acompletion(**request_params)
4555

4656
assistant_content = response.choices[0].message.content or ""

0 commit comments

Comments
 (0)