Skip to content

Commit 613d8d1

Browse files
committed
fixed per comments
1 parent 4aa9e5c commit 613d8d1

File tree

10 files changed

+355
-148
lines changed

10 files changed

+355
-148
lines changed

development/RUNNING_EVALUATIONS.md

Lines changed: 0 additions & 80 deletions
This file was deleted.

eval_protocol/common_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
1414
1515
Returns:
1616
A list of dictionaries, where each dictionary is a parsed JSON object from a line.
17-
Returns an empty list if the file is not found or if errors occur during parsing.
17+
Returns an empty list if the file is not found or if errors occur during parsing. Supports HTTP URLs and local file paths.
1818
"""
1919
data: List[Dict[str, Any]] = []
2020
if file_path.startswith("http://") or file_path.startswith("https://"):
@@ -33,7 +33,7 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
3333
row_id_index = stripped.find("row_id")
3434
if row_id_index != -1:
3535
row_id = re.search(r'"row_id": (.*),', stripped[row_id_index:])
36-
raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})")
36+
raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
3737
raise e
3838
else:
3939
with open(file_path, "r", encoding="utf-8") as f:
@@ -50,6 +50,6 @@ def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
5050
row_id_index = line.find("row_id")
5151
if row_id_index != -1:
5252
row_id = re.search(r'"row_id": (.*),', line[row_id_index:])
53-
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})")
53+
raise ValueError(f"{e.msg} at line {line_number}: {line} ({row_id})") from e
5454
raise e
5555
return data

eval_protocol/pytest/default_single_turn_rollout_process.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import asyncio
22
from typing import List
33

4-
from litellm import acompletion
5-
import litellm
6-
from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
4+
import logging
5+
import os
76

87
from eval_protocol.dataset_logger import default_logger
9-
from eval_protocol.models import EvaluationRow, Message
8+
from eval_protocol.models import EvaluationRow, Message, ChatCompletionMessageToolCall
109
from eval_protocol.pytest.types import RolloutProcessorConfig
1110

1211

@@ -15,15 +14,20 @@ async def default_single_turn_rollout_processor(
1514
) -> List[EvaluationRow]:
1615
"""Generate a single response from any supported model provider using LiteLLM."""
1716

18-
# Explicitly disable LiteLLM caching to avoid reused responses across runs
17+
# Quiet LiteLLM logs in test runs unless user overrode
1918
try:
20-
litellm.cache = None
21-
# Some versions expose a helper; ignore if unavailable
22-
if hasattr(litellm, "disable_cache"):
23-
litellm.disable_cache() # type: ignore[call-arg]
19+
if os.environ.get("LITELLM_LOG") is None:
20+
os.environ["LITELLM_LOG"] = "ERROR"
21+
_llog = logging.getLogger("LiteLLM")
22+
_llog.setLevel(logging.CRITICAL)
23+
_llog.propagate = False
24+
for _h in list(_llog.handlers):
25+
_llog.removeHandler(_h)
2426
except Exception:
2527
pass
2628

29+
# Do not modify global LiteLLM cache. Disable caching per-request instead.
30+
2731
async def process_row(row: EvaluationRow) -> EvaluationRow:
2832
"""Process a single row asynchronously."""
2933
if len(row.messages) == 0:
@@ -32,6 +36,8 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
3236
messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
3337

3438
request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
39+
# Ensure caching is disabled only for this request (review feedback)
40+
request_params["cache"] = {"no-cache": True}
3541
# Allow passing reasoning effort to Fireworks via LiteLLM using extra_body
3642
# Expected: config.input_params may contain {"reasoning": {"effort": "low|medium|high"}}
3743
if "reasoning" in config.input_params:
@@ -41,6 +47,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
4147
if row.tools is not None:
4248
request_params["tools"] = row.tools
4349

50+
# Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
51+
import importlib
52+
_litellm = importlib.import_module("litellm")
53+
acompletion = getattr(_litellm, "acompletion")
4454
response = await acompletion(**request_params)
4555

4656
assistant_content = response.choices[0].message.content or ""

0 commit comments

Comments
 (0)