@@ -2,16 +2,16 @@
 import logging
 import os
 import time
-from typing import AsyncIterator, List
+from typing import List
 
-import litellm
 from litellm import acompletion
 from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.models import EvaluationRow, Message
+from eval_protocol.pytest.rollout_processor import RolloutProcessor
 from eval_protocol.pytest.types import RolloutProcessorConfig
 
 logger = logging.getLogger(__name__)
 
 
@@ -18,113 +18,101 @@
-async def default_single_turn_rollout_processor(
-    rows: List[EvaluationRow], config: RolloutProcessorConfig
-) -> AsyncIterator[EvaluationRow]:
-    """Generate a single response from any supported model provider using LiteLLM."""
-
-    # Quiet LiteLLM logs in test runs unless user overrode
-    try:
-        if os.environ.get("LITELLM_LOG") is None:
-            os.environ["LITELLM_LOG"] = "ERROR"
-        _llog = logging.getLogger("LiteLLM")
-        _llog.setLevel(logging.CRITICAL)
-        _llog.propagate = False
-        for _h in list(_llog.handlers):
-            _llog.removeHandler(_h)
-    except Exception:
-        pass
-
-    # Do not modify global LiteLLM cache. Disable caching per-request instead.
-
-    async def process_row(row: EvaluationRow) -> EvaluationRow:
-        """Process a single row asynchronously."""
-        if len(row.messages) == 0:
-            raise ValueError("Messages is empty. Please provide a non-empty dataset")
-
-        messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
-
-        request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
-        # Ensure caching is disabled only for this request (review feedback)
-        request_params["cache"] = {"no-cache": True}
-        # Single-level reasoning effort: expect `reasoning_effort` only
-        effort_val = None
-        if isinstance(config.input_params, dict):
-            if "reasoning_effort" in config.input_params:
-                effort_val = str(config.input_params["reasoning_effort"])  # flat shape
+class SingleTurnRolloutProcessor(RolloutProcessor):
+    """Single turn rollout processor for direct LLM calls."""
+
+    def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]:
+        """Generate single turn rollout tasks and return them for external handling."""
+
+        # Quiet LiteLLM logs in test runs unless user overrode
+        try:
+            if os.environ.get("LITELLM_LOG") is None:
+                os.environ["LITELLM_LOG"] = "ERROR"
+            _llog = logging.getLogger("LiteLLM")
+            _llog.setLevel(logging.CRITICAL)
+            _llog.propagate = False
+            for _h in list(_llog.handlers):
+                _llog.removeHandler(_h)
+        except Exception:
+            pass
+
+        # Do not modify global LiteLLM cache. Disable caching per-request instead.
+
+        async def process_row(row: EvaluationRow) -> EvaluationRow:
+            """Process a single row asynchronously."""
+            if len(row.messages) == 0:
+                raise ValueError("Messages is empty. Please provide a non-empty dataset")
+
+            messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]
+
+            request_params = {"messages": messages_payload, **config.completion_params}
+            # Ensure caching is disabled only for this request (review feedback)
+            request_params["cache"] = {"no-cache": True}
+            # Single-level reasoning effort: expect `reasoning_effort` only
+            effort_val = None
+
+            if "reasoning_effort" in config.completion_params:
+                effort_val = str(config.completion_params["reasoning_effort"])  # flat shape
             elif (
-                isinstance(config.input_params.get("extra_body"), dict)
-                and "reasoning_effort" in config.input_params["extra_body"]
+                isinstance(config.completion_params.get("extra_body"), dict)
+                and "reasoning_effort" in config.completion_params["extra_body"]
             ):
                 # Accept if user passed it directly inside extra_body
-                effort_val = str(config.input_params["extra_body"]["reasoning_effort"])  # already in extra_body
-
-        if effort_val:
-            # Always under extra_body so LiteLLM forwards to provider-specific param set
-            request_params.setdefault("extra_body", {})
-            request_params["extra_body"]["reasoning_effort"] = effort_val
-            # Ensure unsupported top-level keys are not present
-            if "reasoning_effort" in request_params:
-                request_params.pop("reasoning_effort", None)
-
-        if row.tools is not None:
-            request_params["tools"] = row.tools
-
-        # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
-        import importlib
-
-        _litellm = importlib.import_module("litellm")
-        acompletion = getattr(_litellm, "acompletion")
-        response = await acompletion(**request_params)
-
-        assistant_content = response.choices[0].message.content or ""
-        tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
-
-        converted_tool_calls = None
-        if tool_calls:
-            converted_tool_calls = [
-                ChatCompletionMessageToolCall(
-                    id=tool_call.id,
-                    type=tool_call.type,
-                    function={
-                        "name": tool_call.function.name,
-                        "arguments": tool_call.function.arguments,
-                    },
+                effort_val = str(config.completion_params["extra_body"]["reasoning_effort"])  # already in extra_body
+
+            if effort_val:
+                # Always under extra_body so LiteLLM forwards to provider-specific param set
+                request_params.setdefault("extra_body", {})
+                request_params["extra_body"]["reasoning_effort"] = effort_val
+                # Ensure unsupported top-level keys are not present
+                if "reasoning_effort" in request_params:
+                    request_params.pop("reasoning_effort", None)
+
+            if row.tools is not None:
+                request_params["tools"] = row.tools
+
+            # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet
+            import importlib
+
+            _litellm = importlib.import_module("litellm")
+            acompletion = getattr(_litellm, "acompletion")
+            response = await acompletion(**request_params)
+
+            assistant_content = response.choices[0].message.content or ""
+            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
+
+            converted_tool_calls = None
+            if tool_calls:
+                converted_tool_calls = [
+                    ChatCompletionMessageToolCall(
+                        id=tool_call.id,
+                        type=tool_call.type,
+                        function={
+                            "name": tool_call.function.name,
+                            "arguments": tool_call.function.arguments,
+                        },
+                    )
+                    for tool_call in tool_calls
+                ]
+
+            messages = list(row.messages) + [
+                Message(
+                    role="assistant",
+                    content=assistant_content,
+                    tool_calls=converted_tool_calls,
                 )
-            for tool_call in tool_calls
             ]
 
-        messages = list(row.messages) + [
-            Message(
-                role="assistant",
-                content=assistant_content,
-                tool_calls=converted_tool_calls,
-            )
-        ]
-
-        row.messages = messages
-        default_logger.log(row)
-        return row
-
-    # Process rows with bounded concurrency and yield as they complete
-    max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
-    semaphore = asyncio.Semaphore(max_concurrent)
-
-    async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
-        async with semaphore:
-            try:
-                return await process_row(r)
-            except Exception:
-                return r
-
-    # Create all tasks
-    tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
-
-    # Yield results as they complete (note that they're not necessarily in original order)
-    try:
-        for task in asyncio.as_completed(tasks):
-            try:
-                yield await task
-            except Exception:
-                logger.exception("Error processing row")
-    finally:
-        for t in tasks:
-            t.cancel()
-        await asyncio.gather(*tasks, return_exceptions=True)
+            row.messages = messages
+            default_logger.log(row)
+            return row
+
+        # Process rows with bounded concurrency
+        max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow:
+            async with semaphore:
+                result = await process_row(r)
+                return result
+
+        # Create and return tasks for external handling
+        tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows]
+        return tasks
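
The new __call__ returns already-scheduled tasks instead of yielding results as they complete, so the consumer decides how to await them. A sketch of one way a caller might drive it, assuming a running event loop (asyncio.create_task requires one); note that exceptions now propagate to the caller, since _sem_wrapper no longer catches them and returns the unmodified row:

import asyncio


async def run_rollouts(rows, config):
    processor = SingleTurnRolloutProcessor()
    tasks = processor(rows, config)  # tasks start immediately, bounded by the semaphore
    results = []
    for task in asyncio.as_completed(tasks):
        # Completion order, not input order; errors surface at this await
        results.append(await task)
    return results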
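The reasoning_effort handling in process_row accepts the flag either flat in completion_params or nested inside extra_body, and always forwards it under extra_body so LiteLLM routes it to the provider. Restated as a standalone function for illustration (a hypothetical helper, not part of the diff):

from typing import Any, Dict


def normalize_reasoning_effort(completion_params: Dict[str, Any], request_params: Dict[str, Any]) -> None:
    effort_val = None
    if "reasoning_effort" in completion_params:
        effort_val = str(completion_params["reasoning_effort"])  # flat shape
    elif isinstance(completion_params.get("extra_body"), dict) and "reasoning_effort" in completion_params["extra_body"]:
        effort_val = str(completion_params["extra_body"]["reasoning_effort"])  # nested shape
    if effort_val:
        request_params.setdefault("extra_body", {})
        request_params["extra_body"]["reasoning_effort"] = effort_val
        request_params.pop("reasoning_effort", None)  # never leave it at the top level


# Either input shape lands under extra_body:
req = {"reasoning_effort": "high"}
normalize_reasoning_effort({"reasoning_effort": "high"}, req)
assert req == {"extra_body": {"reasoning_effort": "high"}}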