268 changes: 133 additions & 135 deletions eval_protocol/adapters/huggingface.py

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions eval_protocol/adapters/langfuse.py
@@ -4,11 +4,11 @@
to EvaluationRow format for use in evaluation pipelines.
"""

from typing import Any, Dict, Iterator, List, Optional
from datetime import datetime
import logging
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional

from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams
from eval_protocol.models import EvaluationRow, InputMetadata, Message

logger = logging.getLogger(__name__)

@@ -277,20 +277,20 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe
InputMetadata object
"""
# Extract completion parameters from observations
completion_params = CompletionParams()
completion_params = {}

# Look for model parameters in observations
for obs in observations:
if hasattr(obs, "model") and obs.model:
completion_params.model = obs.model
completion_params["model"] = obs.model
if hasattr(obs, "model_parameters") and obs.model_parameters:
params = obs.model_parameters
if "temperature" in params:
completion_params.temperature = params["temperature"]
completion_params["temperature"] = params["temperature"]
if "max_tokens" in params:
completion_params.max_tokens = params["max_tokens"]
completion_params["max_tokens"] = params["max_tokens"]
if "top_p" in params:
completion_params.top_p = params["top_p"]
completion_params["top_p"] = params["top_p"]
break

# Create dataset info from trace metadata
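A minimal sketch (not part of the diff) of the adapter's new dict-based parameter collection; collect_completion_params is a hypothetical helper name, and the observations are assumed to be Langfuse observation-like objects with optional model and model_parameters attributes, mirroring the loop above.

from typing import Any, Dict, List

def collect_completion_params(observations: List[Any]) -> Dict[str, Any]:
    # Gather the common generation parameters into a plain dict, matching the
    # adapter's switch away from the CompletionParams model.
    completion_params: Dict[str, Any] = {}
    for obs in observations:
        if getattr(obs, "model", None):
            completion_params["model"] = obs.model
        model_parameters = getattr(obs, "model_parameters", None)
        if model_parameters:
            for key in ("temperature", "max_tokens", "top_p"):
                if key in model_parameters:
                    completion_params[key] = model_parameters[key]
            break  # first observation with model parameters wins, as above
    return completion_params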
9 changes: 7 additions & 2 deletions eval_protocol/benchmarks/suites/aime25.py
@@ -60,13 +60,18 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]:

@export_benchmark("aime25")
@evaluation_test(
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
input_dataset=[
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl",
"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl",
],
dataset_adapter=aime2025_dataset_adapter,
rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}],
completion_params=[
{
"max_tokens": 131000,
"extra_body": {"reasoning_effort": "low"},
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
}
],
rollout_processor=default_single_turn_rollout_processor,
aggregation_method="mean",
passed_threshold=None,
10 changes: 7 additions & 3 deletions eval_protocol/benchmarks/suites/gpqa.py
@@ -55,6 +55,7 @@ def _extract_abcd_letter(text: str) -> str | None:

_GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv()


def _strip_gt_messages(msgs: List[Message]) -> List[Message]:
return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]

@@ -67,16 +68,19 @@ async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) ->
if gt_tokens:
gt_val = gt_tokens[-1].split(":", 1)[1].strip()
r.ground_truth = gt_val
r.messages = [m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))]
r.messages = [
m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))
]
processed.append(r)
return await default_single_turn_rollout_processor(processed, config)


@export_benchmark("gpqa")
@evaluation_test(
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
input_messages=_GPQA_INPUT_MESSAGES,
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
completion_params=[
{"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}
],
rollout_processor=gpqa_strip_gt_rollout_processor,
aggregation_method="mean",
passed_threshold=None,
39 changes: 18 additions & 21 deletions eval_protocol/benchmarks/suites/livebench_data_analysis.py
@@ -1,20 +1,19 @@
from typing import Any, Dict, List, Optional

import json
import re
from typing import Any, Dict, List, Optional

from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
from eval_protocol.pytest.default_single_turn_rollout_process import (
default_single_turn_rollout_processor,
)
from eval_protocol.pytest.evaluation_test import evaluation_test
from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark


# -------------------------
# Lightweight ports of LiveBench scoring utilities for data_analysis tasks
# -------------------------


def _lb_clean_text(text: str) -> str:
text = text.lower().strip()
text = re.sub(r"[^\w]", "", text)
@@ -36,9 +35,7 @@ def _cta_process_results(ground_truth: str, llm_answer: str) -> int:
boxed = _extract_last_boxed_segment(parsed_answer)
if boxed is not None:
parsed_answer = boxed
parsed_answer = (
parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")
)
parsed_answer = parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "")

gt_clean = _lb_clean_text(ground_truth)
ans_clean = _lb_clean_text(parsed_answer)
@@ -132,17 +129,15 @@ def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float:
return round((2 * tp) / denom, 2)


def _tablereformat_process_results(
input_command: str, ground_truth: str, llm_answer: str, version: str
) -> int:
def _tablereformat_process_results(input_command: str, ground_truth: str, llm_answer: str, version: str) -> int:
try:
import pandas as pd # type: ignore
except Exception:
return 0

from io import StringIO
import math as _math
import traceback as _traceback
from io import StringIO

def _read_df_v1(df_type: str, df_str: str):
if df_type == "json":
@@ -252,8 +247,12 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]):
)
else:
lines = input_command.split("\n")
input_fmt = [l for l in lines if "Source Format" in l][-1].split("Source Format: ")[-1].strip().lower()
output_fmt = [l for l in lines if "Target Format" in l][-1].split("Target Format: ")[-1].strip().lower()
input_fmt = (
[line for line in lines if "Source Format" in line][-1].split("Source Format: ")[-1].strip().lower()
)
output_fmt = (
[line for line in lines if "Target Format" in line][-1].split("Target Format: ")[-1].strip().lower()
)

reader = _read_df_v1 if version == "v1" else _read_df_v2
gt_df = reader(output_fmt, ground_truth)
@@ -373,9 +372,9 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:

@export_benchmark("live_bench/data_analysis/cta")
@evaluation_test(
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
input_messages=[[m for m in r.messages] for r in _CTA_ROWS],
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor=default_single_turn_rollout_processor,
aggregation_method="mean",
passed_threshold=None,
@@ -416,9 +415,9 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow:

@export_benchmark("live_bench/data_analysis/tablejoin")
@evaluation_test(
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS],
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor=default_single_turn_rollout_processor,
aggregation_method="mean",
passed_threshold=None,
@@ -460,9 +459,9 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow:

@export_benchmark("live_bench/data_analysis/tablereformat")
@evaluation_test(
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS],
rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}],
rollout_processor=default_single_turn_rollout_processor,
aggregation_method="mean",
passed_threshold=None,
@@ -508,5 +507,3 @@ def livebench_tablereformat_pointwise(row: EvaluationRow) -> EvaluationRow:
"live_bench/data_analysis/tablereformat",
],
)


11 changes: 8 additions & 3 deletions eval_protocol/benchmarks/suites/tau_bench_retail.py
@@ -11,7 +11,7 @@
from typing import Any, Dict, List

from eval_protocol.benchmarks.registry import export_benchmark
from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message
from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
from eval_protocol.pytest import evaluation_test
from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor
from vendor.tau2.data_model.message import (
@@ -66,8 +66,13 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
@evaluation_test(
input_dataset=["tests/pytest/data/retail_dataset.jsonl"],
dataset_adapter=tau_bench_retail_to_evaluation_row,
model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"],
rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}],
completion_params=[
{
"temperature": 0.8,
"extra_body": {"reasoning_effort": "medium"},
"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
}
],
rollout_processor=default_mcp_gym_rollout_processor,
rollout_processor_kwargs={"domain": "retail"},
num_runs=8,
9 changes: 6 additions & 3 deletions eval_protocol/dataset_logger/__init__.py
@@ -1,11 +1,14 @@
from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter
import os

from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter

# Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs.
if os.getenv("EP_SQLITE_LOG", "0").strip() == "1":
if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() == "1":
default_logger = SqliteDatasetLoggerAdapter()
else:
class _NoOpLogger:

class _NoOpLogger(DatasetLogger):
def log(self, row):
return None

14 changes: 7 additions & 7 deletions eval_protocol/mcp/execution/manager.py
@@ -20,7 +20,7 @@
from vendor.tau2.data_model.message import AssistantMessage, UserMessage
from vendor.tau2.user.user_simulator import UserSimulator

from ...models import CompletionParams, EvaluationRow, InputMetadata, Message
from ...models import EvaluationRow, InputMetadata, Message
from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory

if TYPE_CHECKING:
@@ -128,12 +128,12 @@ async def _execute_with_semaphore(idx):
evaluation_row.messages = messages
evaluation_row.tools = shared_tool_schema
evaluation_row.usage = CompletionUsage(**trajectory.usage)
evaluation_row.input_metadata.completion_params = CompletionParams(
model=policy.model_id,
temperature=getattr(policy, "temperature", None),
max_tokens=getattr(policy, "max_tokens", None),
max_tool_calls=getattr(policy, "max_tools_per_turn", None),
)
evaluation_row.input_metadata.completion_params = {
"model": policy.model_id,
"temperature": getattr(policy, "temperature", None),
"max_tokens": getattr(policy, "max_tokens", None),
"max_tool_calls": getattr(policy, "max_tools_per_turn", None),
}

if trajectory.terminated:
if trajectory.termination_reason == TerminationReason.ERROR:
23 changes: 15 additions & 8 deletions eval_protocol/models.py
@@ -1,6 +1,6 @@
import os
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional, Union
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union

from openai.types import CompletionUsage
from openai.types.chat.chat_completion_message import (
@@ -178,13 +178,18 @@ def __iter__(self):
return iter(self.__fields__.keys()) # Changed to __fields__


class CompletionParams(BaseModel):
"""Configuration for the language model used in the session."""
CompletionParams = Dict[str, Any]
"""
Common set of completion parameters that most model providers support in their
API. Set total=False to allow extra fields since LiteLLM + providers have their
own set of parameters. The following parameters are common fields that are
populated.

model: str = Field(..., description="Model identifier (e.g., 'gpt-4.1', 'fireworks/llama')")
temperature: Optional[float] = Field(None, description="Temperature setting for model generation")
max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
max_tool_calls: Optional[int] = Field(None, description="Maximum tool calls per turn")
model: str
temperature: Optional[float]
max_tokens: Optional[int]
top_p: Optional[float]
"""


class InputMetadata(BaseModel):
@@ -193,7 +198,9 @@ class InputMetadata(BaseModel):
model_config = ConfigDict(extra="allow")

row_id: Optional[str] = Field(default_factory=generate_id, description="Unique string to ID the row")
completion_params: Optional[CompletionParams] = Field(None, description="Completion endpoint parameters used")
completion_params: CompletionParams = Field(
default_factory=dict, description="Completion endpoint parameters used"
)
dataset_info: Optional[Dict[str, Any]] = Field(
None, description="Dataset row details: seed, system_prompt, environment_context, etc"
)
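A brief usage sketch (illustrative, not from the diff) of the dict-based CompletionParams on InputMetadata, assuming the remaining InputMetadata fields keep their defaults; provider-specific keys such as extra_body pass through because the alias is a plain Dict[str, Any].

from eval_protocol.models import InputMetadata

metadata = InputMetadata(
    completion_params={
        "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
        "temperature": 0.0,
        "max_tokens": 1024,
        "top_p": 1.0,
        # Extra keys are allowed; nothing validates against a fixed schema.
        "extra_body": {"reasoning_effort": "low"},
    },
)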
4 changes: 3 additions & 1 deletion eval_protocol/pytest/default_agent_rollout_processor.py
@@ -125,7 +125,9 @@ async def default_agent_rollout_processor(

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with agent rollout."""
agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger)
agent = Agent(
model=config.completion_params.model, row=row, config_path=config.mcp_config_path, logger=config.logger
)
try:
await agent.setup()
await agent.call_agent()
8 changes: 4 additions & 4 deletions eval_protocol/pytest/default_mcp_gym_rollout_processor.py
@@ -216,10 +216,10 @@ async def default_mcp_gym_rollout_processor(
server.start()

policy = ep.LiteLLMPolicy(
model_id=config.model,
temperature=config.input_params.get("temperature", 0.0),
max_tokens=config.input_params.get("max_tokens", 4096),
reasoning_effort=config.input_params.get("reasoning_effort", None),
model_id=config.completion_params.model,
temperature=config.completion_params.get("temperature", 0.0),
max_tokens=config.completion_params.get("max_tokens", 4096),
reasoning_effort=config.completion_params.get("reasoning_effort", None),
)

# Create MCP environments directly from evaluation_rows
20 changes: 10 additions & 10 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -41,20 +41,20 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

messages_payload = [{"role": m.role, "content": m.content} for m in row.messages]

request_params = {"model": config.model, "messages": messages_payload, **config.input_params}
request_params = {"messages": messages_payload, **config.completion_params}
# Ensure caching is disabled only for this request (review feedback)
request_params["cache"] = {"no-cache": True}
# Single-level reasoning effort: expect `reasoning_effort` only
effort_val = None
if isinstance(config.input_params, dict):
if "reasoning_effort" in config.input_params:
effort_val = str(config.input_params["reasoning_effort"]) # flat shape
elif (
isinstance(config.input_params.get("extra_body"), dict)
and "reasoning_effort" in config.input_params["extra_body"]
):
# Accept if user passed it directly inside extra_body
effort_val = str(config.input_params["extra_body"]["reasoning_effort"]) # already in extra_body

if "reasoning_effort" in config.completion_params:
effort_val = str(config.completion_params["reasoning_effort"]) # flat shape
elif (
isinstance(config.completion_params.get("extra_body"), dict)
and "reasoning_effort" in config.completion_params["extra_body"]
):
# Accept if user passed it directly inside extra_body
effort_val = str(config.completion_params["extra_body"]["reasoning_effort"]) # already in extra_body

if effort_val:
# Always under extra_body so LiteLLM forwards to provider-specific param set
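A standalone sketch of the reasoning-effort normalization performed above (normalize_reasoning_effort is an illustrative helper name, not the module's API): the value may arrive flat in completion_params or nested under extra_body, and always ends up under extra_body so LiteLLM forwards it to the provider.

from typing import Any, Dict

def normalize_reasoning_effort(completion_params: Dict[str, Any]) -> Dict[str, Any]:
    params = dict(completion_params)
    effort = params.get("reasoning_effort")
    if effort is None:
        extra_body = params.get("extra_body")
        if isinstance(extra_body, dict):
            effort = extra_body.get("reasoning_effort")
    if effort is not None:
        extra_body = dict(params.get("extra_body") or {})
        extra_body["reasoning_effort"] = str(effort)
        params["extra_body"] = extra_body
        params.pop("reasoning_effort", None)
    return params

# Example: normalize_reasoning_effort({"model": "m", "reasoning_effort": "low"})
# returns {"model": "m", "extra_body": {"reasoning_effort": "low"}}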