From db2785bdebfc6318392e4ef7101bc48084223031 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Tue, 12 Aug 2025 17:07:01 -0700 Subject: [PATCH 01/26] bug fixes (#69) --- eval_protocol/benchmarks/registry.py | 15 +++++------- .../benchmarks/suites/tau_bench_retail.py | 1 + .../default_mcp_gym_rollout_processor.py | 8 +++---- eval_protocol/pytest/evaluation_test.py | 23 ++++++++++++++----- eval_protocol/pytest/types.py | 4 +++- .../retail_environment/retail_environment.py | 7 ++++-- 6 files changed, 36 insertions(+), 22 deletions(-) diff --git a/eval_protocol/benchmarks/registry.py b/eval_protocol/benchmarks/registry.py index 1e3b3e7b..98065b82 100644 --- a/eval_protocol/benchmarks/registry.py +++ b/eval_protocol/benchmarks/registry.py @@ -29,7 +29,6 @@ def test_aime_pointwise(row: EvaluationRow) -> EvaluationRow: import os from typing import Any, Callable, Dict, List, Optional - # Global registry: name -> callable runner _BENCHMARK_REGISTRY: Dict[str, Callable[..., Any]] = {} @@ -61,9 +60,7 @@ def export_benchmark(name: str) -> Callable[[Callable[..., Any]], Callable[..., def _decorator(test_wrapper: Callable[..., Any]) -> Callable[..., Any]: # Pull through metadata attached by evaluation_test ep_config: Dict[str, Any] = getattr(test_wrapper, "__ep_config", {}) - original_test_func: Optional[Callable[..., Any]] = getattr( - test_wrapper, "__ep_original_test_func", None - ) + original_test_func: Optional[Callable[..., Any]] = getattr(test_wrapper, "__ep_original_test_func", None) def _runner( *, @@ -87,6 +84,7 @@ def _runner( # Fireworks OpenAI-compatible endpoint expects extra_body.reasoning_effort, not nested reasoning dict merged.setdefault("extra_body", {})["reasoning_effort"] = str(reasoning_effort) if input_params_override: + def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: for k, v in over.items(): if isinstance(v, dict) and isinstance(base.get(k), dict): @@ -94,6 +92,7 @@ def 
_deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: else: base[k] = v return base + merged = _deep_update(merged, dict(input_params_override)) if merged: os.environ["EP_INPUT_PARAMS_JSON"] = json.dumps(merged) @@ -108,15 +107,14 @@ def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: models: List[str] = ep_config.get("model") or [] model_to_use = model or (models[0] if models else None) if not model_to_use: - raise ValueError( - f"No model provided and none captured from evaluation_test for benchmark '{name}'" - ) + raise ValueError(f"No model provided and none captured from evaluation_test for benchmark '{name}'") input_messages = ep_config.get("input_messages") input_dataset = ep_config.get("input_dataset") dataset_adapter = ep_config.get("dataset_adapter") rollout_input_params_list = ep_config.get("rollout_input_params") rollout_processor = ep_config.get("rollout_processor") + rollout_processor_kwargs = ep_config.get("rollout_processor_kwargs") aggregation_method = ep_config.get("aggregation_method") threshold = ep_config.get("threshold_of_success") default_num_runs = ep_config.get("num_runs") @@ -149,6 +147,7 @@ def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: dataset_adapter=dataset_adapter, rollout_input_params=rollout_params, rollout_processor=rollout_processor, + rollout_processor_kwargs=rollout_processor_kwargs, aggregation_method=aggregation_method, threshold_of_success=threshold, num_runs=(num_runs if num_runs is not None else default_num_runs), @@ -170,5 +169,3 @@ def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: return test_wrapper return _decorator - - diff --git a/eval_protocol/benchmarks/suites/tau_bench_retail.py b/eval_protocol/benchmarks/suites/tau_bench_retail.py index 51beab0b..9e1104d4 100644 --- a/eval_protocol/benchmarks/suites/tau_bench_retail.py +++ b/eval_protocol/benchmarks/suites/tau_bench_retail.py @@ -69,6 +69,7 @@ def 
tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor_kwargs={"domain": "retail"}, num_runs=8, mode="pointwise", max_concurrent_rollouts=50, diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 0adbbea0..5037cbad 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -20,10 +20,10 @@ class MCPServerManager: _active_servers = [] _cleanup_registered = False - def __init__(self, server_script: str, port: int = 8000, domain: str = "airline"): + def __init__(self, server_script: str, port: int = 8000, **kwargs): self.server_script = server_script self.port = port - self.domain = domain + self.domain = str(kwargs.get("domain", "airline")) self.process: Optional[subprocess.Popen] = None self.base_dir = Path(".").resolve() self._log_file = None @@ -58,7 +58,7 @@ def start(self) -> None: env["PORT"] = str(self.port) # Start server process (no domain argument needed for tau2_mcp server) - cmd = ["python", self.server_script, "--port", str(self.port)] + cmd = ["python", self.server_script, "--port", str(self.port), "--domain", self.domain] # Setup log file with cleanup log_file_path = os.path.join(self.base_dir, f"server_output_{self.domain}_{self.port}.log") @@ -213,7 +213,7 @@ async def default_mcp_gym_rollout_processor( """ if config.server_script_path is None: raise ValueError("server_script_path is required for default_mcp_gym_rollout_processor") - server = MCPServerManager(config.server_script_path, port=9700) + server = MCPServerManager(config.server_script_path, port=9700, **(config.kwargs or {})) try: server.start() diff --git 
a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 7557ae3d..f1d9af50 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -30,6 +30,7 @@ RolloutInputParam, RolloutProcessor, RolloutProcessorConfig, + RolloutProcessorInputParam, TestFunction, ) from eval_protocol.pytest.utils import ( @@ -53,6 +54,7 @@ def evaluation_test( # noqa: C901 rollout_input_params: Optional[List[RolloutInputParam]] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None, + rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, aggregation_method: AggregationMethod = "mean", passed_threshold: Optional[Union[EvaluationThreshold, float]] = None, num_runs: int = 1, @@ -114,6 +116,7 @@ def evaluation_test( # noqa: C901 rollout_input_params: Generation parameters for the rollout. rollout_processor: Function used to perform the rollout. evaluation_test_kwargs: Kwargs for the evaluation function. + rollout_processor_kwargs: Kwargs for the rollout processor. aggregation_method: How to aggregate scores across rows. passed_threshold: Threshold configuration for test success. Success rate must be above success, and if set, standard deviation must be below standard_deviation. 
@@ -399,6 +402,7 @@ def _log_eval_error( server_script_path=server_script_path, steps=steps, logger=active_logger, + kwargs=rollout_processor_kwargs, ) for i in range(num_runs): @@ -765,6 +769,7 @@ def dual_mode_wrapper(*args, **kwargs): "rollout_input_params": rollout_input_params, "rollout_processor": rollout_processor, "evaluation_test_kwargs": evaluation_test_kwargs, + "rollout_processor_kwargs": rollout_processor_kwargs, "aggregation_method": aggregation_method, "passed_threshold": passed_threshold, "num_runs": num_runs, @@ -832,6 +837,7 @@ def run_evaluation_test_direct( dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter, rollout_input_params: Optional[RolloutInputParam] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, + rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, aggregation_method: AggregationMethod = "mean", threshold_of_success: Optional[float] = None, num_runs: int = 1, @@ -941,6 +947,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: max_concurrent_rollouts=max_concurrent_rollouts, server_script_path=server_script_path, steps=steps, + kwargs=rollout_processor_kwargs, ) all_results: List[EvaluationRow] = [] @@ -1022,8 +1029,8 @@ def _deep_update_dict(base: dict, override: dict) -> dict: if summary_path: import json as _json import pathlib as _pathlib - import time as _time import re as _re + import time as _time def _sanitize_filename(text: str) -> str: safe = _re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) @@ -1039,7 +1046,11 @@ def _extract_effort_tag(params: dict) -> str | None: return str(eb["reasoning"]["effort"]).lower() if "reasoning_effort" in eb: return str(eb["reasoning_effort"]).lower() - if "reasoning" in params and isinstance(params["reasoning"], dict) and "effort" in params["reasoning"]: + if ( + "reasoning" in params + and isinstance(params["reasoning"], dict) + and "effort" in params["reasoning"] + ): return 
str(params["reasoning"]["effort"]).lower() except Exception: return None @@ -1069,9 +1080,9 @@ def _extract_effort_tag(params: dict) -> str | None: pass if threshold_of_success is not None and not passed: - assert agg_score >= threshold_of_success, ( - f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}" - ) + assert ( + agg_score >= threshold_of_success + ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}" return {"summary": summary_obj, "results": all_results} except Exception: @@ -1079,7 +1090,7 @@ def _extract_effort_tag(params: dict) -> str | None: if eval_metadata is not None: eval_metadata.status = "error" eval_metadata.passed = False - for r in (data or []): + for r in data or []: if r.eval_metadata is not None: r.eval_metadata.status = "error" r.eval_metadata.passed = False diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index 42fb3d56..c6de681e 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -2,7 +2,7 @@ Parameter types """ -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, Callable, Dict, List, Literal, Optional from eval_protocol.dataset_logger import default_logger @@ -15,6 +15,7 @@ RolloutInputParam = Dict[str, Any] InputMessagesParam = List[Message] EvaluationInputParam = Dict[str, Any] +RolloutProcessorInputParam = Dict[str, Any] Dataset = List[EvaluationRow] @@ -49,6 +50,7 @@ class RolloutProcessorConfig: max_concurrent_rollouts: int = 8 # maximum number of concurrent rollouts steps: int = 30 # max number of rollout steps logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs + kwargs: Dict[str, Any] = field(default_factory=dict) # any additional kwargs to pass to the rollout processor RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]] diff --git a/examples/tau2_mcp/retail_environment/retail_environment.py 
b/examples/tau2_mcp/retail_environment/retail_environment.py index 122fc92e..425ef785 100644 --- a/examples/tau2_mcp/retail_environment/retail_environment.py +++ b/examples/tau2_mcp/retail_environment/retail_environment.py @@ -30,11 +30,14 @@ class RetailEnvironment: def __init__(self, config: Optional[Dict[str, Any]] = None): self.config = config or {} - self.db = RetailDB.load(RETAIL_DB_PATH) - self.retail_tools = RetailTools(self.db) + self.db = None + self.airline_tools = None def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]: """Reset the environment to initial state""" + self.db = RetailDB.load(RETAIL_DB_PATH) + self.retail_tools = RetailTools(self.db) + return {}, {} def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]: From a6e3709b827572d56850455576882b20a5c5ac01 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Tue, 12 Aug 2025 20:42:05 -0700 Subject: [PATCH 02/26] remove old template folder (#52) --- .../plan_forkable_filesystem_rl_scenario.md | 6 ++--- .../mcp_agent_filesystem_rl/test_example.py | 24 ++----------------- .../source_dir/file_to_move.txt | 1 - .../fs_move_scenario/target_dir/.gitkeep | 1 - .../fs_rl_example_scenario/archive/.gitkeep | 1 - .../source_files/important_document.txt | 1 - 6 files changed, 5 insertions(+), 29 deletions(-) delete mode 100644 mcp_agent_test_templates/fs_move_scenario/source_dir/file_to_move.txt delete mode 100644 mcp_agent_test_templates/fs_move_scenario/target_dir/.gitkeep delete mode 100644 mcp_agent_test_templates/fs_rl_example_scenario/archive/.gitkeep delete mode 100644 mcp_agent_test_templates/fs_rl_example_scenario/source_files/important_document.txt diff --git a/development/notes/plan_forkable_filesystem_rl_scenario.md b/development/notes/plan_forkable_filesystem_rl_scenario.md index f9f729ad..d594e81c 100644 --- a/development/notes/plan_forkable_filesystem_rl_scenario.md +++ 
b/development/notes/plan_forkable_filesystem_rl_scenario.md @@ -57,9 +57,9 @@ This strategy is preferred over `docker commit` for `mcp/filesystem` because the ### 4.1. Prepare Host Template Directory -* Create a directory structure on the host, e.g.: +* Create a directory structure on the host at a path of your choice, e.g.: ``` - ./mcp_agent_test_templates/fs_move_scenario/ + /path/to/fs_move_scenario/ ├── source_dir/ │ └── file_to_move.txt (contains "Hello from source") └── target_dir/ (empty) @@ -76,7 +76,7 @@ This strategy is preferred over `docker commit` for `mcp/filesystem` because the mcp_transport: "stdio" docker_image: "mcp/filesystem" container_command: ["/data"] # Served directory inside container - template_data_path_host: "./mcp_agent_test_templates/fs_move_scenario/" # Path to host template + template_data_path_host: "/path/to/fs_move_scenario/" # Path to host template # container_volumes can be omitted or will be overridden if template_data_path_host is used for filesystem type ``` diff --git a/examples/mcp_agent_filesystem_rl/test_example.py b/examples/mcp_agent_filesystem_rl/test_example.py index 25ace7f1..ef7c5f73 100644 --- a/examples/mcp_agent_filesystem_rl/test_example.py +++ b/examples/mcp_agent_filesystem_rl/test_example.py @@ -3,7 +3,7 @@ Test script for MCP Agent Filesystem RL Example This script verifies that the example setup is working correctly by: -1. Testing the template directory structure +1. Testing the dataset format 2. Testing the reward function with mock data 3. 
Testing MCP server connectivity (if running) """ @@ -17,32 +17,12 @@ # Add the eval-protocol package to the path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) -from eval_protocol.models import ( # Ensure EvaluateResult is imported +from eval_protocol.models import ( EvaluateResult, Message, ) -def test_template_structure(): - """Test that the template directory has the correct structure.""" - print("Testing template directory structure...") - - # Construct path relative to this test file, then go to project root and find the template - base_path = Path(__file__).parent.parent.parent - template_path = base_path / "mcp_agent_test_templates" / "fs_rl_example_scenario" - - # Check directories exist - assert template_path.exists(), f"Template directory not found: {template_path}" - assert (template_path / "source_files").exists(), "source_files directory missing" - assert (template_path / "archive").exists(), "archive directory missing" - - # Check important_document.txt exists - important_doc = template_path / "source_files" / "important_document.txt" - assert important_doc.exists(), "important_document.txt missing from source_files" - - print("✓ Template directory structure is correct") - - def test_dataset_format(): """Test that the dataset file is correctly formatted.""" print("Testing dataset format...") diff --git a/mcp_agent_test_templates/fs_move_scenario/source_dir/file_to_move.txt b/mcp_agent_test_templates/fs_move_scenario/source_dir/file_to_move.txt deleted file mode 100644 index 2aeede53..00000000 --- a/mcp_agent_test_templates/fs_move_scenario/source_dir/file_to_move.txt +++ /dev/null @@ -1 +0,0 @@ -Hello from source diff --git a/mcp_agent_test_templates/fs_move_scenario/target_dir/.gitkeep b/mcp_agent_test_templates/fs_move_scenario/target_dir/.gitkeep deleted file mode 100644 index e0e72a2a..00000000 --- a/mcp_agent_test_templates/fs_move_scenario/target_dir/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -# This file is a placeholder to ensure the 
target_dir is created. diff --git a/mcp_agent_test_templates/fs_rl_example_scenario/archive/.gitkeep b/mcp_agent_test_templates/fs_rl_example_scenario/archive/.gitkeep deleted file mode 100644 index b4ffa907..00000000 --- a/mcp_agent_test_templates/fs_rl_example_scenario/archive/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -# Keep this directory diff --git a/mcp_agent_test_templates/fs_rl_example_scenario/source_files/important_document.txt b/mcp_agent_test_templates/fs_rl_example_scenario/source_files/important_document.txt deleted file mode 100644 index 6f2124b8..00000000 --- a/mcp_agent_test_templates/fs_rl_example_scenario/source_files/important_document.txt +++ /dev/null @@ -1 +0,0 @@ -This is an important document that needs to be archived for safekeeping. From a9e7009ce6e7afc1bbe3fc56e96685b6cdab98c8 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Tue, 12 Aug 2025 21:01:30 -0700 Subject: [PATCH 03/26] fix failure reason (#70) * fix failure reason * update --- eval_protocol/mcp/execution/manager.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index aec867fc..5664e5ac 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -14,6 +14,7 @@ from dataclasses import asdict from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +import anyio from openai.types import CompletionUsage from vendor.tau2.data_model.message import AssistantMessage, UserMessage @@ -464,11 +465,19 @@ async def _execute_rollout( ) except asyncio.CancelledError: - logger.error(f"🚨 AsyncIO Cancel Error in roll out {rollout_idx}", exc_info=True) failure_reason = "asyncio context cancelled" + logger.error( + f"🚨 Error in rollout {session.dataset_row.id} {rollout_idx}: {failure_reason}", exc_info=True + ) + except (anyio.ClosedResourceError, anyio.BrokenResourceError): + failure_reason = "anyioconnection/resource error" + 
logger.error( + f"🚨 Error in rollout {session.dataset_row.id} {rollout_idx}: {failure_reason}", exc_info=True + ) except Exception as e: - logger.error(f"🚨 Error in rollout {rollout_idx}: {e}", exc_info=True) - failure_reason = str(e) + error_msg = str(e) if str(e) else f"{type(e).__name__}: Unexpected error" + logger.error(f"🚨 Error in rollout {session.dataset_row.id} {rollout_idx}: {error_msg}", exc_info=True) + failure_reason = error_msg finally: if failure_reason: trajectory.terminated = True From e2198ac6861d03ce61137f4ccba87d47ea3518f1 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Tue, 12 Aug 2025 21:27:34 -0700 Subject: [PATCH 04/26] add live bench (#68) * add live bench * fix live bench and rollout processor --- eval_protocol/benchmarks/registry.py | 161 +++++- eval_protocol/benchmarks/suites/aime25.py | 1 + eval_protocol/benchmarks/suites/gpqa.py | 26 +- .../suites/livebench_data_analysis.py | 512 ++++++++++++++++++ eval_protocol/dataset_logger/__init__.py | 14 +- .../sqlite_evaluation_row_store.py | 3 +- 6 files changed, 708 insertions(+), 9 deletions(-) create mode 100644 eval_protocol/benchmarks/suites/livebench_data_analysis.py diff --git a/eval_protocol/benchmarks/registry.py b/eval_protocol/benchmarks/registry.py index 98065b82..31840fd1 100644 --- a/eval_protocol/benchmarks/registry.py +++ b/eval_protocol/benchmarks/registry.py @@ -126,7 +126,7 @@ def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: server_script_path = ep_config.get("server_script_path") steps = ep_config.get("steps") mode = ep_config.get("mode") - combine_datasets = ep_config.get("combine_datasets") + # combine_datasets captured but not used here # Choose the first rollout param set by default rollout_params = None @@ -169,3 +169,162 @@ def _deep_update(base: Dict[str, Any], over: Dict[str, Any]) -> Dict[str, Any]: return test_wrapper return _decorator + + +def register_composite_benchmark(name: str, 
children: List[str]) -> None: + """ + Register a composite benchmark that runs multiple exported benchmarks and aggregates results. + + The composite runner forwards common overrides to each child benchmark and aggregates + a combined score as a rows-weighted mean of each child's aggregated score. + + Args: + name: Name of the composite benchmark to register. + children: List of child benchmark names previously registered via export_benchmark. + """ + + def _composite_runner( + *, + model: Optional[str] = None, + print_summary: bool = False, + out: Optional[str] = None, + reasoning_effort: Optional[str] = None, + max_rows: Optional[int | str] = None, + num_runs: Optional[int] = None, + input_params_override: Optional[Dict[str, Any]] = None, + max_concurrency: Optional[int] = None, + ) -> Dict[str, Any]: + # Resolve child runners at call-time to ensure all suites are imported + # Local import avoided to prevent circular import at module import time + _get_benchmark_runner = get_benchmark_runner + import pathlib as _pathlib + import time as _time + _json = json + + child_summaries: List[Dict[str, Any]] = [] + total_rows = 0 + weighted_sum = 0.0 + # For per-metric aggregation across children + metric_weighted_sums: Dict[str, float] = {} + metric_total_rows: Dict[str, int] = {} + combined_rows: List[Any] = [] + + # If 'out' is a file path, also compute a directory for child artifacts + child_out_dir: Optional[str] = None + if out: + p = _pathlib.Path(out) + if p.suffix.lower() == ".json" and not str(out).endswith("/"): + # Use parent directory for child artifacts + child_out_dir = str(p.parent) + else: + child_out_dir = out + + for child_name in children: + runner = _get_benchmark_runner(child_name) + result = runner( + model=model, + print_summary=print_summary, + out=child_out_dir, + reasoning_effort=reasoning_effort, + max_rows=max_rows, + num_runs=num_runs, + input_params_override=input_params_override, + max_concurrency=max_concurrency, + ) + summary = (result or 
{}).get("summary") if isinstance(result, dict) else None + if not summary: + continue + # Gather underlying rows to recompute CI across children + try: + rows_obj = result.get("results") if isinstance(result, dict) else None + if isinstance(rows_obj, list): + combined_rows.extend(rows_obj) + except Exception: + pass + child_summaries.append(summary) + rows = int(summary.get("rows", 0) or 0) + agg = summary.get("agg_score") + if isinstance(agg, (int, float)) and rows > 0: + total_rows += rows + weighted_sum += float(agg) * rows + # Combine per-metric means if available + metrics_agg = summary.get("metrics_agg") or {} + if isinstance(metrics_agg, dict): + for m_name, m_vals in metrics_agg.items(): + m_mean = m_vals.get("mean") + if isinstance(m_mean, (int, float)) and rows > 0: + metric_weighted_sums[m_name] = metric_weighted_sums.get(m_name, 0.0) + float(m_mean) * rows + metric_total_rows[m_name] = metric_total_rows.get(m_name, 0) + rows + + combined_agg = (weighted_sum / total_rows) if total_rows > 0 else None + # Compute 95% CI for combined rows if available + ci_low: Optional[float] = None + ci_high: Optional[float] = None + if combined_rows: + try: + from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci as _compute_ci + + r = _compute_ci(combined_rows) + if r and len(r) >= 3 and r[1] is not None and r[2] is not None: + ci_low = float(r[1]) + ci_high = float(r[2]) + except Exception: + ci_low = None + ci_high = None + combined_metrics: Dict[str, Dict[str, float]] = {} + for m_name, wsum in metric_weighted_sums.items(): + denom = metric_total_rows.get(m_name, 0) + if denom > 0: + combined_metrics[m_name] = {"mean": float(wsum / denom)} + combined = { + "suite": name, + "model": model, + "agg_score": float(combined_agg) if combined_agg is not None else None, + "rows": total_rows, + "children": child_summaries, + "num_runs": num_runs, + **({"metrics_agg": combined_metrics} if combined_metrics else {}), + **({"agg_ci_low": ci_low, 
"agg_ci_high": ci_high} if (ci_low is not None and ci_high is not None) else {}), + } + + # Optional print and persist + # Respect either function arg or EP_PRINT_SUMMARY env + _should_print = print_summary or (os.getenv("EP_PRINT_SUMMARY") == "1") + if _should_print: + try: + if combined_agg is not None: + if ci_low is not None and ci_high is not None: + print( + f"EP Summary | suite={name} model={model} agg={combined['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] rows={total_rows}" + ) + else: + print( + f"EP Summary | suite={name} model={model} agg={combined['agg_score']:.3f} rows={total_rows}" + ) + else: + print( + f"EP Summary | suite={name} model={model} agg=None rows={total_rows}" + ) + except Exception: + pass + + if out: + out_path = _pathlib.Path(out) + if out_path.suffix.lower() == ".json" and not str(out).endswith("/"): + # Write to the specified file + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w", encoding="utf-8") as f: + _json.dump({**combined, "timestamp": int(_time.time())}, f) + else: + # Treat as directory + dir_path = out_path + dir_path.mkdir(parents=True, exist_ok=True) + safe_name = name.replace("/", "__") + file_path = dir_path / f"{safe_name}__composite.json" + with open(file_path, "w", encoding="utf-8") as f: + _json.dump({**combined, "timestamp": int(_time.time())}, f) + + return {"summary": combined} + + # Register (overwrite if exists) + _BENCHMARK_REGISTRY[name] = _composite_runner diff --git a/eval_protocol/benchmarks/suites/aime25.py b/eval_protocol/benchmarks/suites/aime25.py index 406ee74b..4a5d3a4c 100644 --- a/eval_protocol/benchmarks/suites/aime25.py +++ b/eval_protocol/benchmarks/suites/aime25.py @@ -69,6 +69,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", + passed_threshold=None, 
num_runs=8, max_dataset_rows=2, max_concurrent_rollouts=4, diff --git a/eval_protocol/benchmarks/suites/gpqa.py b/eval_protocol/benchmarks/suites/gpqa.py index 2024d202..91620c9a 100644 --- a/eval_protocol/benchmarks/suites/gpqa.py +++ b/eval_protocol/benchmarks/suites/gpqa.py @@ -39,8 +39,6 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]: [ Message(role="system", content=SYSTEM_PROMPT), Message(role="user", content=user_content), - # Correct answer is always option A by construction - Message(role="system", content="__GT__:A"), ] ) if not messages_list: @@ -57,14 +55,31 @@ def _extract_abcd_letter(text: str) -> str | None: _GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv() +def _strip_gt_messages(msgs: List[Message]) -> List[Message]: + return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))] + + +async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> List[EvaluationRow]: + """Preprocess rows to set ground_truth and remove __GT__ messages, then delegate to default processor.""" + processed: List[EvaluationRow] = [] + for r in rows: + gt_tokens = [m.content for m in r.messages if m.role == "system" and (m.content or "").startswith("__GT__:")] + if gt_tokens: + gt_val = gt_tokens[-1].split(":", 1)[1].strip() + r.ground_truth = gt_val + r.messages = [m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))] + processed.append(r) + return await default_single_turn_rollout_processor(processed, config) + @export_benchmark("gpqa") @evaluation_test( model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_GPQA_INPUT_MESSAGES, rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=gpqa_strip_gt_rollout_processor, aggregation_method="mean", + passed_threshold=None, num_runs=8, mode="pointwise", ) @@ -73,9 +88,8 @@ def gpqa_pointwise(row: 
EvaluationRow) -> EvaluationRow: content = assistant_msgs[-1].content if assistant_msgs else "" pred = _extract_abcd_letter(content or "") - # Retrieve GT from the trailing system message we appended - gt_tokens = [m.content for m in row.messages if m.role == "system" and (m.content or "").startswith("__GT__:")] - gt = gt_tokens[-1].split(":", 1)[1].strip() if gt_tokens else None + # GPQA diamond CSV constructs options so that the correct answer is always A + gt = "A" is_valid = pred is not None and gt in {"A", "B", "C", "D"} score = 1.0 if (is_valid and pred == gt) else 0.0 diff --git a/eval_protocol/benchmarks/suites/livebench_data_analysis.py b/eval_protocol/benchmarks/suites/livebench_data_analysis.py new file mode 100644 index 00000000..1c04b6fd --- /dev/null +++ b/eval_protocol/benchmarks/suites/livebench_data_analysis.py @@ -0,0 +1,512 @@ +from typing import Any, Dict, List, Optional + +import json +import re + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult +from eval_protocol.pytest.default_single_turn_rollout_process import ( + default_single_turn_rollout_processor, +) +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark + + +# ------------------------- +# Lightweight ports of LiveBench scoring utilities for data_analysis tasks +# ------------------------- + +def _lb_clean_text(text: str) -> str: + text = text.lower().strip() + text = re.sub(r"[^\w]", "", text) + return text + + +def _extract_last_boxed_segment(text: str) -> Optional[str]: + # Extract the last occurrence of \\boxed{...} or \\framebox{...} + pattern = r"\\(?:boxed|framebox)\{(.*?)\}" + matches = re.findall(pattern, text, re.DOTALL) + if not matches: + return None + return matches[-1] + + +def _cta_process_results(ground_truth: str, llm_answer: str) -> int: + parsed_answer = llm_answer + if "\\boxed{" in parsed_answer or "\\framebox{" in 
parsed_answer: + boxed = _extract_last_boxed_segment(parsed_answer) + if boxed is not None: + parsed_answer = boxed + parsed_answer = ( + parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "") + ) + + gt_clean = _lb_clean_text(ground_truth) + ans_clean = _lb_clean_text(parsed_answer) + if gt_clean == ans_clean: + return 1 + # Suffix match to handle answers like "... answer: XYZ" + if len(ans_clean) >= len(gt_clean) and ans_clean[-len(gt_clean) :] == gt_clean: + return 1 + return 0 + + +def _tj_clean_llm_output(s: str) -> Dict[str, Any]: + # Try to extract the last ... + m = re.findall(r"(.*?)", s, re.DOTALL) + if len(m) > 0: + return _tj_clean_llm_output(m[-1].strip()) + + candidate: Optional[str] = None + # Prefer code blocks (python/json/any) + for fence in ("```python", "```json", "```"): + mm = re.findall(r"%s(.*?)```" % re.escape(fence), s.replace("\n", ""), re.MULTILINE) + if mm: + candidate = mm[-1] + break + # Fallback to boxed + if candidate is None and "\\boxed" in s: + boxed = _extract_last_boxed_segment(s.replace("\n", "")) + if boxed: + # Convert \text{"str"} to 'str' and strip backslashes + candidate = re.sub(r"\\text{['\"](.*?)['\"]}", r"'\1'", boxed).replace("\\", "") + if candidate is None: + candidate = s + + # Make JSON-like to python literal + candidate = candidate.replace("null", "None") + try: + from ast import literal_eval + + parsed = literal_eval(candidate) + if not isinstance(parsed, dict): + return {} + # Drop None values + for k in list(parsed.keys()): + if parsed[k] is None: + del parsed[k] + return parsed + except Exception: + return {} + + +def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float: + import json as _json + from ast import literal_eval + + # Parse GT into dict if needed + gt: Dict[str, Any] + if isinstance(ground_truth, str): + try: + gt = literal_eval(ground_truth) + except Exception: + try: + gt = _json.loads(ground_truth) + except Exception: + return 0.0 + else: + gt = 
dict(ground_truth) + + pred = _tj_clean_llm_output(llm_answer) + if len(pred) == 0: + return 0.0 + + tp = 0 + fp = 0 + fn = 0 + for k, v in pred.items(): + gt_v = gt.get(k, None) + if gt_v is None: + fp += 1 + elif gt_v == v: + tp += 1 + else: + fp += 1 + fn += 1 + for k, v in gt.items(): + if k not in pred: + fn += 1 + denom = (2 * tp) + fp + fn + if denom == 0: + return 0.0 + # Round to 2 decimals to mirror LiveBench + return round((2 * tp) / denom, 2) + + +def _tablereformat_process_results( + input_command: str, ground_truth: str, llm_answer: str, version: str +) -> int: + try: + import pandas as pd # type: ignore + except Exception: + return 0 + + from io import StringIO + import math as _math + import traceback as _traceback + + def _read_df_v1(df_type: str, df_str: str): + if df_type == "json": + for orient in ("index", "records", "records", "table", "values"): + try: + return pd.read_json(StringIO(df_str), orient=orient) + except Exception: + pass + return pd.read_json(StringIO(df_str), orient="values") + if df_type == "jsonl": + return pd.read_json(StringIO(df_str), orient="records", lines=True) + if df_type == "html": + return pd.concat(pd.read_html(StringIO(df_str)), axis=0) + if df_type == "csv": + return pd.read_csv(StringIO(df_str)) + if df_type == "markdown": + return pd.read_table(StringIO(df_str), sep="|", header=0, index_col=1, skipinitialspace=True) + if df_type == "tsv": + return pd.read_csv(StringIO(df_str), sep="\t") + raise ValueError(f"Unsupported type {df_type}") + + def _read_df_v2(df_type: str, df_str: str): + if df_type == "json": + for orient in ("table", "index", "records"): + try: + return pd.read_json(StringIO(df_str), orient=orient) + except Exception: + pass + return None + if df_type == "jsonl": + return pd.read_json(StringIO(df_str), orient="records", lines=True) + if df_type == "html": + return pd.concat(pd.read_html(StringIO(df_str)), axis=0) + if df_type == "csv": + return pd.read_csv(StringIO(df_str)) + if df_type == 
"markdown": + # Remove alignment line + lines = df_str.strip().split("\n") + header = lines[0] + data_lines = lines[2:] if len(lines) > 2 else [] + processed = header + "\n" + "\n".join(data_lines) + df = pd.read_table(StringIO(processed), sep="|", header=0, skipinitialspace=True).iloc[:, 1:-1] + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].astype(str).str.strip() + return df + if df_type == "tsv": + return pd.read_csv(StringIO(df_str), sep="\t") + raise ValueError(f"Unsupported type {df_type}") + + def _clean_llm_output(s: str) -> str: + m = re.findall(r"```json\n(.*?)```", s, re.DOTALL) + if m: + return m[-1].strip() + m = re.findall(r"```html\n(.*?)```", s, re.DOTALL) + if m: + return m[-1].strip() + s = re.sub(r"^```.*\n", "", s) + s = s.replace("&", "&") + return s.replace("```", "").strip() + + def _remove_initial_phrase(text: str) -> str: + return re.sub(r"^\s*(Here|Input)\b.*?\b(format|table)\s*[:)]\s*", "", text, flags=re.IGNORECASE).strip() + + def _read_sep_table_from_text(text: str, header: str, sep: str): + text = text.strip() + lines = text.split("\n") + header_line = 0 + while header_line < len(lines) and lines[header_line].strip() != header.strip(): + header_line += 1 + if header_line == len(lines) or lines[header_line].strip() != header.strip(): + return None + table = lines[header_line:] + parsed = None + while parsed is None and table: + try: + parsed = pd.read_csv(StringIO("\n".join(table)), sep=sep) + except Exception: + table = table[:-1] + return parsed + + def _read_jsonl_table_from_text(text: str, header_cols: List[str]): + rows = [] + for line in text.strip().split("\n"): + if len(line) < 2 or line[0] != "{" or line[-1] != "}": + continue + if not all(col in line for col in header_cols): + continue + try: + rows.append(json.loads(line)) + except Exception: + continue + if not rows: + return None + import pandas as _pd + + return _pd.DataFrame(rows) + + # Determine formats from the instruction + if version == 
"v1": + input_fmt = input_command.split("Please convert the Input Table from ")[1].split(" format")[0].lower() + output_fmt = ( + input_command.split("Please convert the Input Table from ")[1] + .split("format to ")[1] + .split(" format")[0] + .lower() + ) + else: + lines = input_command.split("\n") + input_fmt = [l for l in lines if "Source Format" in l][-1].split("Source Format: ")[-1].strip().lower() + output_fmt = [l for l in lines if "Target Format" in l][-1].split("Target Format: ")[-1].strip().lower() + + reader = _read_df_v1 if version == "v1" else _read_df_v2 + gt_df = reader(output_fmt, ground_truth) + + llm_clean = _clean_llm_output(llm_answer) + llm_clean = _remove_initial_phrase(llm_clean) + try: + llm_df = reader(output_fmt, llm_clean) + except Exception: + llm_df = None + if output_fmt in ("csv", "tsv") and gt_df is not None: + header = (",", "\t")[output_fmt == "tsv"].join(list(gt_df.columns)) + llm_df = _read_sep_table_from_text(llm_clean, header, sep="," if output_fmt == "csv" else "\t") + elif output_fmt == "jsonl" and gt_df is not None: + llm_df = _read_jsonl_table_from_text(llm_clean, list(gt_df.columns)) + if llm_df is None: + return 0 + + # Compare + try: + gt_df.columns = [str(s).strip() for s in gt_df.columns] + if "index" in gt_df.columns: + gt_df = gt_df.drop(columns=["index"]) + llm_df.columns = [str(s).strip() for s in llm_df.columns] + if "index" in llm_df.columns: + llm_df = llm_df.drop(columns=["index"]) + assert len(llm_df) == len(gt_df) + assert sorted(llm_df.columns) == sorted(gt_df.columns) + for i in range(len(llm_df)): + for key in llm_df.columns: + lv = llm_df.iloc[i][key] + gv = gt_df.iloc[i][key] + if isinstance(lv, str): + lv = lv.strip() + if isinstance(gv, str): + gv = gv.strip() + # Numeric tolerance for floats + try: + lvf = float(lv) + gvf = float(gv) + if _math.isnan(lvf) and _math.isnan(gvf): + continue + assert abs(lvf - gvf) < 1e-6 + except Exception: + assert str(lv) == str(gv) + except AssertionError: + return 0 
+ except Exception: + # Silent on failure, match LiveBench robustness + _traceback.print_exc() + return 0 + return 1 + + +# ------------------------- +# Dataset loading from Hugging Face at import time +# ------------------------- + +SYSTEM_PROMPT = "You are a helpful data analyst. Read the task and answer precisely." + + +def _load_livebench_da_messages(task_name: str) -> List[EvaluationRow]: + try: + from datasets import load_dataset # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "The 'datasets' package is required for LiveBench Data Analysis benchmarks. Please 'pip install datasets'." + ) from e + + ds = load_dataset("livebench/data_analysis", split="test") + rows: List[EvaluationRow] = [] + for ex in ds: + if str(ex.get("task", "")) != task_name: + continue + question_text = str(ex.get("turns", [""])[0]) + ground_truth = ex.get("ground_truth") + release = ex.get("livebench_release_date", "") + try: + gt_payload = json.dumps({"ground_truth": ground_truth, "release": release}, ensure_ascii=False) + except TypeError: + gt_payload = json.dumps({"ground_truth": str(ground_truth), "release": str(release)}) + rows.append( + EvaluationRow( + messages=[ + Message(role="system", content=SYSTEM_PROMPT), + Message(role="user", content=question_text), + ], + ground_truth=gt_payload, + ) + ) + if not rows: + raise RuntimeError(f"No rows found for LiveBench data_analysis task '{task_name}'") + return rows + + +def _extract_gt(row: EvaluationRow) -> Dict[str, Any]: + # For LiveBench Data Analysis, we fetch the ground truth from the HF dataset + # and store it in the top-level ground_truth field in the adapter below. + # Here, just parse row.ground_truth if it contains a JSON payload, else string. 
+ if row.ground_truth is None: + return {"ground_truth": None, "release": None} + try: + payload = json.loads(row.ground_truth) + if isinstance(payload, dict): + return payload + except Exception: + pass + return {"ground_truth": row.ground_truth, "release": None} + + +# ------------------------- +# CTA +# ------------------------- + +_CTA_ROWS = _load_livebench_da_messages("cta") + + +@export_benchmark("live_bench/data_analysis/cta") +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=[[m for m in r.messages] for r in _CTA_ROWS], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + passed_threshold=None, + num_runs=4, + mode="pointwise", +) +def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow: + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + payload = _extract_gt(row) + gt = payload.get("ground_truth") + gt_str = str(gt) if gt is not None else "" + + score_val = float(_cta_process_results(gt_str, content or "")) if gt_str else 0.0 + is_valid = bool(gt_str) + + row.evaluation_result = EvaluateResult( + score=score_val, + reason=("Matched" if score_val == 1.0 else "Not matched"), + is_score_valid=is_valid, + metrics={ + "exact_match": MetricResult( + score=score_val, + is_score_valid=is_valid, + reason=("Exact/suffix match" if score_val == 1.0 else "Mismatch"), + ) + }, + ) + return row + + +# ------------------------- +# Table Join +# ------------------------- + +_TABLEJOIN_ROWS = _load_livebench_da_messages("tablejoin") + + +@export_benchmark("live_bench/data_analysis/tablejoin") +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + 
rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + passed_threshold=None, + num_runs=4, + mode="pointwise", +) +def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow: + user_msgs = [m for m in row.messages if m.role == "user"] + question = user_msgs[-1].content if user_msgs else "" + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + payload = _extract_gt(row) + gt = payload.get("ground_truth") + + score_val = float(_tablejoin_process_results(gt, content or "")) + is_valid = True + + row.evaluation_result = EvaluateResult( + score=score_val, + reason=f"F1 score: {score_val:.2f}", + is_score_valid=is_valid, + metrics={ + "f1": MetricResult( + score=score_val, + is_score_valid=is_valid, + reason="Entity/relation mapping F1", + ) + }, + ) + return row + + +# ------------------------- +# Table Reformat +# ------------------------- + +_TABLEREFORMAT_ROWS = _load_livebench_da_messages("tablereformat") + + +@export_benchmark("live_bench/data_analysis/tablereformat") +@evaluation_test( + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS], + rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + rollout_processor=default_single_turn_rollout_processor, + aggregation_method="mean", + passed_threshold=None, + num_runs=4, + mode="pointwise", +) +def livebench_tablereformat_pointwise(row: EvaluationRow) -> EvaluationRow: + user_msgs = [m for m in row.messages if m.role == "user"] + question = user_msgs[-1].content if user_msgs else "" + assistant_msgs = [m for m in row.messages if m.role == "assistant"] + content = assistant_msgs[-1].content if assistant_msgs else "" + payload = _extract_gt(row) + gt = payload.get("ground_truth") + release = payload.get("release") or "" + version = "v2" if str(release) >= "2025-04-25" else "v1" + + gt_str = 
str(gt) if gt is not None else "" + score_int = _tablereformat_process_results(question or "", gt_str, content or "", version) + score_val = float(score_int) + is_valid = bool(gt_str) + + row.evaluation_result = EvaluateResult( + score=score_val, + reason=("Table matches" if score_val == 1.0 else "Table mismatch"), + is_score_valid=is_valid, + metrics={ + "structure_exact": MetricResult( + score=score_val, + is_score_valid=is_valid, + reason="Exact structure and values match", + ) + }, + ) + return row + + +# Register a composite benchmark that aggregates all three LiveBench Data Analysis tests +register_composite_benchmark( + name="live_bench/data_analysis", + children=[ + "live_bench/data_analysis/cta", + "live_bench/data_analysis/tablejoin", + "live_bench/data_analysis/tablereformat", + ], +) + + diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py index d60fe513..9478ec6f 100644 --- a/eval_protocol/dataset_logger/__init__.py +++ b/eval_protocol/dataset_logger/__init__.py @@ -1,3 +1,15 @@ from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter +import os -default_logger = SqliteDatasetLoggerAdapter() +# Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs. 
+if os.getenv("EP_SQLITE_LOG", "0").strip() == "1": + default_logger = SqliteDatasetLoggerAdapter() +else: + class _NoOpLogger: + def log(self, row): + return None + + def read(self, rollout_id=None): + return [] + + default_logger = _NoOpLogger() diff --git a/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py b/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py index a8f149a8..6ab0bb8e 100644 --- a/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py +++ b/eval_protocol/dataset_logger/sqlite_evaluation_row_store.py @@ -30,7 +30,8 @@ class EvaluationRow(BaseModel): # type: ignore self._EvaluationRow = EvaluationRow self._db.connect() - self._db.create_tables([EvaluationRow]) + # Use safe=True to avoid errors when tables/indexes already exist + self._db.create_tables([EvaluationRow], safe=True) @property def db_path(self) -> str: From 7317549e091dd3bf1233890010d760dafad164bb Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Tue, 12 Aug 2025 22:16:58 -0700 Subject: [PATCH 05/26] remove http rollout old code (#71) --- development/notes/frozen_lake_context.md | 33 -- development/notes/frozen_lake_plan.md | 93 --- development/notes/http_rollout.md | 38 -- eval_protocol/agent/orchestrator.py | 3 - eval_protocol/agent/resources/__init__.py | 25 - .../agent/resources/http_rollout_protocol.py | 85 --- .../agent/resources/http_rollout_resource.py | 325 ----------- examples/frozen_lake/README.md | 152 ----- examples/frozen_lake/analyze_trajectory.py | 273 --------- examples/frozen_lake/client/dataset.jsonl | 3 - examples/frozen_lake/client/reward.py | 126 ---- examples/frozen_lake/client/task_def.yaml | 46 -- .../gymnasium_frozen_lake_server.py | 281 --------- examples/frozen_lake/run_full_evaluation.sh | 318 ---------- examples/frozen_lake/server/README.md | 187 ------ .../frozen_lake/server/http_rollout_server.py | 104 ---- tests/test_frozen_lake_http_server.py | 269 --------- 
tests/test_frozen_lake_seed_evaluation.py | 541 ------------------ 18 files changed, 2902 deletions(-) delete mode 100644 development/notes/frozen_lake_context.md delete mode 100644 development/notes/frozen_lake_plan.md delete mode 100644 development/notes/http_rollout.md delete mode 100644 eval_protocol/agent/resources/http_rollout_protocol.py delete mode 100644 eval_protocol/agent/resources/http_rollout_resource.py delete mode 100644 examples/frozen_lake/README.md delete mode 100755 examples/frozen_lake/analyze_trajectory.py delete mode 100644 examples/frozen_lake/client/dataset.jsonl delete mode 100644 examples/frozen_lake/client/reward.py delete mode 100644 examples/frozen_lake/client/task_def.yaml delete mode 100644 examples/frozen_lake/gymnasium_frozen_lake_server.py delete mode 100755 examples/frozen_lake/run_full_evaluation.sh delete mode 100644 examples/frozen_lake/server/README.md delete mode 100644 examples/frozen_lake/server/http_rollout_server.py delete mode 100644 tests/test_frozen_lake_http_server.py delete mode 100644 tests/test_frozen_lake_seed_evaluation.py diff --git a/development/notes/frozen_lake_context.md b/development/notes/frozen_lake_context.md deleted file mode 100644 index 49c85152..00000000 --- a/development/notes/frozen_lake_context.md +++ /dev/null @@ -1,33 +0,0 @@ -# Frozen Lake Implementation Context - -This document provides context for the implementation of the Frozen Lake example within the `eval-protocol` framework. - -## High-Level Goal - -The primary objective is to create a robust and reproducible reinforcement learning environment for the Frozen Lake game. This involves allowing an LLM-based agent to interact with the game, and critically, enabling a data-driven approach to evaluations where initial conditions (like random seeds) are controlled by a dataset. 
- -## Core Components - -The implementation is distributed across several key files: - -- **`examples/frozen_lake/client/dataset.jsonl`**: The source of truth for evaluation runs. Each line defines a scenario, specifying the `seed` for the environment's initial state. -- **`examples/frozen_lake/client/task_def.yaml`**: The main configuration file for the task. It points to the dataset and defines how many rollouts to perform for each sample in the dataset. -- **`examples/frozen_lake/server/http_rollout_server.py`**: A FastAPI server that wraps the Frozen Lake game logic, exposing it via an HTTP API that the `eval-protocol` agent can interact with. -- **`examples/frozen_lake/gymnasium_frozen_lake_server.py`**: The core game logic, which wraps the official `gymnasium` Frozen Lake environment. It is responsible for accepting a `seed` to create a deterministic starting state. -- **`examples/frozen_lake/client/reward.py`**: A reward function that evaluates the agent's performance based on the outcome of the game (e.g., reaching the goal). -- **`eval_protocol/agent/`**: The core agent framework, including the `TaskManager` and `Orchestrator`, which together manage the data-driven execution of rollouts based on the task definition and dataset. - -## Data-Driven Rollout Flow - -The evaluation process follows a clear, data-driven flow: - -1. The **TaskManager** reads the `task_def.yaml`. -2. It loads the scenarios from the specified `dataset.jsonl` file. -3. For each scenario (i.e., each `seed` in the dataset), it schedules `num_rollouts_per_sample` rollouts. -4. For each individual rollout, the **Orchestrator** is invoked with the specific `seed`. -5. The **Orchestrator** passes the `seed` to the **HttpRolloutResource**. -6. The **HttpRolloutResource** sends the `seed` in a request to the `/start_episode` endpoint of the **http_rollout_server**. -7. The server uses the `seed` to initialize the **GymnasiumFrozenLakeGame** in a deterministic state. -8. 
The agent then plays the game, and the final outcome is evaluated by the reward function. - -This architecture ensures that evaluations are reproducible and that the agent's performance can be measured across a controlled set of initial conditions. diff --git a/development/notes/frozen_lake_plan.md b/development/notes/frozen_lake_plan.md deleted file mode 100644 index 2ae3005f..00000000 --- a/development/notes/frozen_lake_plan.md +++ /dev/null @@ -1,93 +0,0 @@ -# Frozen Lake Example Plan: Data-Driven Rollouts - -This document outlines the plan for refactoring the Frozen Lake example to use a data-driven evaluation workflow. The goal is to make the system more robust, extensible, and aligned with standard practices in reinforcement learning research. - -The core principle is to treat the initial conditions of an environment (like a random seed) as data. Each row in a dataset will define a specific scenario, and the framework will run a configurable number of rollouts for each scenario. - -### 1. The Dataset (`dataset.jsonl`) - -The foundation of this new approach is a dataset file that defines the experimental conditions. - -- **Action:** Create a new dataset file at `examples/frozen_lake/client/dataset.jsonl`. -- **Format:** Each line in the file will be a JSON object representing a single experimental sample. Initially, this will just contain a unique `id` and a `seed`. -- **Example Content:** - ```json - {"id": "run_001", "seed": 42} - {"id": "run_002", "seed": 123} - {"id": "run_003", "seed": 555} - {"id": "run_004", "seed": 678} - ``` - -### 2. The Task Definition (`task_def.yaml`) - -The task definition will be updated to reference the dataset and specify how many rollouts (`N`) to perform for each sample. - -- **File to Modify:** `examples/frozen_lake/client/task_def.yaml` -- **Changes:** - - Remove the old `num_rollouts` field. - - Add `dataset_path` to point to our new `dataset.jsonl` file. - - Add `num_rollouts_per_sample` to define `N`. 
-- **Example:** - ```yaml - name: "frozen_lake_http_rollout" - description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout" - - # Data-driven configuration - dataset_path: "examples/frozen_lake/client/dataset.jsonl" - num_rollouts_per_sample: 5 # This is 'N', the number of rollouts per seed - - # Resource configuration remains the same - resource_type: "http_rollout" - # ... (rest of the file) - ``` - -### 3. Core Framework Modifications - -The following changes will plumb the `seed` from the dataset through the framework to the game environment. - -1. **Data Model (`eval_protocol/models.py`):** - - Update `TaskDefinitionModel` to include `dataset_path: Optional[str]` and `num_rollouts_per_sample: int`. - -2. **TaskManager (`eval_protocol/agent/task_manager.py`):** - - Modify the `execute_tasks` method to load samples from the `dataset_path`. - - For each sample, generate `num_rollouts_per_sample` rollout jobs. - - Pass the sample data (containing the `seed`) for each job down to the `Orchestrator`. - -3. **Orchestrator (`eval_protocol/agent/orchestrator.py`):** - - Modify `execute_task_poc` to accept `sample_data` as a parameter. - - Pass this data to the resource's `initialize` method: `await episode_resource.initialize(**sample_data)`. - -4. **HTTP Rollout Resource (`eval_protocol/agent/resources/http_rollout_resource.py`):** - - The `initialize` method will accept `**kwargs`. - - These `kwargs` (the `sample_data`) will be sent as the JSON body of the POST request to the `/start_episode` endpoint. - -5. **HTTP Rollout Server & Protocol:** - - The `/start_episode` endpoint in `examples/frozen_lake/server/http_rollout_server.py` will be updated to accept a JSON request body. - - It will pass the entire request body as keyword arguments to the `GymnasiumFrozenLakeGame` constructor: `game = FrozenLakeGame(**request_data)`. 
- - The `StartEpisodeRequest` model in `eval_protocol/agent/resources/http_rollout_protocol.py` will be updated to allow arbitrary extra fields. - -6. **Gymnasium Game (`examples/frozen_lake/gymnasium_frozen_lake_server.py`):** - - The `__init__` method of `GymnasiumFrozenLakeGame` will be changed to accept `**kwargs`. - - The `reset` method will use the `seed` from these arguments to initialize the environment deterministically: `self.env.reset(seed=self.seed)`. - -### 4. Visualization of the Flow - -```mermaid -sequenceDiagram - participant TaskManager - participant Orchestrator - participant Resource as HttpRolloutResource - participant Server as http_rollout_server - participant Game as GymnasiumFrozenLakeGame - - TaskManager->>TaskManager: Reads dataset.jsonl - TaskManager->>Orchestrator: execute_task_poc(sample_data={"seed": 42}) - Orchestrator->>Resource: fork() - Orchestrator->>Resource: initialize(**sample_data) - Resource->>Server: POST /start_episode (body={"seed": 42}) - Server->>Game: __init__(**{"seed": 42}) - Game->>Game: self.env.reset(seed=42) - Game-->>Server: observation - Server-->>Resource: {episode_id, observation} - Resource-->>Orchestrator: (initialization complete) - Orchestrator->>Orchestrator: (proceeds with agent interaction) diff --git a/development/notes/http_rollout.md b/development/notes/http_rollout.md deleted file mode 100644 index f81678d9..00000000 --- a/development/notes/http_rollout.md +++ /dev/null @@ -1,38 +0,0 @@ -# Remote Rollout Server API - -Eval Protocol can collect reinforcement learning trajectories from an external HTTP service. -The service exposes three simple endpoints used by `RemoteHttpRolloutClient`: - -## `POST /start_episode` -Returns an `episode_id` and the initial observation. 
- -## `POST /step` -Request body: -```json -{ - "episode_id": "string", - "action": {"any": "payload"} -} -``` -Returns a JSON object: -```json -{ - "observation": {"any": "payload"}, - "is_done": false -} -``` -representing the new observation after the action and whether the episode has ended. - -## `POST /end_episode` -Request body: -```json -{"episode_id": "string"} -``` -Signals that the episode is complete. - -The Eval Protocol pipeline is responsible for invoking an -OpenAI-compatible API between steps and feeding the resulting assistant messages -back into the rollout. This illustrates how an environment can interact with an -LLM at every step while keeping model calls in the pipeline. - -A concrete example of this is the [Frozen Lake Example](./frozen_lake_plan.md), which uses a remote HTTP rollout server to play the Frozen Lake game. diff --git a/eval_protocol/agent/orchestrator.py b/eval_protocol/agent/orchestrator.py index 410baf5f..2f737e2c 100644 --- a/eval_protocol/agent/orchestrator.py +++ b/eval_protocol/agent/orchestrator.py @@ -57,7 +57,6 @@ class ChatCompletionMessageToolCall: BFCLSimAPIResource, DockerResource, FileSystemResource, - HttpRolloutResource, PythonStateResource, SQLResource, ) @@ -244,8 +243,6 @@ def _get_resource_class(self, resource_type_name: str) -> Type[ForkableResource] "FileSystemResource": FileSystemResource, "DockerResource": DockerResource, "BFCLSimAPIResource": BFCLSimAPIResource, # Add BFCLSimAPIResource to mapping - "HttpRolloutResource": HttpRolloutResource, # Add HttpRolloutResource to mapping - "http_rollout": HttpRolloutResource, # Allow lowercase alias for convenience } resource_class = mapping.get(resource_type_name) diff --git a/eval_protocol/agent/resources/__init__.py b/eval_protocol/agent/resources/__init__.py index 7f5a03b0..852e1597 100644 --- a/eval_protocol/agent/resources/__init__.py +++ b/eval_protocol/agent/resources/__init__.py @@ -7,20 +7,6 @@ from .bfcl_sim_api_resource import BFCLSimAPIResource from 
.docker_resource import DockerResource from .filesystem_resource import FileSystemResource - -# HTTP Rollout Protocol types for server implementations -from .http_rollout_protocol import ( - EndEpisodeRequest, - EndEpisodeResponse, - GameObservation, - HealthResponse, - HttpRolloutConfig, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) -from .http_rollout_resource import HttpRolloutResource from .python_state_resource import PythonStateResource from .sql_resource import SQLResource @@ -30,15 +16,4 @@ "FileSystemResource", "DockerResource", "BFCLSimAPIResource", - "HttpRolloutResource", - # HTTP Rollout Protocol - "HttpRolloutConfig", - "StartEpisodeRequest", - "StartEpisodeResponse", - "StepRequest", - "StepResponse", - "EndEpisodeRequest", - "EndEpisodeResponse", - "HealthResponse", - "GameObservation", ] diff --git a/eval_protocol/agent/resources/http_rollout_protocol.py b/eval_protocol/agent/resources/http_rollout_protocol.py deleted file mode 100644 index d6992d0f..00000000 --- a/eval_protocol/agent/resources/http_rollout_protocol.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -HTTP Rollout Protocol - Standardized types for HTTP rollout communication. - -This module defines the standard request/response models for HTTP rollout servers -and clients, ensuring consistent communication across different implementations. -""" - -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel - - -class StartEpisodeRequest(BaseModel): - """Request to start a new episode.""" - - class Config: - extra = "allow" # Allow arbitrary extra fields (like seed) - - -class StartEpisodeResponse(BaseModel): - """Response from starting a new episode.""" - - episode_id: str - observation: Dict[str, Any] - - -class StepRequest(BaseModel): - """Request to take a step in the environment.""" - - episode_id: str - action: Any # Can be int, str, dict, etc. 
depending on environment - - -class StepResponse(BaseModel): - """Response from taking a step in the environment.""" - - observation: Dict[str, Any] - is_done: bool - info: Optional[Dict[str, Any]] = None - - -class EndEpisodeRequest(BaseModel): - """Request to end an episode.""" - - episode_id: str - - -class EndEpisodeResponse(BaseModel): - """Response from ending an episode.""" - - message: str - - -class HealthResponse(BaseModel): - """Response from health check endpoint.""" - - status: str - game: Optional[str] = None - version: Optional[str] = None - - -class HttpRolloutConfig(BaseModel): - """Configuration for HTTP rollout resource.""" - - base_url: str - start_episode_endpoint: str = "/start_episode" - step_endpoint: str = "/step" - end_episode_endpoint: str = "/end_episode" - health_endpoint: str = "/health" - timeout: float = 30.0 - max_retries: int = 3 - - -# Observation structure for game environments -class GameObservation(BaseModel): - """Standard observation structure for game environments.""" - - position: Optional[List[int]] = None - current_cell: Optional[str] = None - done: bool = False - won: bool = False - visual: Optional[str] = None - message: Optional[str] = None - step_count: Optional[int] = None - max_steps: Optional[int] = None diff --git a/eval_protocol/agent/resources/http_rollout_resource.py b/eval_protocol/agent/resources/http_rollout_resource.py deleted file mode 100644 index 04be93e0..00000000 --- a/eval_protocol/agent/resources/http_rollout_resource.py +++ /dev/null @@ -1,325 +0,0 @@ -""" -HTTP Rollout Resource implementation for the agent evaluation framework. - -This resource bridges the HTTP rollout protocol with the ForkableResource interface, -allowing HTTP-based environments to be used in agent evaluations. 
-""" - -import json -import uuid -from typing import Any, Dict, List, Optional - -import httpx - -from ..resource_abc import ForkableResource -from .http_rollout_protocol import ( - EndEpisodeRequest, - GameObservation, - HttpRolloutConfig, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) - - -class HttpRolloutResource(ForkableResource): - """ - A ForkableResource implementation that communicates with HTTP rollout servers. - - This resource allows the agent evaluation framework to interact with - HTTP-based environments through a standardized rollout protocol. - """ - - def __init__(self): - """Initialize the HTTP rollout resource.""" - super().__init__() - self.config: Optional[HttpRolloutConfig] = None - self.episode_id: Optional[str] = None - self.current_observation: Optional[Dict[str, Any]] = None - self.is_episode_active = False - self.client: Optional[httpx.Client] = None - - # Set up logging - import logging - - self.logger = logging.getLogger(f"{self.__class__.__name__}") - - async def setup(self, config: Dict[str, Any]) -> None: - """ - Set up the resource with the provided configuration. - - Args: - config: Configuration dictionary from the task definition - """ - self.config = HttpRolloutConfig(**config) - self.client = httpx.Client(timeout=self.config.timeout) - - async def fork(self) -> "HttpRolloutResource": - """ - Create a new independent instance of this resource. - - For HTTP rollout, forking means creating a new resource instance - that will start its own episode when initialized. - """ - if not self.config: - raise RuntimeError("Resource not set up. Call setup() first.") - - # Create a new instance with the same config - new_resource = HttpRolloutResource() - await new_resource.setup(self.config.model_dump()) - return new_resource - - async def get_state(self) -> Dict[str, Any]: - """ - Get the current state of the resource. - - Returns the current observation and episode metadata. 
- """ - return { - "episode_id": self.episode_id, - "observation": self.current_observation, - "is_episode_active": self.is_episode_active, - "type": "http_rollout", - } - - async def initialize(self, **kwargs) -> None: - """ - Initialize the resource by starting a new episode. - Passes any provided kwargs (like seed) to the server in the request body. - """ - try: - url = f"{self.config.base_url}{self.config.start_episode_endpoint}" - - # Include any sample data (like seed) in the request body - if kwargs: - self.logger.info(f"Sending initialization data to server: {kwargs}") - response = self.client.post(url, json=kwargs) - else: - response = self.client.post(url) - response.raise_for_status() - - episode_data = response.json() - self.episode_id = episode_data["episode_id"] - self.current_observation = episode_data["observation"] - self.is_episode_active = True - - except Exception as e: - raise RuntimeError(f"Failed to start HTTP rollout episode: {e}") - - async def get_initial_state_description(self) -> str: - """ - Get a formatted description of the initial game state for the agent. - Uses the observation from start_episode to build the prompt. - """ - # Start episode to get current game state - if not self.is_episode_active: - await self.initialize() - - if not self.current_observation: - return "No initial state available." 
- - obs = self.current_observation - - # Build comprehensive game prompt - content = """🎮 FROZEN LAKE GAME - AUTONOMOUS PLAY MODE - -🎯 OBJECTIVE: Navigate from S to G without hitting H - -📋 GAME RULES: S=start, F=safe, H=hole(death), G=goal(win) - -🤖 AUTONOMOUS MODE INSTRUCTIONS: -- You are playing this game AUTONOMOUSLY until completion -- KEEP MAKING MOVES using the step tool until you reach G or hit H -- DO NOT ask for user input or wait for confirmation -- DO NOT stop after one move - continue until the game ends -- Each move should be followed immediately by another move -- Game only ends when you reach G (win) or hit H (lose) - -🎮 ACTION: Use step tool with: "left", "right", "up", or "down" - -⚡ START NOW - Make your first move and continue until the game is complete!""" - - description_parts = [content] - - if obs.get("message"): - description_parts.append(f"\nEnvironment: {obs['message']}") - - if obs.get("visual"): - description_parts.append(f"\nGame Board:\n{obs['visual']}") - - if obs.get("position"): - description_parts.append(f"\nStarting Position: {obs['position']}") - - description_parts.append("\nGame Rules:") - description_parts.append("- S = Start position") - description_parts.append("- F = Frozen (safe to step on)") - description_parts.append("- H = Hole (game over if you step here)") - description_parts.append("- G = Goal (reach this to win)") - description_parts.append("- [X] = Your current position") - - return "\n".join(description_parts) - - async def cleanup(self) -> None: - """ - Clean up the resource by ending the current episode. 
- """ - if self.is_episode_active and self.episode_id: - try: - url = f"{self.config.base_url}{self.config.end_episode_endpoint}" - response = self.client.post(url, json={"episode_id": self.episode_id}) - response.raise_for_status() - - except Exception as e: - # Log but don't raise - cleanup should be best effort - print(f"Warning: Failed to properly end episode {self.episode_id}: {e}") - - finally: - self.episode_id = None - self.current_observation = None - self.is_episode_active = False - - # Close the HTTP client - self.client.close() - - async def get_tools_spec(self) -> List[Dict[str, Any]]: - """ - Get the list of available tools for this resource. - - For HTTP rollout, this returns the 'step' tool that allows - the agent to take actions in the environment. - """ - return [ - { - "name": "step", - "description": "Take a step in the Frozen Lake game by choosing a direction to move", - "parameters": { - "type": "object", - "properties": { - "action": { - "type": "string", - "enum": ["left", "down", "right", "up"], - "description": "The direction to move in the game: 'left', 'down', 'right', or 'up'", - } - }, - "required": ["action"], - }, - } - ] - - async def step(self, action_name: str, action_params: Dict[str, Any]) -> Any: - """ - Execute a tool call on this resource. - - For HTTP rollout, this handles the 'step' tool by sending - the action to the HTTP rollout server. - """ - if not self.is_episode_active or not self.episode_id: - # If no active episode, start one first - await self.initialize() - - if action_name == "step": - action = action_params.get("action") - return await self._handle_step_tool(action) - else: - raise ValueError(f"Unknown action: {action_name}") - - async def get_observation(self) -> Any: - """ - Get the current observation from the environment. - """ - if self.current_observation: - return self.current_observation - else: - return {"message": "No observation available. 
Start an episode first."} - - async def checkpoint(self) -> Dict[str, Any]: - """ - Create a checkpoint of the current resource state. - - For HTTP rollout, this saves the episode ID and current observation. - """ - return { - "episode_id": self.episode_id, - "current_observation": self.current_observation, - "is_episode_active": self.is_episode_active, - } - - async def restore(self, state_data: Dict[str, Any]) -> None: - """ - Restore the resource state from a checkpoint. - - Note: This is limited for HTTP rollout since we can't restore - arbitrary server-side state. - """ - self.episode_id = state_data.get("episode_id") - self.current_observation = state_data.get("current_observation") - self.is_episode_active = state_data.get("is_episode_active", False) - - async def close(self) -> None: - """ - Clean up and close the resource. - """ - await self.cleanup() - - async def _handle_step_tool(self, action: Any) -> Dict[str, Any]: - """ - Handle the 'step' tool by sending an action to the HTTP rollout server. - """ - try: - # Convert string action to integer for the server - action_map = {"left": 0, "down": 1, "right": 2, "up": 3} - - if isinstance(action, str): - if action.lower() not in action_map: - raise ValueError(f"Invalid action '{action}'. 
Must be one of: left, down, right, up") - numeric_action = action_map[action.lower()] - else: - # Backward compatibility with numeric actions - numeric_action = action - - url = f"{self.config.base_url}{self.config.step_endpoint}" - step_data = {"episode_id": self.episode_id, "action": numeric_action} - - response = self.client.post(url, json=step_data) - response.raise_for_status() - - step_result = response.json() - self.current_observation = step_result["observation"] - - # If the episode is done, mark it as inactive - if step_result.get("is_done", False): - self.is_episode_active = False - - # Format the response for the agent - observation = step_result["observation"] - message = observation.get("message", "") - visual = observation.get("visual", "") - - # Create a comprehensive response - response_content = [] - if message: - response_content.append(f"Environment: {message}") - if visual: - response_content.append(f"Visual State:\n{visual}") - - # Add structured data - response_content.append(f"Position: {observation.get('position', 'unknown')}") - response_content.append(f"Done: {step_result.get('is_done', False)}") - - if step_result.get("is_done", False): - won = observation.get("won", False) - response_content.append(f"Result: {'Victory!' 
if won else 'Game Over'}") - - return {"content": [{"type": "text", "text": "\n".join(response_content)}]} - - except Exception as e: - raise RuntimeError(f"Failed to execute step: {e}") - - def __del__(self): - """Ensure cleanup on deletion.""" - if hasattr(self, "client") and self.client: - try: - self.client.close() - except Exception: - pass # Ignore cleanup errors during deletion diff --git a/examples/frozen_lake/README.md b/examples/frozen_lake/README.md deleted file mode 100644 index 803591a7..00000000 --- a/examples/frozen_lake/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# Frozen Lake Agent Evaluation - -This example demonstrates LLM agent evaluation on the Frozen Lake game using eval-protocol's HTTP rollout framework. The agent must navigate from start (S) to goal (G) while avoiding holes (H). - -## Quick Start - -### Setup -```bash -# For Fireworks AI -export FIREWORKS_API_KEY="your_fireworks_api_key" -export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b" - -# For OpenAI -export OPENAI_API_KEY="your_openai_api_key" -export MODEL_AGENT="openai/gpt-4o-mini" - -# For other providers, set appropriate API key and MODEL_AGENT -``` - -### Run Evaluation -```bash -# Batch evaluation (8 parallel rollouts) - recommended -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml - -# Single rollout for debugging -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 1 - -# Custom batch size -eval-protocol agent-eval --task-def examples/frozen_lake/client/task_def.yaml --num-rollouts 16 -``` - -### Output -```bash -Task 'frozen_lake_http_rollout' batch results: - - Rollouts: 6/8 successful - - Success rate: 75.00% - - Average score: 0.7500 ± 0.4330 - - Trajectory data saved to: client/evaluation_logs/trajectory_frozen_lake_http_rollout_20250610_143052.jsonl -``` - - -## Architecture - -``` -┌─────────────────┐ HTTP ┌──────────────────┐ -│ Client Side │ ◄─────────► │ Server Side │ -│ 
(eval-protocol) │ Rollout │ (Game Env) │ -│ │ │ │ -│ • Agent Eval │ │ • Game Logic │ -│ • Reward Func │ │ • State Mgmt │ -│ • Trajectory │ │ • HTTP API │ -└─────────────────┘ └──────────────────┘ -``` - -## Project Structure - -``` -frozen_lake/ -├── README.md # This overview -├── server/ # Game Environment (HTTP API) -│ ├── README.md # Server documentation -│ └── http_rollout_server.py # FastAPI game server -└── client/ # Agent Evaluation - ├── task_def.yaml # Task configuration (works with any model) - ├── reward.py # Reward function - └── evaluation_logs/ # Generated results & trajectories - ├── trajectory_*.jsonl # Conversation histories + tool calls - └── *_reeval_*.jsonl # Re-evaluation results -``` - -## Game Rules - -**Objective:** Navigate from S to G without falling into holes (H) - -``` -[S] F F F - F H F H - F F F H - H F F G -``` - -**Actions:** `"left"`, `"right"`, `"up"`, `"down"` - -## Trajectory Data Format - -Each trajectory JSONL file contains: - -```json -{"type": "summary", "task_id": "frozen_lake_http_rollout", "num_rollouts": 8, "success_rate": 0.75, "avg_score": 0.75} -{"type": "individual_result", "rollout_index": 0, "score": 1.0, "conversation_messages": [...], "reward_function_inputs": {...}} -``` - -### Conversation Messages -Complete OpenAI format conversation history: -- User prompts -- Assistant responses with reasoning -- Tool calls (game actions) -- Tool results (game observations) - -### Reward Function Inputs -Exact parameters passed to reward functions: -- `messages`: Full conversation history -- `state`: Game state and successful function calls -- `task_achieved`: Success/failure status -- `ground_truth`: Reference data (if available) - -## Customization - -### Custom Reward Functions -Create new reward functions and test them on existing trajectories: - -```python -# my_rewards.py -from eval_protocol.typed_interface import reward_function -from eval_protocol.models import EvaluateResult, MetricResult - -@reward_function -def 
efficiency_reward(messages, state=None, **kwargs): - # Count steps taken - step_count = len(state.get("successful_func_calls", [[]])[0]) - - # Reward fewer steps - efficiency_score = max(0.0, 1.0 - (step_count - 4) * 0.1) - - return EvaluateResult( - score=efficiency_score, - reason=f"Efficiency reward: {step_count} steps", - metrics={"efficiency": MetricResult(score=efficiency_score, reason="Step efficiency")} - ) -``` - - -## Model Performance - -| Model | Success Rate | Average Score | Best Strategy | -|-------|-------------|---------------|---------------| -| qwen3-235b-a22b | 75-100% | 0.75-1.0 | down→down→right→right→down→right | -| gpt-4o-mini | 0-25% | 0.0-0.25 | Often fails at holes | - -## Troubleshooting - -- **Connection errors**: Server auto-starts, check port conflicts -- **API key issues**: Verify MODEL_AGENT and API key are set -- **Empty trajectories**: Check `client/evaluation_logs/` directory -- **Re-evaluation errors**: Ensure reward function module path is correct - -## Next Steps - -1. **Run the example**: Start with single rollout, then batch evaluation -2. **Analyze trajectories**: Examine generated JSONL files -3. **Create custom rewards**: Implement your own scoring functions -4. **Compare approaches**: Use re-evaluation to test different strategies diff --git a/examples/frozen_lake/analyze_trajectory.py b/examples/frozen_lake/analyze_trajectory.py deleted file mode 100755 index fe6a90c0..00000000 --- a/examples/frozen_lake/analyze_trajectory.py +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/env python3 -""" -Agent Trajectory Analyzer for Frozen Lake HTTP Rollout Evaluation - -This script parses the evaluation logs and creates a human-readable -trajectory showing the agent's decision making process. 
-""" - -import json -import re -import sys -from pathlib import Path -from typing import Any, Dict, List, Optional - - -def extract_tool_calls_from_log(log_content: str) -> List[Dict[str, Any]]: - """Extract tool calls and their results from the log.""" - tool_calls = [] - - # Find all tool call patterns - tool_call_pattern = r"Attempting tool call: (\w+)\((.*?)\)" - tool_result_pattern = r"Tool '(\w+)' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)" - - tool_calls_matches = re.finditer(tool_call_pattern, log_content, re.DOTALL) - - for match in tool_calls_matches: - tool_name = match.group(1) - tool_args = match.group(2) - - # Try to parse the arguments as JSON - try: - args_dict = json.loads(tool_args) - except (json.JSONDecodeError, TypeError, ValueError): - args_dict = {"raw": tool_args} - - tool_call = {"tool_name": tool_name, "arguments": args_dict, "result": None} - - # Find the corresponding result - result_pattern = rf"Tool '{tool_name}' result: (.*?)(?=INFO:|DEBUG:|ERROR:|$)" - result_match = re.search(result_pattern, log_content[match.end() :], re.DOTALL) - - if result_match: - result_text = result_match.group(1).strip() - # Try to parse as JSON - try: - tool_call["result"] = json.loads(result_text) - except (json.JSONDecodeError, TypeError, ValueError): - tool_call["result"] = {"raw": result_text} - - tool_calls.append(tool_call) - - return tool_calls - - -def extract_agent_messages(log_content: str) -> List[Dict[str, Any]]: - """Extract the agent's reasoning and responses.""" - messages = [] - - # Find OpenAI response messages - response_pattern = r"OpenAI response message: ChatCompletionMessage\((.*?)\)" - - for match in re.finditer(response_pattern, log_content, re.DOTALL): - message_str = match.group(1) - - # Extract thinking content - think_match = re.search(r"content='(.*?)', refusal=", message_str, re.DOTALL) - if think_match: - thinking = think_match.group(1) - - # Extract tags - think_content_match = re.search(r"(.*?)", thinking, re.DOTALL) - if 
think_content_match: - thinking_content = think_content_match.group(1).strip() - else: - thinking_content = thinking - - messages.append({"type": "thinking", "content": thinking_content}) - - # Extract tool calls from the message - tool_calls_match = re.search(r"tool_calls=\[(.*?)\]", message_str, re.DOTALL) - if tool_calls_match: - messages.append({"type": "tool_calls", "content": tool_calls_match.group(1)}) - - return messages - - -def extract_game_state_changes(log_content: str) -> List[Dict[str, Any]]: - """Extract game state changes from the environment responses.""" - states = [] - - # Find environment responses - env_pattern = r"Environment: (.*?)(?=\\n|Position:|Done:)" - visual_pattern = r"Visual State:\\n(.*?)(?=\\nPosition:|\\nDone:)" - position_pattern = r"Position: (\[.*?\])" - done_pattern = r"Done: (True|False)" - - # Find all environment messages - env_matches = re.finditer( - r"Tool 'step' result:.*?Environment: (.*?)\\nVisual State:\\n(.*?)\\nPosition: (\[.*?\])\\nDone: (True|False)", - log_content, - re.DOTALL, - ) - - for i, match in enumerate(env_matches): - env_message = match.group(1) - visual_state = match.group(2) - position = match.group(3) - done = match.group(4) == "True" - - states.append( - { - "step": i + 1, - "message": env_message, - "visual_state": visual_state.replace("\\n", "\n"), - "position": position, - "done": done, - } - ) - - return states - - -def create_trajectory_report(log_file: str) -> str: - """Create a detailed trajectory report.""" - - with open(log_file, "r") as f: - log_content = f.read() - - tool_calls = extract_tool_calls_from_log(log_content) - agent_messages = extract_agent_messages(log_content) - game_states = extract_game_state_changes(log_content) - - report = [] - report.append("FROZEN LAKE AGENT TRAJECTORY ANALYSIS") - report.append("=" * 50) - report.append("") - - # Summary - report.append(f"📊 SUMMARY:") - report.append(f"• Total tool calls: {len(tool_calls)}") - report.append(f"• Total reasoning steps: 
{len([m for m in agent_messages if m['type'] == 'thinking'])}") - report.append(f"• Game state changes: {len(game_states)}") - report.append("") - - # Detailed trajectory - report.append("🎮 DETAILED TRAJECTORY:") - report.append("-" * 30) - report.append("") - - for i, tool_call in enumerate(tool_calls): - step_num = i + 1 - report.append(f"STEP {step_num}: {tool_call['tool_name'].upper()}") - report.append(f"Arguments: {tool_call['arguments']}") - - # Add corresponding game state if available - if i < len(game_states): - state = game_states[i] - report.append(f"Result: {state['message']}") - report.append(f"Position: {state['position']}") - report.append(f"Visual State:") - for line in state["visual_state"].split("\n"): - if line.strip(): - report.append(f" {line}") - report.append(f"Game Done: {state['done']}") - - report.append("") - - # Agent reasoning analysis - report.append("🧠 AGENT REASONING:") - report.append("-" * 20) - report.append("") - - thinking_messages = [m for m in agent_messages if m["type"] == "thinking"] - for i, message in enumerate(thinking_messages[:3]): # Show first 3 reasoning steps - report.append(f"REASONING STEP {i+1}:") - # Truncate long reasoning for readability - content = message["content"] - if len(content) > 500: - content = content[:500] + "...[truncated]" - report.append(content) - report.append("") - - if len(thinking_messages) > 3: - report.append(f"... 
and {len(thinking_messages) - 3} more reasoning steps") - report.append("") - - # Game progression analysis - report.append("📍 GAME PROGRESSION:") - report.append("-" * 20) - report.append("") - - positions = [] - for state in game_states: - try: - pos = eval(state["position"]) # Convert string representation to list - positions.append(pos) - except (SyntaxError, NameError, TypeError, ValueError): - positions.append(state["position"]) - - if positions: - report.append("Path taken:") - for i, pos in enumerate(positions): - if i == 0: - report.append(f" Start: {pos}") - else: - prev_pos = positions[i - 1] - direction = get_direction(prev_pos, pos) - report.append(f" Step {i}: {prev_pos} → {pos} ({direction})") - - # Final position - if positions: - final_pos = positions[-1] - # Check if reached goal (typically at [3,3]) - if final_pos == [3, 3]: - report.append(f" 🎉 GOAL REACHED at {final_pos}!") - else: - report.append(f" Final position: {final_pos}") - - return "\n".join(report) - - -def get_direction(from_pos: List[int], to_pos: List[int]) -> str: - """Determine the direction of movement.""" - if len(from_pos) != 2 or len(to_pos) != 2: - return "unknown" - - row_diff = to_pos[0] - from_pos[0] - col_diff = to_pos[1] - from_pos[1] - - if row_diff == 0 and col_diff == 1: - return "RIGHT" - elif row_diff == 0 and col_diff == -1: - return "LEFT" - elif row_diff == 1 and col_diff == 0: - return "DOWN" - elif row_diff == -1 and col_diff == 0: - return "UP" - else: - return f"DIAGONAL({row_diff},{col_diff})" - - -def main(): - if len(sys.argv) != 2: - print("Usage: python analyze_trajectory.py ") - sys.exit(1) - - log_file = sys.argv[1] - - if not Path(log_file).exists(): - print(f"Error: Log file {log_file} not found") - sys.exit(1) - - try: - report = create_trajectory_report(log_file) - - # Save to analysis file - analysis_file = str(Path(log_file).with_suffix(".analysis.txt")) - with open(analysis_file, "w") as f: - f.write(report) - - print(report) - print(f"\n📄 
Analysis saved to: {analysis_file}") - - except Exception as e: - print(f"Error analyzing trajectory: {e}") - import traceback - - traceback.print_exc() - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/examples/frozen_lake/client/dataset.jsonl b/examples/frozen_lake/client/dataset.jsonl deleted file mode 100644 index 27c5c684..00000000 --- a/examples/frozen_lake/client/dataset.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -{"id": "run_001", "seed": 42} -{"id": "run_002", "seed": 123} -{"id": "run_003", "seed": 999} diff --git a/examples/frozen_lake/client/reward.py b/examples/frozen_lake/client/reward.py deleted file mode 100644 index 7239b8aa..00000000 --- a/examples/frozen_lake/client/reward.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Reward function for the Frozen Lake evaluation task. -""" - -from typing import List, Optional - -from eval_protocol.models import EvaluateResult, Message, MetricResult, StepOutput -from eval_protocol.typed_interface import reward_function - - -@reward_function -def frozen_lake_reward(messages: List[Message], state=None, **kwargs) -> EvaluateResult: - """ - Evaluate the final message list for a success string in the Frozen Lake game. 
- - Args: - messages: List of conversation messages - state: State dictionary containing trajectory data - **kwargs: Additional keyword arguments - - Returns: - EvaluateResult with score 1.0 for success, 0.0 for failure - """ - # Check if the last message (from the game) contains success indicators - if not messages: - return EvaluateResult( - score=0.0, - reason="No messages provided", - metrics={"success": MetricResult(score=0.0, reason="No messages provided")}, - ) - - # Check all messages (especially tool responses) for game outcome - def extract_content_from_message(msg): - """Extract text content from a message, handling JSON-encoded tool responses.""" - content = msg.content - if content and isinstance(content, str): - try: - # Try to parse JSON content from tool responses - import json - - parsed_content = json.loads(content) - if isinstance(parsed_content, dict) and "content" in parsed_content: - # Extract text from tool response format - content_list = parsed_content["content"] - if isinstance(content_list, list) and len(content_list) > 0: - text_item = content_list[0] - if isinstance(text_item, dict) and "text" in text_item: - content = text_item["text"] - except (json.JSONDecodeError, KeyError, IndexError, TypeError): - # If parsing fails, use original content - pass - return content.lower() if content else "" - - # Check for success/failure indicators in all messages - success_indicators = [ - "you win", - "you reached the goal", - "congratulations", - "success", - "goal reached", - "you made it", - "victory", - ] - - failure_indicators = ["you lose", "game over", "you fell", "hole"] - - is_success = False - is_failure = False - winning_message = "" - losing_message = "" - - # Check all messages for game outcome indicators - for msg in messages: - content = extract_content_from_message(msg) - - # Check for success - for indicator in success_indicators: - if indicator in content: - is_success = True - winning_message = content[:100] - break - - # Check 
for failure - for indicator in failure_indicators: - if indicator in content: - is_failure = True - losing_message = content[:100] - break - - # Determine the score (success takes precedence over failure) - if is_success: - score = 1.0 - reason = "Successfully reached the goal in Frozen Lake" - elif is_failure: - score = 0.0 - reason = "Failed to reach the goal (fell into hole or other failure)" - else: - # If no clear success/failure indicator, check if game is still ongoing - score = 0.0 - reason = "Game outcome unclear or still in progress" - - metrics = {"success": MetricResult(score=score, reason=reason, is_score_valid=True)} - - # Extract trajectory data if available - step_outputs = None - if state and "successful_func_calls" in state: - successful_calls = state["successful_func_calls"] - step_outputs = [] - - # Convert function calls to StepOutput format - step_index = 0 - for turn_calls in successful_calls: - for call in turn_calls: - # Extract action from function call arguments - action = call.get("args", {}).get("action", "unknown") - step_outputs.append( - StepOutput( - step_index=step_index, - action=action, - base_reward=(0.1 if action != "unknown" else 0.0), # Small reward for valid actions - reason=f"Agent took action: {action}", - ) - ) - step_index += 1 - - return EvaluateResult(score=score, reason=reason, metrics=metrics, step_outputs=step_outputs) diff --git a/examples/frozen_lake/client/task_def.yaml b/examples/frozen_lake/client/task_def.yaml deleted file mode 100644 index 62ee9540..00000000 --- a/examples/frozen_lake/client/task_def.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: "frozen_lake_http_rollout" -description: "Evaluate an agent's ability to navigate a Frozen Lake environment via HTTP rollout" - -# Data-driven configuration -dataset_path: "examples/frozen_lake/client/dataset.jsonl" -num_rollouts_per_sample: 1 # This is 'N', the number of rollouts per seed - -# Resource configuration - connects to the game server -resource_type: 
"http_rollout" -base_resource_config: - base_url: "http://localhost:8080" # Will be dynamically updated by TaskManager - timeout: 30.0 - -# Resource server configuration - automatically managed by TaskManager -resource_server: - start_command: "python examples/frozen_lake/server/http_rollout_server.py --port {port}" - health_check_url: "http://localhost:{port}/health" - -# Reward function - the only client-side logic needed -reward_function_path: "examples.frozen_lake.client.reward.frozen_lake_reward" - -# Initial user message - gets extended with game state from the server -messages: - - role: "user" - content: "Start playing the game!" - -# Evaluation configuration -poc_max_turns: 20 - -# Generation configuration -generation: - enabled: true - _target_: eval_protocol.generation.generate_responses - model_name: "accounts/fireworks/models/qwen3-235b-a22b" - temperature: 0.0 - max_new_tokens: 500 - batch_size: 1 - cache: - enabled: true - -# Reward Function Configuration -reward: - function_path: "examples.frozen_lake.client.reward.frozen_lake_reward" - -# Note: The TaskManager will automatically start the server on a free port -# and update the base_url accordingly for parallel execution support diff --git a/examples/frozen_lake/gymnasium_frozen_lake_server.py b/examples/frozen_lake/gymnasium_frozen_lake_server.py deleted file mode 100644 index 6a99153e..00000000 --- a/examples/frozen_lake/gymnasium_frozen_lake_server.py +++ /dev/null @@ -1,281 +0,0 @@ -""" -Gymnasium-based Frozen Lake game server implementation. - -This implementation wraps the official Gymnasium FrozenLake-v1 environment -and provides the same interface as the hand-rolled implementation for -seamless integration with the HTTP rollout server. 
-""" - -from typing import Dict, Optional, Tuple, Union - -import gymnasium as gym -import numpy as np -from gymnasium.envs.toy_text.frozen_lake import generate_random_map - - -class GymnasiumFrozenLakeGame: - """ - Gymnasium-based Frozen Lake game implementation. - - This class wraps the Gymnasium FrozenLake-v1 environment and provides - a compatible interface with the hand-rolled implementation. - - The game is played on a 4x4 grid where: - - S: Starting position - - F: Frozen surface (safe to walk on) - - H: Hole (game over if you fall in) - - G: Goal (reach this to win) - - Actions: - - 0: Left - - 1: Down - - 2: Right - - 3: Up - """ - - def __init__( - self, - map_name: str = "4x4", - is_slippery: bool = False, - render_mode: Optional[str] = None, - seed: Optional[int] = None, - **kwargs, - ): - """ - Initialize the Gymnasium Frozen Lake game. - - Args: - map_name: Map size ("4x4" or "8x8") - only used if seed is None - is_slippery: Whether the ice is slippery (stochastic environment) - render_mode: Rendering mode for Gymnasium environment - seed: Random seed for reproducible map generation and behavior - **kwargs: Additional keyword arguments (ignored for compatibility) - """ - self.map_name = map_name - self.is_slippery = is_slippery - self.seed = seed - - # Create the Gymnasium environment - if seed is not None: - # Use random map generation with seed for reproducible boards - size = 4 if map_name == "4x4" else 8 - desc = generate_random_map(size=size, p=0.8, seed=seed) - self.env = gym.make( - "FrozenLake-v1", - desc=desc, - is_slippery=is_slippery, - render_mode=render_mode, - ) - else: - # Use fixed predefined maps - self.env = gym.make( - "FrozenLake-v1", - map_name=map_name, - is_slippery=is_slippery, - render_mode=render_mode, - ) - - # Get environment properties - self.desc = self.env.unwrapped.desc - self.nrow, self.ncol = self.desc.shape - self.nS = self.env.observation_space.n - self.nA = self.env.action_space.n - - # Find start and goal 
positions - self.start_pos = None - self.goal_pos = None - for i in range(self.nrow): - for j in range(self.ncol): - if self.desc[i, j] == b"S": - self.start_pos = (i, j) - elif self.desc[i, j] == b"G": - self.goal_pos = (i, j) - - # Initialize state tracking - self.current_state: Optional[int] = None - self.current_pos: Optional[Tuple[int, int]] = None - self.done = False - self.won = False - - self.reset() - - def _state_to_pos(self, state: int) -> Tuple[int, int]: - """Convert state number to (row, col) position.""" - return state // self.ncol, state % self.ncol - - def _pos_to_state(self, row: int, col: int) -> int: - """Convert (row, col) position to state number.""" - return row * self.ncol + col - - def reset(self) -> Dict: - """Reset the game to the starting position.""" - if self.seed is not None: - self.current_state, _ = self.env.reset(seed=self.seed) - else: - self.current_state, _ = self.env.reset() - self.current_pos = self._state_to_pos(self.current_state) - self.done = False - self.won = False - return self._get_observation() - - def step(self, action: Union[int, str]) -> Tuple[Dict, bool]: - """ - Take a step in the environment. - - Args: - action: Action to take. Can be: - - Integer: 0=left, 1=down, 2=right, 3=up - - String: "left", "down", "right", "up" - - Returns: - Tuple of (observation, done) - """ - if self.done: - return self._get_observation(), True - - # Convert string action to integer if needed - if isinstance(action, str): - action_map = {"left": 0, "down": 1, "right": 2, "up": 3} - if action.lower() not in action_map: - raise ValueError(f"Invalid action '{action}'. Must be one of: left, down, right, up") - numeric_action = action_map[action.lower()] - else: - numeric_action = action - - if not (0 <= numeric_action < self.nA): - raise ValueError(f"Invalid action: {numeric_action}. 
Must be 0-{self.nA-1}") - - # Take the step in the Gymnasium environment - new_state, reward, terminated, truncated, info = self.env.step(numeric_action) - - # Update our state tracking - self.current_state = new_state - self.current_pos = self._state_to_pos(new_state) - self.done = terminated or truncated - self.won = reward > 0 # In FrozenLake, reward=1 for reaching goal, 0 otherwise - - return self._get_observation(), self.done - - def _get_observation(self) -> Dict: - """Get the current observation.""" - if self.current_pos is None: - raise RuntimeError("Game not initialized") - row, col = self.current_pos - cell = self.desc[row, col].decode("utf-8") - - # Create a visual representation - visual = [] - for i in range(self.nrow): - row_str = "" - for j in range(self.ncol): - if (i, j) == self.current_pos: - row_str += "[" + self.desc[i, j].decode("utf-8") + "]" - else: - row_str += " " + self.desc[i, j].decode("utf-8") + " " - visual.append(row_str) - - obs = { - "position": self.current_pos, - "current_cell": cell, - "done": self.done, - "won": self.won, - "visual": "\n".join(visual), - "message": self._get_message(), - "state": self.current_state, # Add the Gymnasium state for compatibility - } - - return obs - - def _get_message(self) -> str: - """Get a descriptive message about the current state.""" - if self.done: - if self.won: - return "Congratulations! You reached the goal! You win!" - else: - return "Oh no! You fell into a hole. Game over." - else: - if self.current_pos is None: - return "Game not initialized" - row, col = self.current_pos - cell = self.desc[row, col].decode("utf-8") - return f"You are at position ({row}, {col}) on a {cell} cell. Choose your next move carefully." 
- - def close(self): - """Close the Gymnasium environment.""" - self.env.close() - - def render(self, mode: str = "human"): - """Render the environment using Gymnasium's rendering.""" - return self.env.render() - - def get_action_meanings(self): - """Get human-readable action meanings.""" - return ["Left", "Down", "Right", "Up"] - - def get_action_space_info(self): - """Get information about the action space.""" - return { - "type": "Discrete", - "n": int(self.nA), # Convert numpy int to Python int - "actions": {0: "left", 1: "down", 2: "right", 3: "up"}, - } - - def get_observation_space_info(self): - """Get information about the observation space.""" - return { - "type": "Discrete", - "n": int(self.nS), # Convert numpy int to Python int - "shape": ( - int(self.nrow), - int(self.ncol), - ), # Convert numpy ints to Python ints - "description": "State number representing position on grid", - } - - def get_environment_info(self): - """Get comprehensive environment information.""" - return { - "name": "FrozenLake-v1", - "map_name": self.map_name, - "is_slippery": self.is_slippery, - "nrow": int(self.nrow), # Convert numpy int to Python int - "ncol": int(self.ncol), # Convert numpy int to Python int - "action_space": self.get_action_space_info(), - "observation_space": self.get_observation_space_info(), - "description": [[cell.decode("utf-8") for cell in row] for row in self.desc], # Convert to strings - } - - -# Backward compatibility: alias the old class name to the new one -FrozenLakeGame = GymnasiumFrozenLakeGame - - -if __name__ == "__main__": - """Test the Gymnasium implementation.""" - print("Testing Gymnasium FrozenLake implementation...") - - # Test with deterministic environment - game = GymnasiumFrozenLakeGame(is_slippery=False) - print(f"Environment info: {game.get_environment_info()}") - - obs = game.reset() - print(f"Initial observation: {obs}") - - # Test both string and numeric actions - test_actions = ["down", "down", "right", "right", "down", "right"] 
- - for i, action in enumerate(test_actions): - print(f"\nStep {i+1}: Taking action '{action}'") - obs, done = game.step(action) - print(f"Position: {obs['position']}, Done: {done}, Won: {obs['won']}") - print(f"Message: {obs['message']}") - - if done: - if obs["won"]: - print("🎉 Success! Reached the goal!") - else: - print("💀 Failed! Fell into a hole!") - break - - game.close() - print("\nTest completed!") diff --git a/examples/frozen_lake/run_full_evaluation.sh b/examples/frozen_lake/run_full_evaluation.sh deleted file mode 100755 index c2984c89..00000000 --- a/examples/frozen_lake/run_full_evaluation.sh +++ /dev/null @@ -1,318 +0,0 @@ -#!/bin/bash - -# Complete Frozen Lake HTTP Rollout Evaluation Script -# This script demonstrates the full end-to-end HTTP rollout evaluation - -set -e # Exit on any error - -# Configuration -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -HTTP_ROLLOUT_SERVER_PORT=8082 -MAX_WAIT_TIME=30 - -# PID files to track server processes -HTTP_ROLLOUT_PID_FILE="/tmp/http_rollout_server.pid" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -log() { - echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')] $1${NC}" -} - -warn() { - echo -e "${YELLOW}[$(date +'%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}" -} - -error() { - echo -e "${RED}[$(date +'%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" -} - -info() { - echo -e "${BLUE}[$(date +'%Y-%m-%d %H:%M:%S')] INFO: $1${NC}" -} - -# Cleanup function -cleanup() { - log "Cleaning up servers..." 
- - # Kill HTTP rollout server - if [ -f "$HTTP_ROLLOUT_PID_FILE" ]; then - HTTP_ROLLOUT_PID=$(cat "$HTTP_ROLLOUT_PID_FILE") - if kill -0 "$HTTP_ROLLOUT_PID" 2>/dev/null; then - log "Stopping HTTP rollout server (PID: $HTTP_ROLLOUT_PID)" - kill "$HTTP_ROLLOUT_PID" 2>/dev/null || true - sleep 2 - kill -9 "$HTTP_ROLLOUT_PID" 2>/dev/null || true - fi - rm -f "$HTTP_ROLLOUT_PID_FILE" - fi - - # Kill any remaining Python processes for our servers - pkill -f "http_rollout_server.py" 2>/dev/null || true - - log "Cleanup complete" -} - -# Set up signal handlers -trap cleanup EXIT INT TERM - -# Function to wait for a server to be ready -wait_for_server() { - local url=$1 - local name=$2 - local max_wait=$3 - - log "Waiting for $name to be ready at $url..." - - for i in $(seq 1 $max_wait); do - if curl -s -f "$url" > /dev/null 2>&1; then - log "$name is ready!" - return 0 - fi - sleep 1 - done - - error "$name failed to start within $max_wait seconds" - return 1 -} - -# Check prerequisites -check_prerequisites() { - info "Checking prerequisites..." - - # Check if reward-kit is available - if ! python -c "import eval_protocol" 2>/dev/null; then - error "reward-kit not installed or not in Python path" - exit 1 - fi - - # Check if required files exist - if [ ! -f "$SCRIPT_DIR/client/task_def.yaml" ]; then - error "task_def.yaml not found in $SCRIPT_DIR" - exit 1 - fi - - if [ ! 
-f "$SCRIPT_DIR/server/http_rollout_server.py" ]; then - error "http_rollout_server.py not found in $SCRIPT_DIR" - exit 1 - fi - - # Check if FIREWORKS_API_KEY is set - if [ -z "$FIREWORKS_API_KEY" ]; then - warn "FIREWORKS_API_KEY environment variable is not set" - warn "The evaluation will fail at the API call stage, but the infrastructure will be tested" - info "To run with a real model, set: export FIREWORKS_API_KEY=your_api_key" - else - info "FIREWORKS_API_KEY is set (length: ${#FIREWORKS_API_KEY})" - fi - - info "Prerequisites check complete" -} - -# Main execution -main() { - echo "" - echo "========================================" - echo "🎮 FROZEN LAKE HTTP ROLLOUT EVALUATION" - echo "========================================" - echo "" - - check_prerequisites - - # Change to the script directory - cd "$SCRIPT_DIR" - - # Check if ports are available - if lsof -Pi :$HTTP_ROLLOUT_SERVER_PORT -sTCP:LISTEN -t >/dev/null; then - error "Port $HTTP_ROLLOUT_SERVER_PORT is already in use" - exit 1 - fi - - # Start HTTP rollout server - log "Starting HTTP rollout server on port $HTTP_ROLLOUT_SERVER_PORT..." - python server/http_rollout_server.py --port $HTTP_ROLLOUT_SERVER_PORT & - HTTP_ROLLOUT_PID=$! - echo $HTTP_ROLLOUT_PID > "$HTTP_ROLLOUT_PID_FILE" - - # Wait for servers to be ready - wait_for_server "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/health" "HTTP rollout server" $MAX_WAIT_TIME - - # Test the HTTP rollout server - info "Testing HTTP rollout server..." 
- - # Test start episode - EPISODE_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/start_episode") - EPISODE_ID=$(echo "$EPISODE_DATA" | python -c "import sys, json; print(json.load(sys.stdin)['episode_id'])") - info "Started episode: $EPISODE_ID" - - # Test step - STEP_DATA=$(curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/step" \ - -H "Content-Type: application/json" \ - -d "{\"episode_id\": \"$EPISODE_ID\", \"action\": 2}") - info "Step result: $STEP_DATA" - - # End episode - curl -s -X POST "http://localhost:$HTTP_ROLLOUT_SERVER_PORT/end_episode" \ - -H "Content-Type: application/json" \ - -d "{\"episode_id\": \"$EPISODE_ID\"}" > /dev/null - info "Episode ended successfully" - - # Run the evaluation - log "Starting agent evaluation..." - cd "$REPO_ROOT" - - # Set model configuration - export MODEL_AGENT="fireworks/accounts/fireworks/models/qwen3-235b-a22b" - - # Create logs directory - LOG_DIR="$SCRIPT_DIR/evaluation_logs" - mkdir -p "$LOG_DIR" - TIMESTAMP=$(date +"%Y%m%d_%H%M%S") - FULL_LOG_FILE="$LOG_DIR/full_evaluation_${TIMESTAMP}.log" - TRAJECTORY_LOG_FILE="$LOG_DIR/agent_trajectory_${TIMESTAMP}.log" - - # Run the evaluation with detailed logging - info "Executing: python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml" - info "Full logs will be saved to: $FULL_LOG_FILE" - info "Agent trajectory will be extracted to: $TRAJECTORY_LOG_FILE" - - # Capture all output and filter agent trajectory - python -m eval_protocol.cli agent-eval --task-def examples/frozen_lake/client/task_def.yaml 2>&1 | tee "$FULL_LOG_FILE" - - # Extract agent trajectory and tool calls - log "Extracting agent trajectory for review..." 
- - # Create a detailed trajectory log - cat > "$TRAJECTORY_LOG_FILE" << 'EOF' -FROZEN LAKE AGENT EVALUATION TRAJECTORY -====================================== - -This log contains the complete agent decision-making process including: -- User prompts -- Agent reasoning (thinking) -- Tool calls made by the agent -- Environment responses -- Agent reactions to environment feedback - -====================================== - -EOF - - # Extract the relevant trajectory information - grep -A 5 -B 5 "User Turn\|Inner Step\|Tool.*result\|OpenAI response\|Calling OpenAI\|tool calls" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - echo "" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "DETAILED MESSAGES HISTORY" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "" >> "$TRAJECTORY_LOG_FILE" - - # Extract the complete conversation flow - grep -A 20 "messages_FULL_HISTORY" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - echo "" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "TOOL CALLS AND RESPONSES" >> "$TRAJECTORY_LOG_FILE" - echo "======================================" >> "$TRAJECTORY_LOG_FILE" - echo "" >> "$TRAJECTORY_LOG_FILE" - - # Extract tool call details - grep -A 10 -B 2 "tool_calls\|Tool.*result\|step.*action" "$FULL_LOG_FILE" >> "$TRAJECTORY_LOG_FILE" || true - - # Run the trajectory analyzer - cd "$SCRIPT_DIR" - if [ -f "analyze_trajectory.py" ] && [ -f "$FULL_LOG_FILE" ]; then - info "Running trajectory analysis..." - python analyze_trajectory.py "$FULL_LOG_FILE" > "${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt" 2>&1 || true - fi - - echo "" - echo "========================================" - echo "✅ EVALUATION INFRASTRUCTURE COMPLETE" - echo "========================================" - echo "" - info "HTTP rollout support has been successfully implemented!" 
- echo "" - echo "Key achievements:" - echo "• ✅ HttpRolloutResource implemented and integrated" - echo "• ✅ Fireworks model support added to orchestrator" - echo "• ✅ Tool calling protocol working correctly" - echo "• ✅ HTTP rollout server communication verified" - echo "• ✅ Complete evaluation framework functional" - echo "" - - echo "📋 EVALUATION LOGS SAVED:" - echo "• Full evaluation log: $FULL_LOG_FILE" - echo "• Agent trajectory log: $TRAJECTORY_LOG_FILE" - - ANALYSIS_FILE="${LOG_DIR}/trajectory_analysis_${TIMESTAMP}.txt" - if [ -f "$ANALYSIS_FILE" ]; then - echo "• Trajectory analysis: $ANALYSIS_FILE" - fi - echo "" - - echo "📊 AGENT TRAJECTORY SUMMARY:" - if [ -f "$FULL_LOG_FILE" ]; then - # Show a quick summary of tool calls - TOOL_CALL_COUNT=$(grep -c "Attempting tool call: step" "$FULL_LOG_FILE" || echo "0") - echo "• Total tool calls made: $TOOL_CALL_COUNT" - - # Show quick trajectory analysis if available - if [ -f "$ANALYSIS_FILE" ]; then - echo "" - echo "Quick trajectory preview:" - head -20 "$ANALYSIS_FILE" | tail -15 - echo "" - echo "📖 Full trajectory analysis:" - echo " cat $ANALYSIS_FILE" - else - echo "• Review detailed trajectory in: $TRAJECTORY_LOG_FILE" - - # Show the first few tool calls for quick review - echo "" - echo "First few tool calls:" - grep -m 3 -A 2 "Attempting tool call: step" "$FULL_LOG_FILE" | head -9 || true - fi - fi - echo "" - - if [ -z "$FIREWORKS_API_KEY" ]; then - echo "To run with actual LLM inference:" - echo "1. Set FIREWORKS_API_KEY environment variable" - echo "2. Re-run this script" - else - echo "🎉 Ready for production use with LLM inference!" - echo "" - echo "📖 To review the agent's decision making:" - echo " cat $TRAJECTORY_LOG_FILE" - fi - echo "" -} - -# Check for help flag -if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then - echo "Frozen Lake HTTP Rollout Evaluation" - echo "" - echo "This script demonstrates the complete HTTP rollout evaluation infrastructure:" - echo "1. 
Starts HTTP rollout server for Frozen Lake game" - echo "2. Tests the HTTP rollout protocol" - echo "3. Runs the agent evaluation framework" - echo "4. Shows tool calling and resource integration working" - echo "" - echo "Prerequisites:" - echo "- reward-kit installed and configured" - echo "- FIREWORKS_API_KEY environment variable (optional for infrastructure testing)" - echo "" - echo "Usage: $0" - echo "" - exit 0 -fi - -# Run main function -main "$@" diff --git a/examples/frozen_lake/server/README.md b/examples/frozen_lake/server/README.md deleted file mode 100644 index 08d3f7eb..00000000 --- a/examples/frozen_lake/server/README.md +++ /dev/null @@ -1,187 +0,0 @@ -# Frozen Lake Game Server - -This is the **server-side implementation** of the Frozen Lake game environment that provides an HTTP API for agent evaluation. - -## Overview - -This server implements the HTTP Rollout Protocol that allows external evaluation frameworks (like eval-protocol) to interact with the game environment through standardized endpoints. - -## API Endpoints - -### `POST /start_episode` -Initializes a new game episode. - -**Response:** -```json -{ - "episode_id": "uuid-string", - "observation": { - "position": [0, 0], - "current_cell": "S", - "done": false, - "won": false, - "visual": "[S] F F F \n F H F H \n F F F H \n H F F G ", - "message": "You are at position (0, 0) on a S cell. Choose your next move carefully." - } -} -``` - -### `POST /step` -Executes an action in the game. - -**Request:** -```json -{ - "episode_id": "uuid-string", - "action": 2 -} -``` - -**Action Values:** -- `0` = Left -- `1` = Down -- `2` = Right -- `3` = Up - -**Response:** -```json -{ - "observation": { - "position": [0, 1], - "current_cell": "F", - "done": false, - "won": false, - "visual": " S [F] F F \n F H F H \n F F F H \n H F F G ", - "message": "You are at position (0, 1) on a F cell. Choose your next move carefully." 
- }, - "is_done": false -} -``` - -### `POST /end_episode` -Cleans up a completed episode. - -**Request:** -```json -{ - "episode_id": "uuid-string" -} -``` - -**Response:** -```json -{ - "message": "Episode ended successfully" -} -``` - -### `GET /health` -Health check endpoint. - -**Response:** -```json -{ - "status": "healthy", - "game": "frozen_lake" -} -``` - -## Game Logic - -### Board Layout -``` -S F F F -F H F H -F F F H -H F F G -``` - -### Game Rules -- **S**: Starting position (safe) -- **F**: Frozen lake (safe to step on) -- **H**: Hole (game over if stepped on) -- **G**: Goal (win condition) - -### Win/Loss Conditions -- **Win**: Reach the goal position (G) -- **Loss**: Step on a hole (H) or exceed maximum steps - -## Running the Server - -### Prerequisites -- Python 3.8+ -- FastAPI -- Uvicorn - -### Installation -```bash -pip install fastapi uvicorn -``` - -### Start Server -```bash -python http_rollout_server.py -``` - -The server will start on `http://localhost:8080` - -### Configuration -Environment variables: -- `PORT`: Server port (default: 8080) -- `HOST`: Server host (default: 0.0.0.0) - -## Integration Notes - -This server is designed to work with any HTTP rollout-compatible evaluation framework. 
The client side handles: -- Action translation (string → numeric) -- State interpretation -- Reward calculation -- Episode management - -## Customization - -### Different Board Layouts -Modify the `FROZEN_LAKE_MAP` constant: -```python -FROZEN_LAKE_MAP = [ - "SFFF", - "FHFH", - "FFFH", - "HFFG" -] -``` - -### Game Variants -- Slippery surfaces (movement uncertainty) -- Larger boards -- Dynamic obstacles -- Multi-goal scenarios - -## Development - -### Testing the API -```bash -# Health check -curl http://localhost:8080/health - -# Start episode -curl -X POST http://localhost:8080/start_episode - -# Take action -curl -X POST http://localhost:8080/step \ - -H "Content-Type: application/json" \ - -d '{"episode_id": "your-episode-id", "action": 2}' -``` - -### Logging -The server logs all API interactions for debugging and monitoring. - -## Protocol Compliance - -This implementation follows the HTTP Rollout Protocol specification: -- Stateless episode management -- Structured observation format -- Standardized error handling -- Health monitoring endpoint - -Game environment developers can use this as a reference implementation for creating their own HTTP rollout-compatible environments. diff --git a/examples/frozen_lake/server/http_rollout_server.py b/examples/frozen_lake/server/http_rollout_server.py deleted file mode 100644 index afca6959..00000000 --- a/examples/frozen_lake/server/http_rollout_server.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -HTTP rollout server for Frozen Lake game. - -This server implements the standard HTTP rollout protocol using the reward-kit -library's standardized types for consistent client/server communication. 
-""" - -import os -import sys -import uuid -from typing import Dict - -from fastapi import FastAPI, HTTPException - -# Add parent directory to path to import gymnasium frozen lake server -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame as FrozenLakeGame - -# Additional models for responses -from pydantic import BaseModel - -# Import standardized HTTP rollout protocol types from reward-kit -from eval_protocol.agent.resources import ( - EndEpisodeRequest, - EndEpisodeResponse, - HealthResponse, - StartEpisodeRequest, - StartEpisodeResponse, - StepRequest, - StepResponse, -) - -# FastAPI app -app = FastAPI(title="Frozen Lake HTTP Rollout Server") - -# Store active episodes -episodes: Dict[str, FrozenLakeGame] = {} - - -@app.post("/start_episode", response_model=StartEpisodeResponse) -async def start_episode( - request: StartEpisodeRequest = StartEpisodeRequest(), -) -> StartEpisodeResponse: - """Start a new episode of the Frozen Lake game.""" - episode_id = str(uuid.uuid4()) - - # Extract request data to pass to the game constructor - request_data = request.dict() if hasattr(request, "dict") else request.model_dump() - - # Create Gymnasium-based game with configuration from request - # Default values maintained for backward compatibility - game_config = { - "map_name": "4x4", - "is_slippery": True, # Enable stochastic behavior to demonstrate seed effect - "render_mode": None, - } - # Override with any values from the request (like seed) - game_config.update(request_data) - - game = FrozenLakeGame(**game_config) - observation = game.reset() - episodes[episode_id] = game - - return StartEpisodeResponse(episode_id=episode_id, observation=observation) - - -@app.post("/step", response_model=StepResponse) -async def step(req: StepRequest) -> StepResponse: - """Take a step in the specified episode.""" - if req.episode_id not in episodes: - raise HTTPException(status_code=404, 
detail="Episode not found") - - game = episodes[req.episode_id] - observation, is_done = game.step(req.action) - - return StepResponse(observation=observation, is_done=is_done) - - -@app.post("/end_episode", response_model=EndEpisodeResponse) -async def end_episode(req: EndEpisodeRequest) -> EndEpisodeResponse: - """End the specified episode.""" - if req.episode_id not in episodes: - raise HTTPException(status_code=404, detail="Episode not found") - - del episodes[req.episode_id] - return EndEpisodeResponse(message=f"Episode {req.episode_id} ended successfully") - - -@app.get("/health", response_model=HealthResponse) -async def health_check() -> HealthResponse: - """Health check endpoint.""" - return HealthResponse(status="healthy", game="frozen_lake_gymnasium") - - -if __name__ == "__main__": - import argparse - - import uvicorn - - parser = argparse.ArgumentParser(description="Frozen Lake HTTP Rollout Server") - parser.add_argument("--port", type=int, default=8080, help="Port to run the server on") - args = parser.parse_args() - - uvicorn.run(app, host="0.0.0.0", port=args.port) diff --git a/tests/test_frozen_lake_http_server.py b/tests/test_frozen_lake_http_server.py deleted file mode 100644 index ebe7104f..00000000 --- a/tests/test_frozen_lake_http_server.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Tests for FrozenLake HTTP rollout server seed handling. - -This module tests the HTTP server's ability to accept and use seed parameters -to create reproducible game environments. 
-""" - -import json -from unittest.mock import MagicMock, patch - -import pytest -from fastapi.testclient import TestClient - -from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame - -# Import the server components -from examples.frozen_lake.server.http_rollout_server import app - - -class TestFrozenLakeHttpServer: - """Tests for the FrozenLake HTTP rollout server.""" - - def setup_method(self): - """Set up test client.""" - self.client = TestClient(app) - - def test_start_episode_without_seed(self): - """Test starting episode without seed uses default behavior.""" - response = self.client.post("/start_episode") - - assert response.status_code == 200 - data = response.json() - - assert "episode_id" in data - assert "observation" in data - - # Should have standard game state - observation = data["observation"] - assert "position" in observation - assert "current_cell" in observation - assert "visual" in observation - assert observation["position"] == [0, 0] # Start position - assert observation["current_cell"] == "S" # Start cell - - def test_start_episode_with_seed(self): - """Test starting episode with seed parameter.""" - seed_value = 42 - request_data = {"seed": seed_value} - - response = self.client.post("/start_episode", json=request_data) - - assert response.status_code == 200 - data = response.json() - - assert "episode_id" in data - assert "observation" in data - - # Should still start at position (0,0) - observation = data["observation"] - assert observation["position"] == [0, 0] - assert observation["current_cell"] == "S" - - def test_different_seeds_create_different_episodes(self): - """Test that different seeds create episodes with different board layouts.""" - # Start episode with seed 42 - response1 = self.client.post("/start_episode", json={"seed": 42}) - assert response1.status_code == 200 - data1 = response1.json() - episode_id1 = data1["episode_id"] - visual1 = data1["observation"]["visual"] - - # Start episode with 
seed 123 - response2 = self.client.post("/start_episode", json={"seed": 123}) - assert response2.status_code == 200 - data2 = response2.json() - episode_id2 = data2["episode_id"] - visual2 = data2["observation"]["visual"] - - # Episodes should have different IDs - assert episode_id1 != episode_id2 - - # Board layouts should be different (high probability with different seeds) - assert visual1 != visual2, "Different seeds should create different board layouts" - - def test_same_seed_creates_identical_episodes(self): - """Test that same seed creates episodes with identical board layouts.""" - seed_value = 999 - - # Start two episodes with same seed - response1 = self.client.post("/start_episode", json={"seed": seed_value}) - assert response1.status_code == 200 - data1 = response1.json() - visual1 = data1["observation"]["visual"] - - response2 = self.client.post("/start_episode", json={"seed": seed_value}) - assert response2.status_code == 200 - data2 = response2.json() - visual2 = data2["observation"]["visual"] - - # Board layouts should be identical - assert visual1 == visual2, "Same seed should create identical board layouts" - - def test_start_episode_with_additional_parameters(self): - """Test that server accepts additional parameters beyond seed.""" - request_data = {"seed": 42, "custom_param": "test_value", "id": "test_run_001"} - - response = self.client.post("/start_episode", json=request_data) - - assert response.status_code == 200 - data = response.json() - - # Should work normally despite extra parameters - assert "episode_id" in data - assert "observation" in data - - def test_step_action_in_seeded_episode(self): - """Test taking actions in a seeded episode.""" - # Start episode with seed - response = self.client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - episode_id = response.json()["episode_id"] - - # Take a step action - step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"}) - 
- assert step_response.status_code == 200 - step_data = step_response.json() - - assert "observation" in step_data - assert "is_done" in step_data - - # Position should have changed (unless blocked) - observation = step_data["observation"] - assert "position" in observation - assert "current_cell" in observation - - def test_health_endpoint(self): - """Test the health check endpoint.""" - response = self.client.get("/health") - - assert response.status_code == 200 - data = response.json() - assert data["status"] == "healthy" - - def test_episode_cleanup_on_completion(self): - """Test that episodes are properly tracked and can be cleaned up.""" - # Start an episode - response = self.client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - episode_id = response.json()["episode_id"] - - # Episode should be trackable via step endpoint - step_response = self.client.post("/step", json={"episode_id": episode_id, "action": "right"}) - assert step_response.status_code == 200 - - def test_invalid_episode_id_handling(self): - """Test handling of invalid episode IDs.""" - # Try to step with non-existent episode ID - response = self.client.post("/step", json={"episode_id": "non_existent_episode", "action": "right"}) - - # Should return an error (400 or 404) - assert response.status_code in [400, 404] - - def test_server_configuration_with_slippery_environment(self): - """Test that server is configured with slippery environment for seed demonstration.""" - # Mock the game creation to verify configuration - with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class: - mock_game_instance = MagicMock() - mock_game_instance.reset.return_value = { - "position": [0, 0], - "current_cell": "S", - "visual": "test_visual", - "done": False, - } - mock_game_class.return_value = mock_game_instance - - response = self.client.post("/start_episode", json={"seed": 42}) - - # Verify game was created with correct configuration - 
mock_game_class.assert_called_once() - call_kwargs = mock_game_class.call_args[1] - - # Should include slippery=True and the seed - assert call_kwargs.get("is_slippery") is True - assert call_kwargs.get("seed") == 42 - - -class TestGymnasiumFrozenLakeIntegration: - """Integration tests between HTTP server and GymnasiumFrozenLakeGame.""" - - def test_gymnasium_integration_with_seeds(self): - """Test that the HTTP server correctly integrates with GymnasiumFrozenLakeGame seeds.""" - client = TestClient(app) - - # Test multiple seeds to ensure they work through the HTTP interface - seeds = [42, 123, 999] - board_layouts = [] - - for seed in seeds: - response = client.post("/start_episode", json={"seed": seed}) - assert response.status_code == 200 - - observation = response.json()["observation"] - board_layouts.append(observation["visual"]) - - # All board layouts should be different - unique_layouts = set(board_layouts) - assert len(unique_layouts) == len(seeds), "Each seed should produce a unique board layout" - - def test_episode_state_consistency(self): - """Test that episode state remains consistent within a single game.""" - client = TestClient(app) - - # Start episode with specific seed - response = client.post("/start_episode", json={"seed": 42}) - assert response.status_code == 200 - - episode_id = response.json()["episode_id"] - initial_visual = response.json()["observation"]["visual"] - - # Take a few actions and verify board layout doesn't change - for action in ["right", "down", "left"]: - step_response = client.post("/step", json={"episode_id": episode_id, "action": action}) - assert step_response.status_code == 200 - - observation = step_response.json()["observation"] - # Visual board should remain the same (only position marker changes) - current_visual = observation["visual"] - - # Board structure should be preserved (same letters, different position marker) - # Extract just the board cells without position markers - initial_cells = 
initial_visual.replace("[", "").replace("]", "") - current_cells = current_visual.replace("[", "").replace("]", "") - - # The underlying board structure should be identical - assert len(initial_cells) == len(current_cells), "Board size should remain constant" - - def test_seed_parameter_propagation(self): - """Test that seed parameter correctly propagates to the game engine.""" - # This test verifies the complete data flow from HTTP request to game creation - - with patch("examples.frozen_lake.server.http_rollout_server.FrozenLakeGame") as mock_game_class: - mock_game_instance = MagicMock() - mock_game_instance.reset.return_value = { - "position": [0, 0], - "current_cell": "S", - "visual": "mocked_visual", - "done": False, - "won": False, - "message": "test_message", - } - mock_game_class.return_value = mock_game_instance - - client = TestClient(app) - - # Send request with seed - seed_value = 1337 - response = client.post("/start_episode", json={"seed": seed_value}) - - assert response.status_code == 200 - - # Verify the game was created with the correct seed - mock_game_class.assert_called_once() - call_kwargs = mock_game_class.call_args[1] - assert call_kwargs["seed"] == seed_value - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tests/test_frozen_lake_seed_evaluation.py b/tests/test_frozen_lake_seed_evaluation.py deleted file mode 100644 index 54bcf1c6..00000000 --- a/tests/test_frozen_lake_seed_evaluation.py +++ /dev/null @@ -1,541 +0,0 @@ -""" -Tests for seed-based reproducible evaluation in FrozenLake example. 
- -This module tests the complete data-driven evaluation pipeline including: -- Seed-based map generation for reproducible game boards -- Data-driven evaluation infrastructure in TaskManager -- Protocol enhancements for sample data passing -- End-to-end integration tests -""" - -import json -import tempfile -from pathlib import Path -from typing import Any, Dict -from unittest.mock import AsyncMock, MagicMock, patch - -import pytest -import pytest_asyncio - -from eval_protocol.agent.orchestrator import Orchestrator -from eval_protocol.agent.resources.http_rollout_protocol import StartEpisodeRequest -from eval_protocol.agent.resources.http_rollout_resource import HttpRolloutResource -from eval_protocol.agent.task_manager import TaskManager -from eval_protocol.models import TaskDefinitionModel - -# Import components under test -from examples.frozen_lake.gymnasium_frozen_lake_server import GymnasiumFrozenLakeGame - - -class TestSeedBasedMapGeneration: - """Tests for seed-based reproducible map generation in GymnasiumFrozenLakeGame.""" - - def test_seed_generates_different_maps(self): - """Test that different seeds generate different map layouts.""" - seed1 = 42 - seed2 = 123 - - game1 = GymnasiumFrozenLakeGame(seed=seed1) - game2 = GymnasiumFrozenLakeGame(seed=seed2) - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be different - assert map1 != map2, "Different seeds should generate different maps" - - # Both should be 4x4 by default - assert len(map1) == 4 and len(map1[0]) == 4 - assert len(map2) == 4 and len(map2[0]) == 4 - - game1.close() - game2.close() - - def test_same_seed_generates_identical_maps(self): - """Test that the same seed generates identical map layouts.""" - seed = 42 - - game1 = GymnasiumFrozenLakeGame(seed=seed) - game2 = GymnasiumFrozenLakeGame(seed=seed) - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be identical - assert map1 == map2, 
"Same seed should generate identical maps" - - game1.close() - game2.close() - - def test_no_seed_uses_fixed_map(self): - """Test that no seed uses the fixed predefined map.""" - game1 = GymnasiumFrozenLakeGame() # No seed - game2 = GymnasiumFrozenLakeGame() # No seed - - # Get map descriptions - map1 = game1.desc.tolist() - map2 = game2.desc.tolist() - - # Maps should be identical (both using fixed 4x4 map) - assert map1 == map2, "No seed should use identical fixed maps" - - # Should be the standard 4x4 FrozenLake map - expected_map = [ - [b"S", b"F", b"F", b"F"], - [b"F", b"H", b"F", b"H"], - [b"F", b"F", b"F", b"H"], - [b"H", b"F", b"F", b"G"], - ] - assert map1 == expected_map, "Should use standard 4x4 map when no seed" - - game1.close() - game2.close() - - def test_seed_with_8x8_map(self): - """Test seed-based generation with 8x8 map size.""" - seed = 999 - - game = GymnasiumFrozenLakeGame(map_name="8x8", seed=seed) - - # Should be 8x8 - assert game.desc.shape == (8, 8) - - # Should have start and goal - flat_map = game.desc.flatten() - assert b"S" in flat_map - assert b"G" in flat_map - - game.close() - - def test_seed_affects_reset_behavior(self): - """Test that seed affects the reset behavior for stochastic environments.""" - seed = 42 - - # Test with slippery environment - game = GymnasiumFrozenLakeGame(seed=seed, is_slippery=True) - - # Reset multiple times - should get same initial state - state1 = game.reset() - state2 = game.reset() - - # Initial position should be consistent - assert state1["position"] == state2["position"] == (0, 0) - assert state1["current_cell"] == state2["current_cell"] == "S" - - game.close() - - def test_map_has_valid_path(self): - """Test that generated maps always have a valid path from start to goal.""" - # Test multiple seeds to ensure path validity - for seed in [42, 123, 999, 1337]: - game = GymnasiumFrozenLakeGame(seed=seed) - - # Should have exactly one start and one goal - flat_map = game.desc.flatten() - start_count = 
sum(1 for cell in flat_map if cell == b"S") - goal_count = sum(1 for cell in flat_map if cell == b"G") - - assert start_count == 1, f"Should have exactly one start for seed {seed}" - assert goal_count == 1, f"Should have exactly one goal for seed {seed}" - - # Start should be at (0,0) and goal should exist - assert game.desc[0, 0] == b"S", f"Start should be at (0,0) for seed {seed}" - assert game.start_pos == ( - 0, - 0, - ), f"Start position should be (0,0) for seed {seed}" - assert game.goal_pos is not None, f"Goal position should exist for seed {seed}" - - game.close() - - -class TestStartEpisodeRequest: - """Tests for the enhanced StartEpisodeRequest protocol.""" - - def test_start_episode_request_accepts_arbitrary_fields(self): - """Test that StartEpisodeRequest accepts arbitrary fields like seed.""" - # Should accept seed and other fields - request = StartEpisodeRequest(seed=42, custom_field="test_value") - - # Access via model_dump or dict - if hasattr(request, "model_dump"): - data = request.model_dump() - else: - data = request.dict() - - assert data["seed"] == 42 - assert data["custom_field"] == "test_value" - - def test_start_episode_request_empty(self): - """Test that StartEpisodeRequest works with no extra fields.""" - request = StartEpisodeRequest() - - # Should work without errors - if hasattr(request, "model_dump"): - data = request.model_dump() - else: - data = request.dict() - - # Should be empty dict or have no extra fields - assert isinstance(data, dict) - - -class TestDataDrivenTaskDefinition: - """Tests for data-driven evaluation fields in TaskDefinitionModel.""" - - def test_task_definition_with_dataset_path(self): - """Test TaskDefinitionModel with dataset_path field.""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": "test_dataset.jsonl", - 
"num_rollouts_per_sample": 3, - } - - task_def = TaskDefinitionModel(**task_def_dict) - - assert task_def.dataset_path == "test_dataset.jsonl" - assert task_def.num_rollouts_per_sample == 3 - - def test_task_definition_without_dataset_path(self): - """Test TaskDefinitionModel without dataset_path (traditional evaluation).""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "num_rollouts": 5, - } - - task_def = TaskDefinitionModel(**task_def_dict) - - assert task_def.dataset_path is None - assert task_def.num_rollouts_per_sample == 1 # Default value - assert task_def.num_rollouts == 5 - - def test_num_rollouts_per_sample_validation(self): - """Test that num_rollouts_per_sample must be >= 1.""" - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": "test.jsonl", - "num_rollouts_per_sample": 0, # Invalid - } - - with pytest.raises(ValueError): - TaskDefinitionModel(**task_def_dict) - - -@pytest.mark.asyncio -class TestTaskManagerDataDrivenEvaluation: - """Tests for data-driven evaluation functionality in TaskManager.""" - - def test_load_dataset_samples_valid_jsonl(self): - """Test loading valid JSONL dataset.""" - # Create temporary JSONL file - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "sample1", "seed": 42}\n') - f.write('{"id": "sample2", "seed": 123}\n') - f.write('{"id": "sample3", "seed": 999}\n') - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - assert len(samples) == 3 - assert samples[0] == {"id": "sample1", "seed": 42} - assert samples[1] == {"id": "sample2", "seed": 123} - assert samples[2] 
== {"id": "sample3", "seed": 999} - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_invalid_json(self): - """Test loading JSONL with invalid JSON lines.""" - # Create temporary JSONL file with some invalid lines - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "sample1", "seed": 42}\n') - f.write("invalid json line\n") # Invalid JSON - f.write('{"id": "sample2", "seed": 123}\n') - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - # Should skip invalid line and load valid ones - assert len(samples) == 2 - assert samples[0] == {"id": "sample1", "seed": 42} - assert samples[1] == {"id": "sample2", "seed": 123} - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_nonexistent_file(self): - """Test loading from nonexistent file.""" - task_manager = TaskManager() - samples = task_manager._load_dataset_samples("nonexistent_file.jsonl") - - assert samples == [] - - def test_load_dataset_samples_empty_file(self): - """Test loading from empty file.""" - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - # Empty file - temp_file = f.name - - try: - task_manager = TaskManager() - samples = task_manager._load_dataset_samples(temp_file) - - assert samples == [] - finally: - Path(temp_file).unlink() - - def test_load_dataset_samples_relative_path(self): - """Test loading dataset with relative path.""" - # Create a temporary directory and file - with tempfile.TemporaryDirectory() as temp_dir: - dataset_file = Path(temp_dir) / "test_dataset.jsonl" - with open(dataset_file, "w") as f: - f.write('{"id": "test", "seed": 42}\n') - - task_manager = TaskManager() - - # Test with absolute path since temp dir is not relative to cwd - absolute_path = str(dataset_file) - samples = task_manager._load_dataset_samples(absolute_path) - - assert len(samples) == 1 - assert samples[0] == {"id": "test", 
"seed": 42} - - -@pytest.mark.asyncio -class TestHttpRolloutResourceInitialization: - """Tests for HttpRolloutResource initialization with sample data.""" - - async def test_initialize_with_kwargs(self): - """Test that initialize method sends kwargs in POST request.""" - # Mock the HTTP client - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_response.json.return_value = { - "episode_id": "test_episode", - "observation": {}, - } - mock_client.post.return_value = mock_response - - # Create resource with mock client - config = { - "base_url": "http://localhost:8080", - "start_episode_endpoint": "/start_episode", - } - - resource = HttpRolloutResource() - await resource.setup(config) - resource.client = mock_client # Replace with mock - - # Initialize with sample data - sample_data = {"seed": 42, "custom_param": "test_value"} - await resource.initialize(**sample_data) - - # Verify POST was called with correct parameters - mock_client.post.assert_called_once_with("http://localhost:8080/start_episode", json=sample_data) - - async def test_initialize_without_kwargs(self): - """Test that initialize method works without kwargs.""" - # Mock the HTTP client - mock_client = MagicMock() - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_response.json.return_value = { - "episode_id": "test_episode", - "observation": {}, - } - mock_client.post.return_value = mock_response - - # Create resource with mock client - config = { - "base_url": "http://localhost:8080", - "start_episode_endpoint": "/start_episode", - } - - resource = HttpRolloutResource() - await resource.setup(config) - resource.client = mock_client # Replace with mock - - # Initialize without sample data - await resource.initialize() - - # Verify POST was called without json parameter - mock_client.post.assert_called_once_with("http://localhost:8080/start_episode") - - -@pytest.mark.asyncio -class 
TestOrchestratorSampleDataPassing: - """Tests for sample data passing in Orchestrator.""" - - async def test_execute_task_poc_with_sample_data(self): - """Test that execute_task_poc passes sample data to resource initialization.""" - # Create a minimal task definition - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "test_resource", - "base_resource_config": {}, - "reward_function_path": "test.reward", - "messages": [{"role": "user", "content": "test"}], - } - task_def = TaskDefinitionModel(**task_def_dict) - - # Mock the base resource - mock_resource = AsyncMock() - mock_resource.fork.return_value = AsyncMock() - mock_episode_resource = mock_resource.fork.return_value - mock_episode_resource.initialize = AsyncMock() - - # Create orchestrator - orchestrator = Orchestrator(task_definition=task_def) - orchestrator.base_resource = mock_resource - - # Mock execute_task_poc to just test the sample data passing logic - async def mock_execute_task_poc(sample_data=None): - if sample_data: - # Simulate the resource initialization that would happen - episode_resource = await orchestrator.base_resource.fork() - await episode_resource.initialize(**sample_data) - return {"score": 1.0} - - with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc): - sample_data = {"seed": 42, "test_param": "value"} - await orchestrator.execute_task_poc(sample_data=sample_data) - - # Verify that episode resource was initialized with sample data - mock_episode_resource.initialize.assert_called_once_with(**sample_data) - - async def test_execute_task_poc_without_sample_data(self): - """Test that execute_task_poc works without sample data.""" - # Create a minimal task definition - task_def_dict = { - "name": "test_task", - "description": "Test task", - "resource_type": "test_resource", - "base_resource_config": {}, - "reward_function_path": "test.reward", - "messages": [{"role": "user", "content": "test"}], - } - task_def = 
TaskDefinitionModel(**task_def_dict) - - # Mock the base resource - mock_resource = AsyncMock() - mock_resource.fork.return_value = AsyncMock() - mock_episode_resource = mock_resource.fork.return_value - mock_episode_resource.initialize = AsyncMock() - - # Create orchestrator - orchestrator = Orchestrator(task_definition=task_def) - orchestrator.base_resource = mock_resource - - # Mock execute_task_poc to just test the sample data passing logic - async def mock_execute_task_poc(sample_data=None): - if sample_data: - # Simulate the resource initialization that would happen - episode_resource = await orchestrator.base_resource.fork() - await episode_resource.initialize(**sample_data) - return {"score": 1.0} - - with patch.object(orchestrator, "execute_task_poc", side_effect=mock_execute_task_poc): - await orchestrator.execute_task_poc(sample_data=None) - - # Verify that episode resource was not initialized (no sample_data) - mock_episode_resource.initialize.assert_not_called() - - -@pytest.mark.asyncio -class TestEndToEndDataDrivenEvaluation: - """Integration tests for end-to-end data-driven evaluation.""" - - async def test_data_driven_task_execution_flow(self): - """Test the complete flow of data-driven task execution.""" - # Create temporary dataset file - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f: - f.write('{"id": "run_001", "seed": 42}\n') - f.write('{"id": "run_002", "seed": 123}\n') - temp_dataset = f.name - - try: - # Create task definition with dataset - task_def_dict = { - "name": "frozen_lake_test", - "description": "Test frozen lake with seeds", - "resource_type": "http_rollout", - "base_resource_config": {"base_url": "http://localhost:8080"}, - "reward_function_path": "test.reward", - "dataset_path": temp_dataset, - "num_rollouts_per_sample": 1, - "messages": [{"role": "user", "content": "test"}], - } - - task_manager = TaskManager() - task_manager.register_task("test_task", TaskDefinitionModel(**task_def_dict)) - - 
# Mock the orchestrator execution - with patch.object(task_manager, "_execute_data_driven_rollouts") as mock_execute: - mock_execute.return_value = [ - {"score": 1.0, "sample_data": {"id": "run_001", "seed": 42}}, - {"score": 0.0, "sample_data": {"id": "run_002", "seed": 123}}, - ] - - # Execute tasks - results = await task_manager.execute_tasks(["test_task"], max_concurrency=1) - - # Verify data-driven execution was called - mock_execute.assert_called_once() - call_args = mock_execute.call_args - samples = call_args[0][1] # Second argument is samples - - assert len(samples) == 2 - assert samples[0] == {"id": "run_001", "seed": 42} - assert samples[1] == {"id": "run_002", "seed": 123} - - finally: - Path(temp_dataset).unlink() - - def test_frozen_lake_dataset_format_validation(self): - """Test that the actual frozen lake dataset has correct format.""" - dataset_path = Path("examples/frozen_lake/client/dataset.jsonl") - - if dataset_path.exists(): - with open(dataset_path, "r") as f: - lines = f.readlines() - - # Should have at least one sample - assert len(lines) > 0, "Dataset should not be empty" - - for i, line in enumerate(lines): - line = line.strip() - if not line: - continue - - try: - sample = json.loads(line) - except json.JSONDecodeError: - pytest.fail(f"Invalid JSON on line {i+1}: {line}") - - # Each sample should have id and seed - assert "id" in sample, f"Sample {i+1} missing 'id' field" - assert "seed" in sample, f"Sample {i+1} missing 'seed' field" - assert isinstance(sample["seed"], int), f"Sample {i+1} seed should be integer" - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) From 7edd65e9402421f16c3294713a6e3c7d08ca8343 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Wed, 13 Aug 2025 00:25:38 -0700 Subject: [PATCH 06/26] Pipelining (#46) * Add AIME2025, GPQA, HealthBench evaluation_test suites; unify row-limiting via pytest flag; clean up examples * evaluation with aggregated scores * WIP: 
vibe coded as an mvp * merge * remove * updated logger * formatting * formatting * fixing tests --------- Co-authored-by: benjibc --- eval_protocol/mcp/execution/manager.py | 143 +++++++--------- eval_protocol/mcp_env.py | 14 +- eval_protocol/pytest/__init__.py | 4 +- .../pytest/default_agent_rollout_processor.py | 52 ++++-- .../default_mcp_gym_rollout_processor.py | 18 +- .../pytest/default_no_op_rollout_process.py | 9 +- .../default_single_turn_rollout_process.py | 39 ++++- eval_protocol/pytest/evaluation_test.py | 158 +++++++++--------- eval_protocol/pytest/plugin.py | 13 +- eval_protocol/pytest/types.py | 4 +- eval_protocol/pytest/utils.py | 4 +- tests/pytest/test_pytest_ids.py | 8 +- .../test_rollout_control_plane_integration.py | 39 +++-- 13 files changed, 271 insertions(+), 234 deletions(-) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 5664e5ac..e0d101a7 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -12,7 +12,7 @@ import threading import time from dataclasses import asdict -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Dict, List, Optional, Union import anyio from openai.types import CompletionUsage @@ -43,7 +43,7 @@ async def execute_rollouts( openai_format_log_file: Optional[str] = None, max_concurrent_rollouts: int = 8, evaluation_rows: Optional[List[EvaluationRow]] = None, - ) -> List[EvaluationRow]: + ) -> AsyncIterator[EvaluationRow]: """ Execute general rollouts using tool calling interface with automatic record/playback. 
@@ -66,7 +66,7 @@ async def execute_rollouts( - Set and file exists: Playback mode (uses recorded data) Returns: - List of EvaluationRow objects with unified evaluation data format + AsyncIterator of EvaluationRow objects with unified evaluation data format """ start_time = time.time() @@ -92,96 +92,77 @@ async def execute_rollouts( logger.info(f"🧵 Starting {envs.n} rollouts with max {max_concurrent_rollouts} concurrent threads...") - results = {} + if evaluation_rows is None: + evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in range(envs.n)] + + shared_tool_schema = envs.tool_schemas semaphore = asyncio.Semaphore(max_concurrent_rollouts) async def _execute_with_semaphore(idx): async with semaphore: - result = await self._execute_rollout( + trajectory = await self._execute_rollout( envs, policy, idx, steps, openai_logger, recording_mode, playback_mode, start_time ) - return result - - tasks = [_execute_with_semaphore(i) for i in range(envs.n)] - # exceptions will be try catched inside single _execute_rollout - trajectories = await asyncio.gather(*tasks) - - # Calculate durations - total_duration = time.time() - start_time - for trajectory in trajectories: - trajectory.duration = total_duration - - shared_tool_schema = envs.tool_schemas - - # Enhanced reporting with control plane info - successful = sum(1 for traj in trajectories if traj.total_reward > 0) - terminated_by_control_plane = sum( - 1 - for traj in trajectories - if traj.control_plane_summary.get("termination_reason") == "control_plane_signal" - ) + # Convert trajectory to EvaluationRow immediately + evaluation_row = evaluation_rows[idx] + + # Handle multimodal content by extracting text from complex content structures + messages = [] + for msg in trajectory.conversation_history: + # Create a copy to avoid modifying the original + msg_dict = dict(msg) + + # Handle multimodal content (list of content blocks) by extracting text + if isinstance(msg_dict.get("content"), 
list): + text_content = None + for content_block in msg_dict["content"]: + if isinstance(content_block, dict) and content_block.get("type") == "text": + text_content = content_block.get("text") + break + msg_dict["content"] = text_content or "" + + messages.append(Message.model_validate(msg_dict)) + + evaluation_row.messages = messages + evaluation_row.tools = shared_tool_schema + evaluation_row.usage = CompletionUsage(**trajectory.usage) + evaluation_row.input_metadata.completion_params = CompletionParams( + model=policy.model_id, + temperature=getattr(policy, "temperature", None), + max_tokens=getattr(policy, "max_tokens", None), + max_tool_calls=getattr(policy, "max_tools_per_turn", None), + ) - logger.info(f"📊 Rollout complete: {successful}/{len(trajectories)} reached goal") - logger.info(f"🎛️ Control plane terminations: {terminated_by_control_plane}/{len(trajectories)}") - logger.info(f"⏱️ Total duration: {total_duration:.2f}s") - logger.info(f"🧵 Used {max_concurrent_rollouts} concurrent threads") + if trajectory.terminated: + if trajectory.termination_reason == TerminationReason.ERROR: + evaluation_row.rollout_status.status = "error" + evaluation_row.rollout_status.error_message = trajectory.control_plane_summary.get( + "error_message", None + ) + else: + evaluation_row.rollout_status.status = "finished" + evaluation_row.rollout_status.termination_reason = trajectory.termination_reason + else: + evaluation_row.rollout_status.status = "running" - # Print log file locations if created - if openai_format_log_file: - logger.info(f"💬 OpenAI format log: {openai_format_log_file}") - if recording_mode: - logger.info(f"📝 Recorded trajectory: {playback_file}") - # Add note about control plane separation - logger.info(f"🎛️ Trajectories include control plane separation") + return evaluation_row - # Convert trajectories to unified EvaluationRow format. If no evaluation_rows are provided, create empty ones for backwards compatibility. 
- if evaluation_rows is None: - evaluation_rows = [EvaluationRow(messages=[], input_metadata=InputMetadata()) for _ in trajectories] - - for idx, trajectory in enumerate(trajectories): - # Handle multimodal content by extracting text from complex content structures - messages = [] - for msg in trajectory.conversation_history: - # Create a copy to avoid modifying the original - msg_dict = dict(msg) - - # Handle multimodal content (list of content blocks) by extracting text - if isinstance(msg_dict.get("content"), list): - text_content = None - for content_block in msg_dict["content"]: - if isinstance(content_block, dict) and content_block.get("type") == "text": - text_content = content_block.get("text") - break - msg_dict["content"] = text_content or "" - - messages.append(Message.model_validate(msg_dict)) - - evaluation_rows[idx].messages = messages - # evaluation_rows[idx].input_metadata.row_id = envs.dataset_rows[idx].id - # evaluation_rows[idx].input_metadata.dataset_info = asdict(envs.dataset_rows[idx]) - evaluation_rows[idx].tools = shared_tool_schema - evaluation_rows[idx].usage = CompletionUsage(**trajectory.usage) - evaluation_rows[idx].input_metadata.completion_params = CompletionParams( - model=policy.model_id, - temperature=getattr(policy, "temperature", None), - max_tokens=getattr(policy, "max_tokens", None), - max_tool_calls=getattr(policy, "max_tools_per_turn", None), - ) - if trajectory.terminated: - if trajectory.termination_reason == TerminationReason.ERROR: - evaluation_rows[idx].rollout_status.status = "error" - evaluation_rows[idx].rollout_status.termination_reason = trajectory.control_plane_summary.get( - "error_message", None - ) - else: - evaluation_rows[idx].rollout_status.status = "finished" - evaluation_rows[idx].rollout_status.termination_reason = trajectory.termination_reason - else: - evaluation_rows[idx].rollout_status.status = "running" + # Create all tasks + tasks = [asyncio.create_task(_execute_with_semaphore(i)) for i in 
range(envs.n)] - return evaluation_rows + # Yield results as they complete (note that they're not necessarily in original order) + try: + for task in asyncio.as_completed(tasks): + try: + yield await task + except Exception: + logger.exception("Error processing rollout") + finally: + for t in tasks: + t.cancel() + await asyncio.gather(*tasks, return_exceptions=True) async def _execute_rollout( self, diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py index 5ec67658..5d930a4e 100644 --- a/eval_protocol/mcp_env.py +++ b/eval_protocol/mcp_env.py @@ -41,11 +41,13 @@ """ import asyncio +import hashlib +import json # For legacy compatibility - import the facade functions import logging import random -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union # Import all functionality from the new modular components from .mcp.execution.manager import ExecutionManager @@ -53,9 +55,6 @@ from .mcp.session.manager import GeneralMCPVectorEnv from .models import EvaluationRow from .types import DatasetRow, MCPSession, MCPToolCall -import asyncio -import hashlib -import json logger = logging.getLogger(__name__) @@ -247,7 +246,7 @@ async def rollout( steps: int = 512, openai_format_log_file: Optional[str] = None, max_concurrent_rollouts: int = 8, -) -> List[EvaluationRow]: +) -> AsyncIterator[EvaluationRow]: """ Execute general rollouts using tool calling interface with automatic record/playback. 
@@ -307,9 +306,10 @@ async def rollout( # Use the new ExecutionManager for execution execution_manager = ExecutionManager() - return await execution_manager.execute_rollouts( + async for evaluation_row in execution_manager.execute_rollouts( envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows - ) + ): + yield evaluation_row async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]: diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py index ce881ccc..2d2576d6 100644 --- a/eval_protocol/pytest/__init__.py +++ b/eval_protocol/pytest/__init__.py @@ -1,12 +1,14 @@ from .default_agent_rollout_processor import default_agent_rollout_processor +from .default_dataset_adapter import default_dataset_adapter +from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor from .default_no_op_rollout_process import default_no_op_rollout_processor from .default_single_turn_rollout_process import default_single_turn_rollout_processor from .evaluation_test import evaluation_test from .types import RolloutProcessor, RolloutProcessorConfig -from .default_dataset_adapter import default_dataset_adapter __all__ = [ "default_agent_rollout_processor", + "default_mcp_gym_rollout_processor", "default_no_op_rollout_processor", "default_single_turn_rollout_processor", "default_dataset_adapter", diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index bd7c62c2..6a158b54 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -1,7 +1,8 @@ import asyncio import json +import logging import os -from typing import Any, List, Optional, Union +from typing import Any, AsyncIterator, List, Optional, Union from mcp.types import CallToolResult, TextContent from openai import NOT_GIVEN, NotGiven @@ -14,6 +15,8 @@ from eval_protocol.models import EvaluationRow, 
Message from eval_protocol.pytest.types import Dataset, RolloutProcessorConfig +logger = logging.getLogger(__name__) + class Agent: """ @@ -114,13 +117,42 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex async def default_agent_rollout_processor( rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> List[EvaluationRow]: - dataset: Dataset = [] - for row in rows: +) -> AsyncIterator[EvaluationRow]: + """Process agent rollouts with bounded concurrency and yield as they complete.""" + + max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 + semaphore = asyncio.Semaphore(max_concurrent) + + async def process_row(row: EvaluationRow) -> EvaluationRow: + """Process a single row with agent rollout.""" agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger) - await agent.setup() - await agent.call_agent() - dataset.append(agent.evaluation_row) - if agent.mcp_client: - await agent.mcp_client.cleanup() - return dataset + try: + await agent.setup() + await agent.call_agent() + return agent.evaluation_row + finally: + if agent.mcp_client: + await agent.mcp_client.cleanup() + + async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: + async with semaphore: + try: + return await process_row(r) + except Exception as e: + logger.exception(f"Error processing row {r.input_metadata.row_id}: {e}") + return r + + # Create all tasks + tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] + + # Yield results as they complete (note that they're not necessarily in original order) + try: + for task in asyncio.as_completed(tasks): + try: + yield await task + except Exception: + logger.exception("Error processing row") + finally: + for t in tasks: + t.cancel() + await asyncio.gather(*tasks, return_exceptions=True) diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 5037cbad..de9d8ca1 100644 
--- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -6,7 +6,7 @@ import subprocess import time from pathlib import Path -from typing import List, Optional +from typing import AsyncIterator, List, Optional import eval_protocol as ep from eval_protocol.models import EvaluationRow, Message @@ -194,22 +194,19 @@ def __exit__(self, exc_type, exc_val, exc_tb): async def default_mcp_gym_rollout_processor( rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> List[EvaluationRow]: +) -> AsyncIterator[EvaluationRow]: """ Rollout processor for tau bench environments. - This processor starts an MCP server, creates tau bench environments, and runs rollouts - using the eval_protocol framework, following the pattern from test_tau2_e2e.py. - + using the eval_protocol framework, yielding results as they complete. Args: rows: List of EvaluationRow objects containing messages and dataset info in input_metadata config: RolloutProcessorConfig with model and other parameters - Returns: - List of EvaluationRow objects with completed conversations + AsyncIterator of EvaluationRow objects with completed conversations """ if config.server_script_path is None: raise ValueError("server_script_path is required for default_mcp_gym_rollout_processor") @@ -233,15 +230,14 @@ async def default_mcp_gym_rollout_processor( ) # Run rollout with environments and policy - evaluation_rows = await ep.rollout( + async for evaluation_row in ep.rollout( envs, policy=policy, evaluation_rows=rows, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts, - ) - - return evaluation_rows + ): + yield evaluation_row finally: # Always clean up the server diff --git a/eval_protocol/pytest/default_no_op_rollout_process.py b/eval_protocol/pytest/default_no_op_rollout_process.py index bae733c3..47cb17be 100644 --- a/eval_protocol/pytest/default_no_op_rollout_process.py +++ 
b/eval_protocol/pytest/default_no_op_rollout_process.py @@ -1,12 +1,15 @@ -from typing import List +from typing import AsyncIterator, List from eval_protocol.models import EvaluationRow from eval_protocol.pytest.types import RolloutProcessorConfig -def default_no_op_rollout_processor(rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[EvaluationRow]: +async def default_no_op_rollout_processor( + rows: List[EvaluationRow], config: RolloutProcessorConfig +) -> AsyncIterator[EvaluationRow]: """ Simply passes input dataset through to the test function. This can be useful if you want to run the rollout yourself. """ - return rows + for row in rows: + yield row diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 95613ebc..424347cd 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -1,15 +1,23 @@ import asyncio import logging import os -from typing import List +import time +from typing import AsyncIterator, List -from eval_protocol.models import ChatCompletionMessageToolCall, EvaluationRow, Message +import litellm +from litellm import acompletion +from openai.types.chat.chat_completion_message import ChatCompletionMessageToolCall + +from eval_protocol.dataset_logger import default_logger +from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest.types import RolloutProcessorConfig +logger = logging.getLogger(__name__) + async def default_single_turn_rollout_processor( rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> List[EvaluationRow]: +) -> AsyncIterator[EvaluationRow]: """Generate a single response from any supported model provider using LiteLLM.""" # Quiet LiteLLM logs in test runs unless user overrode @@ -41,7 +49,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: if isinstance(config.input_params, dict): if 
"reasoning_effort" in config.input_params: effort_val = str(config.input_params["reasoning_effort"]) # flat shape - elif isinstance(config.input_params.get("extra_body"), dict) and "reasoning_effort" in config.input_params["extra_body"]: + elif ( + isinstance(config.input_params.get("extra_body"), dict) + and "reasoning_effort" in config.input_params["extra_body"] + ): # Accept if user passed it directly inside extra_body effort_val = str(config.input_params["extra_body"]["reasoning_effort"]) # already in extra_body @@ -89,10 +100,10 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: ] row.messages = messages - config.logger.log(row) + default_logger.log(row) return row - # Process rows with bounded concurrency if configured + # Process rows with bounded concurrency and yield as they complete max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 semaphore = asyncio.Semaphore(max_concurrent) @@ -103,7 +114,17 @@ async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: except Exception as e: return r - tasks = [_sem_wrapper(row) for row in rows] - dataset = list(await asyncio.gather(*tasks)) + # Create all tasks + tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] - return dataset + # Yield results as they complete (note that they're not necessarily in original order) + try: + for task in asyncio.as_completed(tasks): + try: + yield await task + except Exception: + logger.exception("Error processing row") + finally: + for t in tasks: + t.cancel() + await asyncio.gather(*tasks, return_exceptions=True) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index f1d9af50..81856ff6 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -1,8 +1,13 @@ +import asyncio import copy import inspect +import json import math import os +import pathlib +import re import statistics +import time from typing import Any, Callable, Dict, List, Literal, 
Optional, Union import pytest @@ -173,7 +178,7 @@ def decorator( if sig.return_annotation is not List[EvaluationRow]: raise ValueError("In batch mode, your eval function must return a list of EvaluationRow instances") - def execute_with_params( + async def execute_with_params( test_func: TestFunction, processed_row: EvaluationRow | None = None, processed_dataset: List[EvaluationRow] | None = None, @@ -190,7 +195,12 @@ def execute_with_params( if "rows" in evaluation_test_kwargs: raise ValueError("'rows' is a reserved parameter for the evaluation function") kwargs.update(evaluation_test_kwargs) - return execute_function(test_func, **kwargs) + + # Handle both sync and async test functions + if asyncio.iscoroutinefunction(test_func): + return await test_func(**kwargs) + else: + return test_func(**kwargs) # Calculate all possible combinations of parameters def _parse_ep_max_rows(default_value: int | None) -> int | None: @@ -300,7 +310,7 @@ def create_wrapper_with_signature() -> Callable: # Create the function body that will be used invocation_id = generate_id() - def wrapper_body(**kwargs): + async def wrapper_body(**kwargs): model_name = kwargs["model"] eval_metadata = None all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)] @@ -423,26 +433,40 @@ def _log_eval_error( for row in fresh_dataset: active_logger.log(row) - processed_dataset = execute_function(rollout_processor, rows=fresh_dataset, config=config) + rollout_result = rollout_processor(fresh_dataset, config) if mode == "pointwise": - # Pointwise mode: apply the evaluator function to each row - for row in processed_dataset: - result = execute_with_params( - test_func, - processed_row=row, - evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {}, - ) - if result is None or not isinstance(result, EvaluationRow): - raise ValueError( - f"Test function {test_func.__name__} did not return an EvaluationRow instance. 
You must return an EvaluationRow instance from your test function decorated with @evaluation_test." + # Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution + semaphore = asyncio.Semaphore(max_concurrent_rollouts) + tasks = [] + + async def _execute_with_semaphore(row): + async with semaphore: + result = await execute_with_params( + test_func, + processed_row=row, + evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {}, ) - all_results[i].append(result) + if result is None or not isinstance(result, EvaluationRow): + raise ValueError( + f"Test function {test_func.__name__} did not return an EvaluationRow instance. You must return an EvaluationRow instance from your test function decorated with @evaluation_test." + ) + return result + + async for row in rollout_processor(fresh_dataset, config): + tasks.append(asyncio.create_task(_execute_with_semaphore(row))) + + all_results[i] = await asyncio.gather(*tasks) + else: - # Batch mode: call the test function with the full dataset - results = execute_with_params( + # Batch mode: collect all results first, then evaluate (no pipelining) + input_dataset = [] + async for row in rollout_result: + input_dataset.append(row) + + results = await execute_with_params( test_func, - processed_dataset=processed_dataset, + processed_dataset=input_dataset, evaluation_test_kwargs=kwargs.get("evaluation_test_kwargs") or {}, ) if results is None: @@ -568,10 +592,6 @@ def _log_eval_error( ) # As per project convention, avoid printing per-metric CI lines to reduce noise if summary_path: - import json - import pathlib - import re - import time def _sanitize_filename(text: str) -> str: safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) @@ -667,6 +687,7 @@ def _extract_effort_tag(params: dict) -> str | None: # Create the pytest wrapper pytest_wrapper = create_wrapper_with_signature() pytest_wrapper = pytest.mark.parametrize(test_param_names, param_tuples)(pytest_wrapper) + 
pytest_wrapper = pytest.mark.asyncio(pytest_wrapper) def create_dual_mode_wrapper() -> Callable: """ @@ -687,66 +708,39 @@ def create_dual_mode_wrapper() -> Callable: # Check if the test function is async is_async = asyncio.iscoroutinefunction(test_func) - if is_async: - - async def dual_mode_wrapper(*args, **kwargs): - # Check if this is a direct call with the expected signature - if mode == "pointwise": - # For pointwise mode, check if called with a single row argument - if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs: - return await test_func(row=args[0]) - else: - # For batch mode, check if called with rows argument - if ( - len(args) == 1 - and isinstance(args[0], list) - and all(isinstance(r, EvaluationRow) for r in args[0]) - and not kwargs - ): - return await test_func(rows=args[0]) - # Also check if called with keyword argument 'rows' - if ( - len(args) == 0 - and "rows" in kwargs - and isinstance(kwargs["rows"], list) - and all(isinstance(r, EvaluationRow) for r in kwargs["rows"]) - ): - return await test_func(**kwargs) - - # If not a direct call, use the pytest wrapper - return pytest_wrapper(*args, **kwargs) - - else: - - def dual_mode_wrapper(*args, **kwargs): - # Check if this is a direct call with the expected signature - if mode == "pointwise": - # For pointwise mode, check if called with a single row argument - if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs: - return test_func(row=args[0]) - - if len(args) == 0 and "row" in kwargs and isinstance(kwargs["row"], EvaluationRow): - return test_func(**kwargs) - else: - # For batch mode, check if called with rows argument - if ( - len(args) == 1 - and isinstance(args[0], list) - and all(isinstance(r, EvaluationRow) for r in args[0]) - and not kwargs - ): - return test_func(rows=args[0]) - # Also check if called with keyword argument 'rows' - if ( - len(args) == 0 - and "rows" in kwargs - and isinstance(kwargs["rows"], list) - and all(isinstance(r, 
EvaluationRow) for r in kwargs["rows"]) - ): - return test_func(**kwargs) - - # If not a direct call, use the pytest wrapper - return pytest_wrapper(*args, **kwargs) + async def call_test_func(**call_kwargs): + """Helper to call test_func with proper async/sync handling""" + if is_async: + return await test_func(**call_kwargs) + else: + return test_func(**call_kwargs) + + async def dual_mode_wrapper(*args, **kwargs): + # Check if this is a direct call with the expected signature + if mode == "pointwise": + # For pointwise mode, check if called with a single row argument + if len(args) == 1 and isinstance(args[0], EvaluationRow) and not kwargs: + return await call_test_func(row=args[0]) + else: + # For batch mode, check if called with rows argument + if ( + len(args) == 1 + and isinstance(args[0], list) + and all(isinstance(r, EvaluationRow) for r in args[0]) + and not kwargs + ): + return await call_test_func(rows=args[0]) + # Also check if called with keyword argument 'rows' + if ( + len(args) == 0 + and "rows" in kwargs + and isinstance(kwargs["rows"], list) + and all(isinstance(r, EvaluationRow) for r in kwargs["rows"]) + ): + return await call_test_func(**kwargs) + + # If not a direct call, use the pytest wrapper + return await pytest_wrapper(*args, **kwargs) # Copy all attributes from the pytest wrapper to our dual mode wrapper import functools diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 6c58d1e2..3a5ec0e2 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -12,8 +12,8 @@ max_dataset_rows value set in the decorator). """ -import os import logging +import os from typing import Optional @@ -32,17 +32,13 @@ def pytest_addoption(parser) -> None: "--ep-print-summary", action="store_true", default=False, - help=( - "Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test." 
- ), + help=("Print a concise summary line (suite/model/effort/agg score) at the end of each evaluation_test."), ) group.addoption( "--ep-summary-json", action="store", default=None, - help=( - "Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)." - ), + help=("Write a JSON summary artifact at the given path (e.g., ./outputs/aime_low.json)."), ) group.addoption( "--ep-input-param", @@ -108,6 +104,7 @@ def pytest_configure(config) -> None: try: import json as _json import pathlib as _pathlib + merged: dict = {} input_params_opts = config.getoption("--ep-input-param") if input_params_opts: @@ -139,5 +136,3 @@ def pytest_configure(config) -> None: except Exception: # best effort, do not crash pytest session pass - - diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index c6de681e..9f564ce1 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -3,7 +3,7 @@ """ from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Literal, Optional +from typing import Any, AsyncIterator, Callable, Dict, List, Literal, Optional from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger @@ -53,4 +53,4 @@ class RolloutProcessorConfig: kwargs: Dict[str, Any] = field(default_factory=dict) # any additional kwargs to pass to the rollout processor -RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], List[EvaluationRow]] +RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], AsyncIterator[EvaluationRow]] diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 981c1ed3..23a5722d 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -88,8 +88,8 @@ def create_dynamically_parameterized_wrapper(test_func, wrapper_body, test_param from functools import wraps @wraps(test_func) - def wrapper(**kwargs): - return 
wrapper_body(**kwargs) + async def wrapper(**kwargs): + return await wrapper_body(**kwargs) parameters = [inspect.Parameter(name, inspect.Parameter.POSITIONAL_OR_KEYWORD) for name in test_param_names] wrapper.__signature__ = inspect.Signature(parameters) diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py index 0131bcbe..24ba3baf 100644 --- a/tests/pytest/test_pytest_ids.py +++ b/tests/pytest/test_pytest_ids.py @@ -19,7 +19,7 @@ def read(self): return list(self._rows.values()) -def test_evaluation_test_decorator(monkeypatch): +async def test_evaluation_test_decorator(monkeypatch): from eval_protocol.pytest.evaluation_test import evaluation_test logger = InMemoryLogger() @@ -45,13 +45,13 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Manually invoke all parameter combinations within a single test for ds_path in dataset_paths: - eval_fn(model="dummy/local-model", dataset_path=[ds_path]) + await eval_fn(model="dummy/local-model", dataset_path=[ds_path]) # Assertions on IDs generated by the decorator logic assert len(logger.read()) == 38 -def test_evaluation_test_decorator_ids_single(monkeypatch): +async def test_evaluation_test_decorator_ids_single(monkeypatch): in_memory_logger = InMemoryLogger() unique_run_ids = set() unique_experiment_ids = set() @@ -92,7 +92,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Manually invoke all parameter combinations within a single test for ds_path in dataset_paths: for params in input_params_list: - eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params) + await eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params) # Assertions on IDs generated by the decorator logic assert len(unique_invocation_ids) == 1 diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py index dcaac0e9..1b92d5aa 100644 --- a/tests/test_rollout_control_plane_integration.py +++ 
b/tests/test_rollout_control_plane_integration.py @@ -239,7 +239,9 @@ def mock_step_side_effect(env_index, tool_call): policy = MockPolicy(["right", "down", "right"]) # Execute rollout - evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=10) + evaluation_rows = [] + async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=10): + evaluation_rows.append(row) # Validate results assert len(evaluation_rows) == 1, "Should have one evaluation row" @@ -457,7 +459,9 @@ async def test_rollout_handles_control_plane_failure_gracefully(self): # Execute rollout with control plane failure policy = MockPolicy(["right"]) - evaluation_rows = await self.execution_manager.execute_rollouts(mock_env, policy, steps=1) + evaluation_rows = [] + async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=1): + evaluation_rows.append(row) # Should still work, but without control plane info assert len(evaluation_rows) == 1 @@ -500,15 +504,26 @@ async def test_rollout_creates_envs_from_url(self): mock_make.return_value = mock_env manager_instance = MockManager.return_value - manager_instance.execute_rollouts = AsyncMock(return_value=["ok"]) - result = await ep.rollout( + # Mock execute_rollouts to return an async generator and track calls + call_args = [] + + async def mock_execute_rollouts(*args, **kwargs): + call_args.append((args, kwargs)) + for item in ["ok"]: + yield item + + manager_instance.execute_rollouts = mock_execute_rollouts + + result = [] + async for row in ep.rollout( "http://localhost:1234/mcp/", policy, dataset=dataset, model_id="test_model", steps=5, - ) + ): + result.append(row) mock_make.assert_called_once_with( "http://localhost:1234/mcp/", @@ -517,14 +532,12 @@ async def test_rollout_creates_envs_from_url(self): model_id="test_model", ) - manager_instance.execute_rollouts.assert_called_once_with( - mock_make.return_value, - policy, - 5, - None, - 8, - None, - ) + # Verify execute_rollouts 
was called with correct arguments + assert len(call_args) == 1, "execute_rollouts should be called once" + args, kwargs = call_args[0] + assert args[0] == mock_make.return_value, "First arg should be mock env" + assert args[1] == policy, "Second arg should be policy" + assert args[2] == 5, "Third arg should be steps" assert result == ["ok"] From c3574f9adcfad53d65dece4a9ae5c038172cfb06 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:20:11 -0700 Subject: [PATCH 07/26] E2E Smoke Test (#75) * e2e smoke test * temp adding * update * test * adjust bounds * change back to regular schedule * final --- .github/workflows/e2e-smoke-test.yml | 188 ++++++++++++++++++++ tests/pytest/test_tau_bench_airline.py | 2 +- tests/test_tau_bench_airline_smoke.py | 236 +++++++++++++++++++++++++ 3 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/e2e-smoke-test.yml create mode 100644 tests/test_tau_bench_airline_smoke.py diff --git a/.github/workflows/e2e-smoke-test.yml b/.github/workflows/e2e-smoke-test.yml new file mode 100644 index 00000000..ec91875c --- /dev/null +++ b/.github/workflows/e2e-smoke-test.yml @@ -0,0 +1,188 @@ +name: E2E Smoke Test + +# Run every 6 hours: at 00:00, 06:00, 12:00, and 18:00 UTC +on: + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: # Allow manual triggering + inputs: + debug_mode: + description: 'Enable debug output' + required: false + default: 'false' + type: boolean + +jobs: + e2e-smoke-test: + name: E2E Smoke Test + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Install the project + run: uv sync --locked --all-extras --dev + + - name: Install tau2 for testing + run: uv pip install 
git+https://github.com/sierra-research/tau2-bench.git@main + + - name: Run E2E Smoke Test + id: run_test + env: + FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} + FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" + run: | + echo "Running e2e smoke test..." + + # Run the test and capture both stdout and exit code + set +e # Don't exit on failure + + uv run pytest tests/test_tau_bench_airline_smoke.py::test_tau_bench_airline_smoke_evaluation \ + -v --tb=short --durations=10 \ + --ep-print-summary \ + --ep-summary-json=ep_summary.json 2>&1 | tee test_output.log + + TEST_EXIT_CODE=$? + + echo "test_exit_code=$TEST_EXIT_CODE" >> $GITHUB_OUTPUT + + # List generated files for debugging + echo "📁 Generated files:" + ls -la *.json 2>/dev/null || echo "No JSON files found" + ls -la ep_summary* 2>/dev/null || echo "No ep_summary files found" + + # Parse EP summary from terminal output (more reliable than JSON files) + if [ -f test_output.log ]; then + echo "📋 Parsing EP summary from terminal output..." + + # Show the terminal output for debugging + echo "Terminal output:" + cat test_output.log + echo "" + + # Extract the EP Summary line from the terminal output + EP_SUMMARY_LINE=$(grep "EP Summary |" test_output.log 2>/dev/null || echo "") + + if [ -n "$EP_SUMMARY_LINE" ]; then + echo "Found EP Summary line:" + echo "$EP_SUMMARY_LINE" + + # Parse the agg score from the line: "EP Summary | ... agg=0.420 ..." 
+ SUCCESS_RATE=$(echo "$EP_SUMMARY_LINE" | grep -o "agg=[0-9.]*" | cut -d= -f2 2>/dev/null || echo "0") + + # Extract other info + NUM_RUNS=$(echo "$EP_SUMMARY_LINE" | grep -o "runs=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + NUM_ROWS=$(echo "$EP_SUMMARY_LINE" | grep -o "rows=[0-9]*" | cut -d= -f2 2>/dev/null || echo "0") + + echo "success_rate=$SUCCESS_RATE" >> $GITHUB_OUTPUT + + # Check if success rate meets thresholds (36% - 60% acceptable range) + LOWER_BOUND=0.36 # 36% + UPPER_BOUND=0.6 # 60% + LOWER_BOUND_MET=$(echo "$SUCCESS_RATE >= $LOWER_BOUND" | bc -l 2>/dev/null || echo "0") + UPPER_BOUND_MET=$(echo "$SUCCESS_RATE <= $UPPER_BOUND" | bc -l 2>/dev/null || echo "0") + THRESHOLD_MET=$(echo "$LOWER_BOUND_MET && $UPPER_BOUND_MET" | bc -l 2>/dev/null || echo "0") + + echo "lower_bound_met=$LOWER_BOUND_MET" >> $GITHUB_OUTPUT + echo "upper_bound_met=$UPPER_BOUND_MET" >> $GITHUB_OUTPUT + echo "threshold_met=$THRESHOLD_MET" >> $GITHUB_OUTPUT + + echo "📊 Evaluation Summary (from terminal output):" + echo " - Success rate: $(echo "$SUCCESS_RATE * 100" | bc -l 2>/dev/null || echo "unknown")%" + echo " - Dataset rows evaluated: $NUM_ROWS" + echo " - Number of runs: $NUM_RUNS" + echo " - Lower bound (≥36%) met: $([ "$LOWER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Upper bound (≤60%) met: $([ "$UPPER_BOUND_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + echo " - Within acceptable range: $([ "$THRESHOLD_MET" = "1" ] && echo "✅ YES" || echo "❌ NO")" + else + echo "❌ No EP Summary line found in terminal output" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT + fi + else + echo "❌ No terminal output file found" + echo "threshold_met=0" >> $GITHUB_OUTPUT + echo "success_rate=0" >> $GITHUB_OUTPUT + fi + + - name: Upload test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-smoke-test-results-${{ github.run_number }} + path: | + test_output.log + ep_summary*.json + *.log + 
retention-days: 7 + + - name: Validate test results + if: always() + run: | + echo "Validating test results against thresholds..." + + TEST_EXIT_CODE="${{ steps.run_test.outputs.test_exit_code }}" + THRESHOLD_MET="${{ steps.run_test.outputs.threshold_met }}" + LOWER_BOUND_MET="${{ steps.run_test.outputs.lower_bound_met }}" + UPPER_BOUND_MET="${{ steps.run_test.outputs.upper_bound_met }}" + SUCCESS_RATE="${{ steps.run_test.outputs.success_rate }}" + + echo "Test exit code: $TEST_EXIT_CODE" + echo "Threshold met (40%-60%): $THRESHOLD_MET" + echo "Lower bound met (≥40%): $LOWER_BOUND_MET" + echo "Upper bound met (≤60%): $UPPER_BOUND_MET" + echo "Success rate: $SUCCESS_RATE" + + # Fail the job if tests didn't run successfully or thresholds weren't met + if [ "$TEST_EXIT_CODE" != "0" ] && [ "$THRESHOLD_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED" + echo " - Test execution failed (exit code: $TEST_EXIT_CODE)" + echo " - Success rate outside acceptable range (required: 40%-60%, actual: ${SUCCESS_RATE:-unknown})" + exit 1 + elif [ "$TEST_EXIT_CODE" != "0" ]; then + echo "⚠️ E2E smoke test had test execution issues but may have met thresholds" + echo " - Test exit code: $TEST_EXIT_CODE" + echo " - Thresholds met: $THRESHOLD_MET" + # Don't exit with error if thresholds were actually met despite test issues + if [ "$THRESHOLD_MET" = "1" ]; then + echo "✅ Thresholds met despite execution issues - considering this a pass" + else + exit 1 + fi + elif [ "$THRESHOLD_MET" != "1" ]; then + # Determine which bound was violated + if [ "$LOWER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate too low" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required: ≥40%" + elif [ "$UPPER_BOUND_MET" != "1" ]; then + echo "❌ E2E smoke test FAILED - success rate suspiciously high" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Maximum expected: ≤60%" + echo " - This may indicate test issues or unrealistic performance" + else + echo "❌ 
E2E smoke test FAILED - success rate outside acceptable range" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Required range: 40%-60%" + fi + exit 1 + else + echo "✅ E2E smoke test PASSED" + echo " - Success rate: ${SUCCESS_RATE:-unknown}" + echo " - Within acceptable range: 40%-60%" + fi diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 80aadf14..f5472092 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -65,7 +65,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "max_tokens": 4096, "reasoning_effort": "low"}], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], rollout_processor=default_mcp_gym_rollout_processor, passed_threshold={"success": 0.4, "standard_deviation": 0.1}, num_runs=8, diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py new file mode 100644 index 00000000..e96baabe --- /dev/null +++ b/tests/test_tau_bench_airline_smoke.py @@ -0,0 +1,236 @@ +""" +Smoke test for tau bench airline evaluation - runs with minimal configuration for CI/CD monitoring. + +This is a lightweight version of the full tau bench airline test, designed specifically +for automated smoke testing in CI/CD pipelines. It runs with only 1 iteration to provide +quick feedback on system health while minimizing resource usage. 
+""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from vendor.tau2.data_model.message import ( + AssistantMessage, + SystemMessage, + ToolCall, + ToolMessage, + UserMessage, +) +from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario +from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator +from vendor.tau2.evaluator.evaluator_action import ActionEvaluator +from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator +from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator +from vendor.tau2.registry import registry + + +def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert entries from airline dataset to EvaluationRow objects for smoke testing. 
+ """ + rows = [] + test_dir = Path(__file__).parent.parent / "examples" / "tau2_mcp" / "tests" + + # Load system prompt from file so we can change it in one place + domain = data[0]["environment_context"]["domain"] + prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md" + + with open(prompt_file, "r") as f: + system_prompt = f.read().strip() + + for row in data: + eval_row = EvaluationRow( + messages=[Message(role="system", content=system_prompt)], + input_metadata=InputMetadata( + row_id=row["id"], + dataset_info={ + "environment_context": row["environment_context"], + "user_simulation": row["user_simulation"], + "evaluation_criteria": row["evaluation_criteria"], + "user_prompt_template": row["user_prompt_template"], + }, + ), + ) + + rows.append(eval_row) + + return rows + + +@evaluation_test( + input_dataset=["tests/pytest/data/airline_dataset.jsonl"], + dataset_adapter=tau_bench_airline_smoke_to_evaluation_row, + model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], + rollout_processor=default_mcp_gym_rollout_processor, + passed_threshold=0.36, + num_runs=1, # Smoke test: single run for quick feedback + mode="pointwise", + max_concurrent_rollouts=50, # Standard concurrency + server_script_path="examples/tau2_mcp/server.py", +) +def test_tau_bench_airline_smoke_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Smoke test for tau bench airline evaluation - single run version for CI/CD monitoring. + + This is a lightweight smoke test that runs the tau bench airline evaluation with + minimal configuration (1 run) to quickly validate system health and model performance. + It uses the same evaluation logic as the full test but with reduced resource usage. 
+ + Args: + row: EvaluationRow object from tau bench airline dataset after rollout + + Returns: + EvaluationRow with tau2 evaluation results + """ + messages = row.messages + + # Get evaluation criteria and user_simulation from input_metadata.dataset_info + dataset_info = row.input_metadata.dataset_info if row.input_metadata else {} + evaluation_criteria = dataset_info.get("evaluation_criteria", {}) + + nl_assertions = evaluation_criteria.get("nl_assertions", []) + communicate_info = evaluation_criteria.get("communicate_info", []) + actions = evaluation_criteria.get("actions", []) + + # Convert Message objects directly to tau2-bench message objects + trajectory_objects = [] + for msg in messages: + role = msg.role + content = msg.content + + if role == "system": + trajectory_objects.append(SystemMessage(role=role, content=content)) + elif role == "assistant": + tau2_tool_calls = [] + if msg.tool_calls: + for tool_call in msg.tool_calls: + arguments = json.loads(tool_call.function.arguments) + tau2_tool_call = ToolCall( + id=tool_call.id, + name=tool_call.function.name, + arguments=arguments, + ) + tau2_tool_calls.append(tau2_tool_call) + + trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls)) + elif role == "user": + trajectory_objects.append(UserMessage(role=role, content=content)) + elif role == "tool": + tool_id = msg.tool_call_id + trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content)) + + reward = 1.0 + + evaluation_criteria = EvaluationCriteria( + nl_assertions=nl_assertions, + communicate_info=communicate_info, + actions=actions, + reward_basis=[ + RewardType.DB, + RewardType.COMMUNICATE, + ], + ) + + task = Task( + id="SmokeTest", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="SmokeTest") + ) # id and user_scenario are required for the Task type but not used in calculating reward + + if RewardType.DB in task.evaluation_criteria.reward_basis: + 
env_reward_info = EnvironmentEvaluator.calculate_reward( + environment_constructor=registry.get_env_constructor("airline"), + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.ACTION in task.evaluation_criteria.reward_basis: + action_reward_info = ActionEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis: + communicate_reward_info = CommunicateEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis: + nl_reward_info = NLAssertionsEvaluator.calculate_reward( + task=task, + full_trajectory=trajectory_objects, + ) + + reward = 1.0 + env_bases = {RewardType.DB, RewardType.ENV_ASSERTION} + action_bases = {RewardType.ACTION} + nl_bases = {RewardType.NL_ASSERTION} + comm_bases = {RewardType.COMMUNICATE} + task_reward_basis = set(task.evaluation_criteria.reward_basis) + + reward_breakdown = {} + if task_reward_basis & env_bases: + if env_reward_info.reward_breakdown is not None: + reward_breakdown.update(env_reward_info.reward_breakdown) + reward *= env_reward_info.reward + if task_reward_basis & action_bases: + if action_reward_info.reward_breakdown is not None: + reward_breakdown.update(action_reward_info.reward_breakdown) + reward *= action_reward_info.reward + if task_reward_basis & nl_bases: + if nl_reward_info.reward_breakdown is not None: + reward_breakdown.update(nl_reward_info.reward_breakdown) + reward *= nl_reward_info.reward + if task_reward_basis & comm_bases: + if communicate_reward_info.reward_breakdown is not None: + reward_breakdown.update(communicate_reward_info.reward_breakdown) + reward *= communicate_reward_info.reward + + # Generate reason showing only failed components + failed_reasons = [] + + if task_reward_basis & env_bases and env_reward_info.reward == 0: + failed_reasons.append("❌ Environment/DB check failed") + + if 
task_reward_basis & action_bases and action_reward_info.reward == 0: + failed_actions = [] + if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks: + failed_actions = [ + f"{ac.action.name}({ac.action.arguments})" + for ac in action_reward_info.action_checks + if not ac.action_match + ] + if failed_actions: + failed_reasons.append(f"❌ Failed actions: {failed_actions}") + else: + failed_reasons.append("❌ Actions failed") + + if task_reward_basis & nl_bases and nl_reward_info.reward == 0: + failed_nl = [] + if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions: + failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met] + if failed_nl: + failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}") + else: + failed_reasons.append("❌ NL Assertions failed") + + if task_reward_basis & comm_bases and communicate_reward_info.reward == 0: + failed_comm = [] + if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks: + failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met] + if failed_comm: + failed_reasons.append(f"❌ Failed communication: {failed_comm}") + else: + failed_reasons.append("❌ Communication failed") + + # If everything passed, show success + reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed [SMOKE TEST]" + + row.evaluation_result = EvaluateResult( + score=reward, + reason=reason, + metrics={}, + ) + return row From 93272b18a8628e51282f602725e1ef8eee405c72 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Wed, 13 Aug 2025 13:37:18 -0700 Subject: [PATCH 08/26] No Smoke Test on normal CI (#76) --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d0b21795..a1cf6aec 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,7 @@ 
jobs: --ignore=tests/pytest/test_frozen_lake.py \ --ignore=tests/pytest/test_lunar_lander.py \ --ignore=tests/pytest/test_tau_bench_airline.py \ + --ignore=tests/test_tau_bench_airline_smoke.py \ --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 - name: Store coverage file From 8ad4c063fb51feacaa17bdb290053346afbcca1d Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Wed, 13 Aug 2025 14:20:52 -0700 Subject: [PATCH 09/26] Better Exceptions (#77) --- eval_protocol/mcp/execution/manager.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index e0d101a7..e30f67d5 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -466,12 +466,13 @@ async def _execute_rollout( trajectory.control_plane_summary.update({"error_message": f"{failure_reason}"}) try: await envs.connection_manager.reset_session(session) - except: # noqa: E722 - logger.error(f"Error resetting session {session.session_id}") + except Exception as e: + logger.warning(f"Failed to reset session {session.session_id}: {type(e).__name__}: {e}", exc_info=True) + try: await envs.connection_manager.close_session(session) - except: # noqa: E722 - logger.error(f"Error closing session {session.session_id}") + except Exception as e: + logger.warning(f"Failed to close session {session.session_id}: {type(e).__name__}: {e}", exc_info=True) return trajectory async def _get_control_plane_status(self, session) -> Optional[Dict[str, Any]]: From 67918cb71e01121a608f22728b53106816596ab6 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 14:26:35 -0700 Subject: [PATCH 10/26] Rollout input params to completion params (#73) * convert rollout_input_params to completion_params * fix * DISABLE_EP_SQLITE_LOG * fix kwargs access to "model" * DRY completion params and make it a dict * fix tests 
* revert * fix * ensure logging * fix smoke test params --- eval_protocol/adapters/huggingface.py | 268 +++++++++--------- eval_protocol/adapters/langfuse.py | 16 +- eval_protocol/benchmarks/suites/aime25.py | 9 +- eval_protocol/benchmarks/suites/gpqa.py | 10 +- .../suites/livebench_data_analysis.py | 39 ++- .../benchmarks/suites/tau_bench_retail.py | 11 +- eval_protocol/dataset_logger/__init__.py | 9 +- eval_protocol/mcp/execution/manager.py | 14 +- eval_protocol/models.py | 23 +- .../pytest/default_agent_rollout_processor.py | 4 +- .../default_mcp_gym_rollout_processor.py | 8 +- .../default_single_turn_rollout_process.py | 20 +- eval_protocol/pytest/evaluation_test.py | 133 ++++----- eval_protocol/pytest/types.py | 6 +- examples/gpqa/tests/test_gpqa.py | 5 +- examples/healthbench/tests/test_evaluation.py | 5 +- tests/pytest/test_apps_coding.py | 5 +- tests/pytest/test_basic_coding.py | 5 +- tests/pytest/test_frozen_lake.py | 7 +- tests/pytest/test_hallucination.py | 5 +- tests/pytest/test_lunar_lander.py | 5 +- tests/pytest/test_markdown_highlighting.py | 5 +- tests/pytest/test_pytest_async.py | 4 +- ..._pytest_default_agent_rollout_processor.py | 4 +- tests/pytest/test_pytest_ensure_logging.py | 73 +++++ tests/pytest/test_pytest_flaky_sometimes.py | 2 +- tests/pytest/test_pytest_function_calling.py | 2 +- tests/pytest/test_pytest_ids.py | 19 +- tests/pytest/test_pytest_input_messages.py | 2 +- tests/pytest/test_pytest_json_schema.py | 2 +- tests/pytest/test_pytest_math_example.py | 3 +- .../pytest/test_pytest_math_format_length.py | 3 +- tests/pytest/test_pytest_mcp_config.py | 2 +- tests/pytest/test_pytest_mcp_url.py | 4 +- .../pytest/test_pytest_word_count_example.py | 3 +- tests/pytest/test_tau_bench_airline.py | 12 +- tests/test_models.py | 5 +- tests/test_tau_bench_airline_smoke.py | 9 +- vite-app/src/components/EvaluationRow.tsx | 2 +- vite-app/src/types/eval-protocol.ts | 9 +- 40 files changed, 431 insertions(+), 341 deletions(-) create mode 100644 
tests/pytest/test_pytest_ensure_logging.py diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py index 15391181..2825dafa 100644 --- a/eval_protocol/adapters/huggingface.py +++ b/eval_protocol/adapters/huggingface.py @@ -4,21 +4,20 @@ transformation functions to convert them to EvaluationRow format. """ -from typing import Any, Callable, Dict, Iterator, List, Optional import logging +from typing import Any, Callable, Dict, Iterator, List, Optional -from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams +from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message logger = logging.getLogger(__name__) try: - from datasets import load_dataset, Dataset, DatasetDict + from datasets import Dataset, DatasetDict, load_dataset + DATASETS_AVAILABLE = True except ImportError: DATASETS_AVAILABLE = False - logger.warning( - "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'" - ) + logger.warning("HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'") # Type alias for transformation function TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]] @@ -26,11 +25,11 @@ class HuggingFaceAdapter: """Generic adapter to load HuggingFace datasets with custom transformations. - + This adapter loads datasets from HuggingFace Hub and applies a user-provided - transformation function to convert each row to the format expected by + transformation function to convert each row to the format expected by EvaluationRow. 
- + The transformation function should take a dataset row dictionary and return: { 'messages': List[Dict] - list of message dictionaries with 'role' and 'content' @@ -38,7 +37,7 @@ class HuggingFaceAdapter: 'metadata': Optional[Dict] - any additional metadata to preserve 'tools': Optional[List[Dict]] - tool definitions for tool calling scenarios } - + Examples: Simple Q&A dataset: >>> def transform(row): @@ -49,7 +48,7 @@ class HuggingFaceAdapter: ... } >>> adapter = HuggingFaceAdapter("my-dataset", transform_fn=transform) >>> rows = list(adapter.get_evaluation_rows(split="test", limit=10)) - + Math problems with system prompt: >>> def gsm8k_transform(row): ... return { @@ -62,7 +61,7 @@ class HuggingFaceAdapter: ... } >>> adapter = HuggingFaceAdapter("gsm8k", config_name="main", transform_fn=gsm8k_transform) """ - + def __init__( self, dataset_id: str, @@ -72,7 +71,7 @@ def __init__( **load_dataset_kwargs, ): """Initialize the HuggingFace adapter. - + Args: dataset_id: HuggingFace dataset identifier (e.g., "gsm8k", "squad", "org/dataset") transform_fn: Function to transform dataset rows to evaluation format @@ -84,16 +83,16 @@ def __init__( raise ImportError( "HuggingFace datasets not installed. Install with: pip install 'eval-protocol[huggingface]'" ) - + self.dataset_id = dataset_id self.transform_fn = transform_fn self.config_name = config_name self.revision = revision self.load_dataset_kwargs = load_dataset_kwargs - + # Load the dataset self.dataset = self._load_dataset() - + @classmethod def from_local( cls, @@ -102,53 +101,49 @@ def from_local( **load_dataset_kwargs, ) -> "HuggingFaceAdapter": """Create adapter from local dataset file. - + Args: path: Path to local dataset file (JSON, JSONL, CSV, etc.) 
transform_fn: Function to transform dataset rows **load_dataset_kwargs: Additional arguments to pass to load_dataset - + Returns: HuggingFaceAdapter instance """ # Determine file format - if path.endswith('.jsonl'): + if path.endswith(".jsonl"): dataset_type = "json" - elif path.endswith('.json'): + elif path.endswith(".json"): dataset_type = "json" - elif path.endswith('.csv'): + elif path.endswith(".csv"): dataset_type = "csv" - elif path.endswith('.parquet'): + elif path.endswith(".parquet"): dataset_type = "parquet" else: # Let HuggingFace auto-detect dataset_type = None - - load_kwargs = {'data_files': path, **load_dataset_kwargs} - - return cls( - dataset_id=dataset_type or "json", - transform_fn=transform_fn, - **load_kwargs - ) - + + load_kwargs = {"data_files": path, **load_dataset_kwargs} + + return cls(dataset_id=dataset_type or "json", transform_fn=transform_fn, **load_kwargs) + def _load_dataset(self) -> "Dataset | DatasetDict": """Load the dataset from HuggingFace Hub or local source.""" try: kwargs = {} if self.config_name: - kwargs['name'] = self.config_name + kwargs["name"] = self.config_name if self.revision: - kwargs['revision'] = self.revision - + kwargs["revision"] = self.revision + kwargs.update(self.load_dataset_kwargs) - + return load_dataset(self.dataset_id, **kwargs) - + except (OSError, ValueError, RuntimeError) as e: logger.error("Failed to load dataset %s: %s", self.dataset_id, e) raise - + def get_evaluation_rows( self, split: Optional[str] = None, @@ -160,7 +155,7 @@ def get_evaluation_rows( **completion_params_kwargs, ) -> Iterator[EvaluationRow]: """Convert dataset entries to EvaluationRow format. 
- + Args: split: Dataset split to use (if dataset has multiple splits) limit: Maximum number of rows to return @@ -169,7 +164,7 @@ def get_evaluation_rows( temperature: Temperature for completion parameters max_tokens: Max tokens for completion parameters **completion_params_kwargs: Additional completion parameters - + Yields: EvaluationRow: Converted evaluation rows """ @@ -183,35 +178,33 @@ def get_evaluation_rows( dataset = self.dataset[split] elif split is not None: logger.warning("Split '%s' specified but dataset is not split", split) - + # Apply offset and limit total_rows = len(dataset) end_idx = min(offset + limit, total_rows) if limit else total_rows - + if offset >= total_rows: logger.warning("Offset %d is greater than dataset size %d", offset, total_rows) return - + # Create completion parameters - completion_params = CompletionParams( - model=model_name, - temperature=temperature, - max_tokens=max_tokens, + completion_params: CompletionParams = { + "model": model_name, + "temperature": temperature, + "max_tokens": max_tokens, **completion_params_kwargs, - ) - + } + # Convert each row for i in range(offset, end_idx): try: raw_row = dataset[i] - eval_row = self._convert_row_to_evaluation_row( - raw_row, i, completion_params, split - ) + eval_row = self._convert_row_to_evaluation_row(raw_row, i, completion_params, split) yield eval_row except (AttributeError, ValueError, KeyError) as e: logger.warning("Failed to convert row %d: %s", i, e) continue - + def _convert_row_to_evaluation_row( self, raw_row: Dict[str, Any], @@ -220,83 +213,87 @@ def _convert_row_to_evaluation_row( split: Optional[str] = None, ) -> EvaluationRow: """Convert a single dataset row to EvaluationRow format. 
- + Args: raw_row: Raw dataset row dictionary row_index: Index of the row in the dataset completion_params: Completion parameters to use split: Dataset split name - + Returns: EvaluationRow object """ # Apply user transformation transformed = self.transform_fn(raw_row) - + # Validate required fields - if 'messages' not in transformed: + if "messages" not in transformed: raise ValueError("Transform function must return 'messages' field") - + # Convert message dictionaries to Message objects messages = [] - for msg_dict in transformed['messages']: + for msg_dict in transformed["messages"]: if not isinstance(msg_dict, dict): raise ValueError("Each message must be a dictionary") - if 'role' not in msg_dict: + if "role" not in msg_dict: raise ValueError("Each message must have a 'role' field") - - messages.append(Message( - role=msg_dict['role'], - content=msg_dict.get('content'), - name=msg_dict.get('name'), - tool_call_id=msg_dict.get('tool_call_id'), - tool_calls=msg_dict.get('tool_calls'), - function_call=msg_dict.get('function_call'), - )) - + + messages.append( + Message( + role=msg_dict["role"], + content=msg_dict.get("content"), + name=msg_dict.get("name"), + tool_call_id=msg_dict.get("tool_call_id"), + tool_calls=msg_dict.get("tool_calls"), + function_call=msg_dict.get("function_call"), + ) + ) + # Extract other fields - ground_truth = transformed.get('ground_truth') - tools = transformed.get('tools') - user_metadata = transformed.get('metadata', {}) - + ground_truth = transformed.get("ground_truth") + tools = transformed.get("tools") + user_metadata = transformed.get("metadata", {}) + # Create dataset info dataset_info = { - 'dataset_id': self.dataset_id, - 'config_name': self.config_name, - 'revision': self.revision, - 'split': split, - 'row_index': row_index, - 'transform_function': self.transform_fn.__name__ if hasattr(self.transform_fn, '__name__') else 'anonymous', + "dataset_id": self.dataset_id, + "config_name": self.config_name, + "revision": 
self.revision, + "split": split, + "row_index": row_index, + "transform_function": ( + self.transform_fn.__name__ if hasattr(self.transform_fn, "__name__") else "anonymous" + ), } - + # Add user metadata dataset_info.update(user_metadata) - + # Add original row data (with prefix to avoid conflicts) for key, value in raw_row.items(): - dataset_info[f'original_{key}'] = value - + dataset_info[f"original_{key}"] = value + # Create input metadata input_metadata = InputMetadata( row_id=f"{self.dataset_id}_{row_index}", completion_params=completion_params, dataset_info=dataset_info, session_data={ - 'dataset_source': 'huggingface', - 'timestamp': None, - } + "dataset_source": "huggingface", + "timestamp": None, + }, ) - + return EvaluationRow( messages=messages, tools=tools, input_metadata=input_metadata, ground_truth=str(ground_truth) if ground_truth is not None else None, ) - + def get_splits(self) -> List[str]: """Get available dataset splits. - + Returns: List of available split names """ @@ -304,27 +301,29 @@ def get_splits(self) -> List[str]: return list(self.dataset.keys()) else: return ["train"] # Default split name for non-split datasets - + def get_dataset_info(self) -> Dict[str, Any]: """Get information about the loaded dataset. 
- + Returns: Dictionary with dataset information """ info = { - 'dataset_id': self.dataset_id, - 'config_name': self.config_name, - 'revision': self.revision, - 'splits': self.get_splits(), - 'transform_function': self.transform_fn.__name__ if hasattr(self.transform_fn, '__name__') else 'anonymous', + "dataset_id": self.dataset_id, + "config_name": self.config_name, + "revision": self.revision, + "splits": self.get_splits(), + "transform_function": ( + self.transform_fn.__name__ if hasattr(self.transform_fn, "__name__") else "anonymous" + ), } - + # Add split sizes if isinstance(self.dataset, DatasetDict): - info['split_sizes'] = {split: len(data) for split, data in self.dataset.items()} + info["split_sizes"] = {split: len(data) for split, data in self.dataset.items()} else: - info['total_size'] = len(self.dataset) - + info["total_size"] = len(self.dataset) + return info @@ -336,14 +335,14 @@ def create_huggingface_adapter( **load_dataset_kwargs, ) -> HuggingFaceAdapter: """Factory function to create a HuggingFace adapter. - + Args: dataset_id: HuggingFace dataset identifier transform_fn: Function to transform dataset rows to evaluation format config_name: Optional configuration name revision: Optional dataset revision/commit hash **load_dataset_kwargs: Additional arguments for load_dataset - + Returns: HuggingFaceAdapter instance """ @@ -362,11 +361,11 @@ def create_gsm8k_adapter( revision: Optional[str] = None, ) -> HuggingFaceAdapter: """Create adapter specifically configured for GSM8K dataset. - + Args: system_prompt: Optional system prompt for math problems revision: Optional dataset revision/commit - + Returns: HuggingFaceAdapter configured for GSM8K """ @@ -374,24 +373,24 @@ def create_gsm8k_adapter( "You are a helpful assistant that solves math problems step by step. " "Show your work and provide the final answer." 
) - + system_content = system_prompt or default_system_prompt - + def gsm8k_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Transform GSM8K row to evaluation format.""" return { - 'messages': [ - {'role': 'system', 'content': system_content}, - {'role': 'user', 'content': row['question']}, + "messages": [ + {"role": "system", "content": system_content}, + {"role": "user", "content": row["question"]}, ], - 'ground_truth': row['answer'], - 'metadata': { - 'dataset': 'gsm8k', - 'question_length': len(row['question']), - 'answer_length': len(row['answer']), - } + "ground_truth": row["answer"], + "metadata": { + "dataset": "gsm8k", + "question_length": len(row["question"]), + "answer_length": len(row["answer"]), + }, } - + return create_huggingface_adapter( dataset_id="gsm8k", config_name="main", @@ -405,40 +404,39 @@ def create_math_adapter( revision: Optional[str] = None, ) -> HuggingFaceAdapter: """Create adapter specifically configured for MATH competition dataset. - + Args: system_prompt: Optional system prompt for math problems revision: Optional dataset revision/commit - + Returns: HuggingFaceAdapter configured for MATH dataset """ default_system_prompt = ( - "You are an expert mathematician. Solve this advanced math problem " - "step by step, showing detailed work." + "You are an expert mathematician. Solve this advanced math problem " "step by step, showing detailed work." 
) - + system_content = system_prompt or default_system_prompt - + def math_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Transform MATH dataset row to evaluation format.""" return { - 'messages': [ - {'role': 'system', 'content': system_content}, - {'role': 'user', 'content': row['problem']}, + "messages": [ + {"role": "system", "content": system_content}, + {"role": "user", "content": row["problem"]}, ], - 'ground_truth': row['solution'], - 'metadata': { - 'dataset': 'hendrycks_math', - 'type': row.get('type', 'unknown'), - 'level': row.get('level', 'unknown'), - 'problem_length': len(row['problem']), - 'solution_length': len(row['solution']), - } + "ground_truth": row["solution"], + "metadata": { + "dataset": "hendrycks_math", + "type": row.get("type", "unknown"), + "level": row.get("level", "unknown"), + "problem_length": len(row["problem"]), + "solution_length": len(row["solution"]), + }, } - + return create_huggingface_adapter( dataset_id="hendrycks/competition_math", transform_fn=math_transform, revision=revision, - ) \ No newline at end of file + ) diff --git a/eval_protocol/adapters/langfuse.py b/eval_protocol/adapters/langfuse.py index a3f35cba..0061b983 100644 --- a/eval_protocol/adapters/langfuse.py +++ b/eval_protocol/adapters/langfuse.py @@ -4,11 +4,11 @@ to EvaluationRow format for use in evaluation pipelines. 
""" -from typing import Any, Dict, Iterator, List, Optional -from datetime import datetime import logging +from datetime import datetime +from typing import Any, Dict, Iterator, List, Optional -from eval_protocol.models import EvaluationRow, Message, InputMetadata, CompletionParams +from eval_protocol.models import EvaluationRow, InputMetadata, Message logger = logging.getLogger(__name__) @@ -277,20 +277,20 @@ def _create_input_metadata(self, trace: Any, observations: List[Any]) -> InputMe InputMetadata object """ # Extract completion parameters from observations - completion_params = CompletionParams() + completion_params = {} # Look for model parameters in observations for obs in observations: if hasattr(obs, "model") and obs.model: - completion_params.model = obs.model + completion_params["model"] = obs.model if hasattr(obs, "model_parameters") and obs.model_parameters: params = obs.model_parameters if "temperature" in params: - completion_params.temperature = params["temperature"] + completion_params["temperature"] = params["temperature"] if "max_tokens" in params: - completion_params.max_tokens = params["max_tokens"] + completion_params["max_tokens"] = params["max_tokens"] if "top_p" in params: - completion_params.top_p = params["top_p"] + completion_params["top_p"] = params["top_p"] break # Create dataset info from trace metadata diff --git a/eval_protocol/benchmarks/suites/aime25.py b/eval_protocol/benchmarks/suites/aime25.py index 4a5d3a4c..3558eaa1 100644 --- a/eval_protocol/benchmarks/suites/aime25.py +++ b/eval_protocol/benchmarks/suites/aime25.py @@ -60,13 +60,18 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: @export_benchmark("aime25") @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_dataset=[ "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", ], 
dataset_adapter=aime2025_dataset_adapter, - rollout_input_params=[{"max_tokens": 131000, "extra_body": {"reasoning_effort": "low"}}], + completion_params=[ + { + "max_tokens": 131000, + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + } + ], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", passed_threshold=None, diff --git a/eval_protocol/benchmarks/suites/gpqa.py b/eval_protocol/benchmarks/suites/gpqa.py index 91620c9a..76967beb 100644 --- a/eval_protocol/benchmarks/suites/gpqa.py +++ b/eval_protocol/benchmarks/suites/gpqa.py @@ -55,6 +55,7 @@ def _extract_abcd_letter(text: str) -> str | None: _GPQA_INPUT_MESSAGES = _load_gpqa_messages_from_csv() + def _strip_gt_messages(msgs: List[Message]) -> List[Message]: return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))] @@ -67,16 +68,19 @@ async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> if gt_tokens: gt_val = gt_tokens[-1].split(":", 1)[1].strip() r.ground_truth = gt_val - r.messages = [m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:"))] + r.messages = [ + m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:")) + ] processed.append(r) return await default_single_turn_rollout_processor(processed, config) @export_benchmark("gpqa") @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_GPQA_INPUT_MESSAGES, - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + completion_params=[ + {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} + ], rollout_processor=gpqa_strip_gt_rollout_processor, aggregation_method="mean", passed_threshold=None, diff --git a/eval_protocol/benchmarks/suites/livebench_data_analysis.py 
b/eval_protocol/benchmarks/suites/livebench_data_analysis.py index 1c04b6fd..fc5abb4e 100644 --- a/eval_protocol/benchmarks/suites/livebench_data_analysis.py +++ b/eval_protocol/benchmarks/suites/livebench_data_analysis.py @@ -1,20 +1,19 @@ -from typing import Any, Dict, List, Optional - import json import re +from typing import Any, Dict, List, Optional +from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( default_single_turn_rollout_processor, ) from eval_protocol.pytest.evaluation_test import evaluation_test -from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark - # ------------------------- # Lightweight ports of LiveBench scoring utilities for data_analysis tasks # ------------------------- + def _lb_clean_text(text: str) -> str: text = text.lower().strip() text = re.sub(r"[^\w]", "", text) @@ -36,9 +35,7 @@ def _cta_process_results(ground_truth: str, llm_answer: str) -> int: boxed = _extract_last_boxed_segment(parsed_answer) if boxed is not None: parsed_answer = boxed - parsed_answer = ( - parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "") - ) + parsed_answer = parsed_answer.replace("\\text{", "").replace("}", "").replace("\\", "") gt_clean = _lb_clean_text(ground_truth) ans_clean = _lb_clean_text(parsed_answer) @@ -132,17 +129,15 @@ def _tablejoin_process_results(ground_truth: Any, llm_answer: str) -> float: return round((2 * tp) / denom, 2) -def _tablereformat_process_results( - input_command: str, ground_truth: str, llm_answer: str, version: str -) -> int: +def _tablereformat_process_results(input_command: str, ground_truth: str, llm_answer: str, version: str) -> int: try: import pandas as pd # type: ignore except Exception: return 0 - from io import StringIO import math as _math import traceback as 
_traceback + from io import StringIO def _read_df_v1(df_type: str, df_str: str): if df_type == "json": @@ -252,8 +247,12 @@ def _read_jsonl_table_from_text(text: str, header_cols: List[str]): ) else: lines = input_command.split("\n") - input_fmt = [l for l in lines if "Source Format" in l][-1].split("Source Format: ")[-1].strip().lower() - output_fmt = [l for l in lines if "Target Format" in l][-1].split("Target Format: ")[-1].strip().lower() + input_fmt = ( + [line for line in lines if "Source Format" in line][-1].split("Source Format: ")[-1].strip().lower() + ) + output_fmt = ( + [line for line in lines if "Target Format" in line][-1].split("Target Format: ")[-1].strip().lower() + ) reader = _read_df_v1 if version == "v1" else _read_df_v2 gt_df = reader(output_fmt, ground_truth) @@ -373,9 +372,9 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]: @export_benchmark("live_bench/data_analysis/cta") @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _CTA_ROWS], - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", passed_threshold=None, @@ -416,9 +415,9 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow: @export_benchmark("live_bench/data_analysis/tablejoin") @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS], - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, 
aggregation_method="mean", passed_threshold=None, @@ -460,9 +459,9 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow: @export_benchmark("live_bench/data_analysis/tablereformat") @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS], - rollout_input_params=[{"extra_body": {"reasoning_effort": "low"}}], + rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", passed_threshold=None, @@ -508,5 +507,3 @@ def livebench_tablereformat_pointwise(row: EvaluationRow) -> EvaluationRow: "live_bench/data_analysis/tablereformat", ], ) - - diff --git a/eval_protocol/benchmarks/suites/tau_bench_retail.py b/eval_protocol/benchmarks/suites/tau_bench_retail.py index 9e1104d4..8e8aaea0 100644 --- a/eval_protocol/benchmarks/suites/tau_bench_retail.py +++ b/eval_protocol/benchmarks/suites/tau_bench_retail.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List from eval_protocol.benchmarks.registry import export_benchmark -from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor from vendor.tau2.data_model.message import ( @@ -66,8 +66,13 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/retail_dataset.jsonl"], dataset_adapter=tau_bench_retail_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], 
+ completion_params=[ + { + "temperature": 0.8, + "extra_body": {"reasoning_effort": "medium"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + } + ], rollout_processor=default_mcp_gym_rollout_processor, rollout_processor_kwargs={"domain": "retail"}, num_runs=8, diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py index 9478ec6f..1caf6adc 100644 --- a/eval_protocol/dataset_logger/__init__.py +++ b/eval_protocol/dataset_logger/__init__.py @@ -1,11 +1,14 @@ -from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter import os +from eval_protocol.dataset_logger.dataset_logger import DatasetLogger +from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter + # Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs. -if os.getenv("EP_SQLITE_LOG", "0").strip() == "1": +if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() == "1": default_logger = SqliteDatasetLoggerAdapter() else: - class _NoOpLogger: + + class _NoOpLogger(DatasetLogger): def log(self, row): return None diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index e30f67d5..405e72b4 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -20,7 +20,7 @@ from vendor.tau2.data_model.message import AssistantMessage, UserMessage from vendor.tau2.user.user_simulator import UserSimulator -from ...models import CompletionParams, EvaluationRow, InputMetadata, Message +from ...models import EvaluationRow, InputMetadata, Message from ...types import MCPSession, MCPToolCall, TerminationReason, Trajectory if TYPE_CHECKING: @@ -128,12 +128,12 @@ async def _execute_with_semaphore(idx): evaluation_row.messages = messages evaluation_row.tools = shared_tool_schema evaluation_row.usage = CompletionUsage(**trajectory.usage) - evaluation_row.input_metadata.completion_params 
= CompletionParams( - model=policy.model_id, - temperature=getattr(policy, "temperature", None), - max_tokens=getattr(policy, "max_tokens", None), - max_tool_calls=getattr(policy, "max_tools_per_turn", None), - ) + evaluation_row.input_metadata.completion_params = { + "model": policy.model_id, + "temperature": getattr(policy, "temperature", None), + "max_tokens": getattr(policy, "max_tokens", None), + "max_tool_calls": getattr(policy, "max_tools_per_turn", None), + } if trajectory.terminated: if trajectory.termination_reason == TerminationReason.ERROR: diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 77707c23..3f4391fa 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -1,6 +1,6 @@ import os from datetime import datetime -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, List, Literal, Optional, TypedDict, Union from openai.types import CompletionUsage from openai.types.chat.chat_completion_message import ( @@ -178,13 +178,18 @@ def __iter__(self): return iter(self.__fields__.keys()) # Changed to __fields__ -class CompletionParams(BaseModel): - """Configuration for the language model used in the session.""" +CompletionParams = Dict[str, Any] +""" +Common set of completion parameters that most model providers support in their +API. Set total=False to allow extra fields since LiteLLM + providers have their +own set of parameters. The following parameters are common fields that are +populated. 
- model: str = Field(..., description="Model identifier (e.g., 'gpt-4.1', 'fireworks/llama')") - temperature: Optional[float] = Field(None, description="Temperature setting for model generation") - max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate") - max_tool_calls: Optional[int] = Field(None, description="Maximum tool calls per turn") +model: str +temperature: Optional[float] +max_tokens: Optional[int] +top_p: Optional[float] +""" class InputMetadata(BaseModel): @@ -193,7 +198,9 @@ class InputMetadata(BaseModel): model_config = ConfigDict(extra="allow") row_id: Optional[str] = Field(default_factory=generate_id, description="Unique string to ID the row") - completion_params: Optional[CompletionParams] = Field(None, description="Completion endpoint parameters used") + completion_params: CompletionParams = Field( + default_factory=dict, description="Completion endpoint parameters used" + ) dataset_info: Optional[Dict[str, Any]] = Field( None, description="Dataset row details: seed, system_prompt, environment_context, etc" ) diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index 6a158b54..b3997c49 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -125,7 +125,9 @@ async def default_agent_rollout_processor( async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" - agent = Agent(model=config.model, row=row, config_path=config.mcp_config_path, logger=config.logger) + agent = Agent( + model=config.completion_params.model, row=row, config_path=config.mcp_config_path, logger=config.logger + ) try: await agent.setup() await agent.call_agent() diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index de9d8ca1..2b90239d 100644 --- 
a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -216,10 +216,10 @@ async def default_mcp_gym_rollout_processor( server.start() policy = ep.LiteLLMPolicy( - model_id=config.model, - temperature=config.input_params.get("temperature", 0.0), - max_tokens=config.input_params.get("max_tokens", 4096), - reasoning_effort=config.input_params.get("reasoning_effort", None), + model_id=config.completion_params.model, + temperature=config.completion_params.get("temperature", 0.0), + max_tokens=config.completion_params.get("max_tokens", 4096), + reasoning_effort=config.completion_params.get("reasoning_effort", None), ) # Create MCP environments directly from evaluation_rows diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 424347cd..ef2ad48b 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -41,20 +41,20 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] - request_params = {"model": config.model, "messages": messages_payload, **config.input_params} + request_params = {"messages": messages_payload, **config.completion_params} # Ensure caching is disabled only for this request (review feedback) request_params["cache"] = {"no-cache": True} # Single-level reasoning effort: expect `reasoning_effort` only effort_val = None - if isinstance(config.input_params, dict): - if "reasoning_effort" in config.input_params: - effort_val = str(config.input_params["reasoning_effort"]) # flat shape - elif ( - isinstance(config.input_params.get("extra_body"), dict) - and "reasoning_effort" in config.input_params["extra_body"] - ): - # Accept if user passed it directly inside extra_body - effort_val = 
str(config.input_params["extra_body"]["reasoning_effort"]) # already in extra_body + + if "reasoning_effort" in config.completion_params: + effort_val = str(config.completion_params["reasoning_effort"]) # flat shape + elif ( + isinstance(config.completion_params.get("extra_body"), dict) + and "reasoning_effort" in config.completion_params["extra_body"] + ): + # Accept if user passed it directly inside extra_body + effort_val = str(config.completion_params["extra_body"]["reasoning_effort"]) # already in extra_body if effort_val: # Always under extra_body so LiteLLM forwards to provider-specific param set diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 81856ff6..dd7ecb04 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -32,7 +32,6 @@ EvaluationTestMode, InputMessagesParam, ModelParam, - RolloutInputParam, RolloutProcessor, RolloutProcessorConfig, RolloutProcessorInputParam, @@ -52,16 +51,15 @@ def evaluation_test( # noqa: C901 *, - model: List[ModelParam], + completion_params: List[CompletionParams], input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter, - rollout_input_params: Optional[List[RolloutInputParam]] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None, rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, aggregation_method: AggregationMethod = "mean", - passed_threshold: Optional[Union[EvaluationThreshold, float]] = None, + passed_threshold: Optional[Union[EvaluationThreshold, float, dict]] = None, num_runs: int = 1, max_dataset_rows: Optional[int] = None, mcp_config_path: Optional[str] = None, @@ -109,7 +107,6 @@ def evaluation_test( # noqa: C901 which can be used to easily group and identify your 
dataset by. Args: - model: Model identifiers to query. input_messages: Messages to send to the model. This is useful if you don't have a dataset but can hard-code the messages. Will be passed as "input_dataset" to the test function. @@ -118,12 +115,12 @@ def evaluation_test( # noqa: C901 to a list of EvaluationRows if you have a custom dataset format. dataset_adapter: Function to convert the input dataset to a list of EvaluationRows. This is useful if you have a custom dataset format. - rollout_input_params: Generation parameters for the rollout. + completion_params: Generation parameters for the rollout. rollout_processor: Function used to perform the rollout. evaluation_test_kwargs: Kwargs for the evaluation function. rollout_processor_kwargs: Kwargs for the rollout processor. aggregation_method: How to aggregate scores across rows. - passed_threshold: Threshold configuration for test success. + passed_threshold: Threshold configuration for test success. Must be a float or EvaluationThreshold object. Success rate must be above success, and if set, standard deviation must be below standard_deviation. num_runs: Number of times to repeat the rollout and evaluations. max_dataset_rows: Limit dataset to the first N rows. @@ -242,7 +239,7 @@ def generate_combinations(): datasets = [[input_dataset]] # type: ignore else: datasets = [None] - rips: List[Optional[RolloutInputParam]] = rollout_input_params if rollout_input_params is not None else [None] # type: ignore + cps: List[Optional[CompletionParams]] = completion_params if completion_params is not None else [None] # type: ignore # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over # each row. Instead, pass the entire sliced list through in a single test run # so summaries aggregate all rows together (AIME-style behavior). 
@@ -259,17 +256,16 @@ def generate_combinations(): kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore # Generate all combinations - for m in model: - for ds in datasets: - for rip in rips: - for im in messages: - for etk in kwargs: - # if no dataset and no messages, raise an error - if ds is None and im is None: - raise ValueError( - "No dataset or messages provided. Please provide at least one of input_dataset or input_messages." - ) - combinations.append((m, ds, rip, im, etk)) + for ds in datasets: + for cp in cps: + for im in messages: + for etk in kwargs: + # if no dataset and no messages, raise an error + if ds is None and im is None: + raise ValueError( + "No dataset or messages provided. Please provide at least one of input_dataset or input_messages." + ) + combinations.append((ds, cp, im, etk)) return combinations @@ -282,12 +278,12 @@ def generate_combinations(): # Create parameter tuples for pytest.mark.parametrize param_tuples = [] for combo in combinations: - model_name, dataset, rip, messages, etk = combo - param_tuple = [model_name] + dataset, cp, messages, etk = combo + param_tuple = [] if input_dataset is not None: param_tuple.append(dataset) - if rollout_input_params is not None: - param_tuple.append(rip) + if completion_params is not None: + param_tuple.append(cp) if input_messages is not None: param_tuple.append(messages) if evaluation_test_kwargs is not None: @@ -295,11 +291,11 @@ def generate_combinations(): param_tuples.append(tuple(param_tuple)) # For batch mode, use the original parameter names - test_param_names = ["model"] + test_param_names = [] if input_dataset is not None: test_param_names.append("dataset_path") - if rollout_input_params is not None: - test_param_names.append("input_params") + if completion_params is not None: + test_param_names.append("completion_params") if input_messages is not None: test_param_names.append("input_messages") if 
evaluation_test_kwargs is not None: @@ -311,7 +307,6 @@ def create_wrapper_with_signature() -> Callable: invocation_id = generate_id() async def wrapper_body(**kwargs): - model_name = kwargs["model"] eval_metadata = None all_results: List[List[EvaluationRow]] = [[] for _ in range(num_runs)] @@ -352,7 +347,16 @@ def _log_eval_error( else: raise ValueError("No input dataset or input messages provided") - input_params = kwargs.get("input_params") or {} + if "completion_params" not in kwargs or not kwargs["completion_params"]: + raise ValueError( + "No completion parameters provided. Please provide a completion parameters object." + ) + completion_params = kwargs["completion_params"] + if "model" not in completion_params or not completion_params["model"]: + raise ValueError( + "No model provided. Please provide a model in the completion parameters object." + ) + # Optional global overrides via environment for ad-hoc experimentation # EP_INPUT_PARAMS_JSON can contain a JSON object that will be deep-merged # into input_params (e.g., '{"temperature":0,"extra_body":{"reasoning":{"effort":"low"}}}'). 
@@ -363,7 +367,7 @@ def _log_eval_error( if _env_override: override_obj = _json.loads(_env_override) if isinstance(override_obj, dict): - input_params = _deep_update_dict(dict(input_params), override_obj) + completion_params = _deep_update_dict(dict(completion_params), override_obj) except Exception: pass @@ -378,14 +382,6 @@ def _log_eval_error( passed=None, ) - # Populate completion_params in input_metadata for all rows and initialize eval_metadata BEFORE rollouts - completion_params = CompletionParams( - model=model_name, - temperature=input_params.get("temperature"), - max_tokens=input_params.get("max_tokens"), - max_tool_calls=input_params.get("max_tool_calls"), - ) - for row in data: if row.input_metadata is None: row.input_metadata = InputMetadata() @@ -405,14 +401,13 @@ def _log_eval_error( # Prepare rollout processor config once; we will generate fresh outputs per run config = RolloutProcessorConfig( - model=model_name, - input_params=input_params, + completion_params=completion_params, mcp_config_path=mcp_config_path or "", max_concurrent_rollouts=max_concurrent_rollouts, server_script_path=server_script_path, steps=steps, logger=active_logger, - kwargs=rollout_processor_kwargs, + kwargs=rollout_processor_kwargs or {}, ) for i in range(num_runs): @@ -535,7 +530,7 @@ async def _execute_with_semaphore(row): should_print = os.getenv("EP_PRINT_SUMMARY") == "1" summary_path = os.getenv("EP_SUMMARY_JSON") suite_name = test_func.__name__ - model_used = model_name + model_used = config.completion_params.model total_rows = len([item for sublist in all_results for item in sublist]) summary_obj = { "suite": suite_name, @@ -619,7 +614,7 @@ def _extract_effort_tag(params: dict) -> str | None: return None model_slug = _sanitize_filename(model_used) - effort_tag = _extract_effort_tag(input_params) or "" + effort_tag = _extract_effort_tag(completion_params) or "" effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" base_name = 
f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" @@ -756,11 +751,10 @@ async def dual_mode_wrapper(*args, **kwargs): try: dual_mode_wrapper.__ep_original_test_func = test_func # type: ignore[attr-defined] dual_mode_wrapper.__ep_config = { - "model": model, "input_messages": input_messages, "input_dataset": input_dataset, "dataset_adapter": dataset_adapter, - "rollout_input_params": rollout_input_params, + "rollout_input_params": completion_params, "rollout_processor": rollout_processor, "evaluation_test_kwargs": evaluation_test_kwargs, "rollout_processor_kwargs": rollout_processor_kwargs, @@ -794,14 +788,13 @@ def __ep_run_direct( rip = rip_list[0] if isinstance(rip_list, list) and rip_list else {} return run_evaluation_test_direct( test_func=dual_mode_wrapper.__ep_original_test_func, # type: ignore[attr-defined] - model=_model, input_messages=cfg.get("input_messages"), input_dataset=cfg.get("input_dataset"), dataset_adapter=cfg.get("dataset_adapter"), - rollout_input_params=rip, + completion_params=rip, rollout_processor=cfg.get("rollout_processor"), aggregation_method=cfg.get("aggregation_method"), - threshold_of_success=cfg.get("passed_threshold"), + passed_threshold=cfg.get("passed_threshold"), num_runs=(num_runs_override if num_runs_override is not None else cfg.get("num_runs")), max_dataset_rows=cfg.get("max_dataset_rows"), mcp_config_path=cfg.get("mcp_config_path"), @@ -825,15 +818,14 @@ def __ep_run_direct( def run_evaluation_test_direct( *, test_func: TestFunction, - model: str, input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter, - rollout_input_params: Optional[RolloutInputParam] = None, + completion_params: Optional[CompletionParams] = None, rollout_processor: RolloutProcessor = default_no_op_rollout_processor, rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, 
aggregation_method: AggregationMethod = "mean", - threshold_of_success: Optional[float] = None, + passed_threshold: Optional[Union[EvaluationThreshold, float]] = None, num_runs: int = 1, max_dataset_rows: Optional[int] = None, mcp_config_path: Optional[str] = None, @@ -849,6 +841,9 @@ def run_evaluation_test_direct( Returns a dict with keys: summary, results. """ + if passed_threshold is not None and not isinstance(passed_threshold, EvaluationThreshold): + passed_threshold = EvaluationThreshold(success=passed_threshold) + def _parse_ep_max_rows(default_value: int | None) -> int | None: raw = os.getenv("EP_MAX_DATASET_ROWS") if raw is None: @@ -893,7 +888,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: raise ValueError("No input dataset or input messages provided") # Build input params and apply env JSON override - input_params: Dict[str, Any] = rollout_input_params or {} + completion_params: Dict[str, Any] = completion_params or {} try: import json as _json @@ -901,7 +896,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: if _env_override: override_obj = _json.loads(_env_override) if isinstance(override_obj, dict): - input_params = _deep_update_dict(dict(input_params), override_obj) + completion_params = _deep_update_dict(dict(completion_params), override_obj) except Exception: pass @@ -912,17 +907,10 @@ def _deep_update_dict(base: dict, override: dict) -> dict: status="running", num_runs=num_runs, aggregation_method=aggregation_method, - threshold_of_success=threshold_of_success, + passed_threshold=passed_threshold, passed=None, ) - completion_params = CompletionParams( - model=model, - temperature=input_params.get("temperature"), - max_tokens=input_params.get("max_tokens"), - max_tool_calls=input_params.get("max_tool_calls"), - ) - for row in data: if row.input_metadata is None: row.input_metadata = InputMetadata() @@ -935,13 +923,12 @@ def _deep_update_dict(base: dict, override: dict) -> dict: default_logger.log(row) config = 
RolloutProcessorConfig( - model=model, - input_params=input_params, + completion_params=completion_params, mcp_config_path=mcp_config_path or "", max_concurrent_rollouts=max_concurrent_rollouts, server_script_path=server_script_path, steps=steps, - kwargs=rollout_processor_kwargs, + kwargs=rollout_processor_kwargs or {}, ) all_results: List[EvaluationRow] = [] @@ -986,8 +973,8 @@ def _deep_update_dict(base: dict, override: dict) -> dict: ci_high = None passed = None - if threshold_of_success is not None: - passed = agg_score >= threshold_of_success + if passed_threshold is not None: + passed = agg_score >= passed_threshold.success for r in all_results: if r.eval_metadata is not None: r.eval_metadata.status = "finished" @@ -1003,7 +990,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: total_rows = len(all_results) summary_obj = { "suite": suite_name, - "model": model, + "model": config.completion_params.model, "agg_score": float(agg_score) if agg_score is not None else None, "num_runs": num_runs, "rows": total_rows, @@ -1014,11 +1001,11 @@ def _deep_update_dict(base: dict, override: dict) -> dict: if should_print: if ci_low is not None and ci_high is not None: print( - f"EP Summary | suite={suite_name} model={model} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" + f"EP Summary | suite={suite_name} model={config.completion_params.model} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" ) else: print( - f"EP Summary | suite={suite_name} model={model} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" + f"EP Summary | suite={suite_name} model={config.completion_params.model} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" ) if summary_path: import json as _json @@ -1050,8 +1037,8 @@ def _extract_effort_tag(params: dict) -> str | None: return None return None - model_slug = _sanitize_filename(model) - 
effort_tag = _extract_effort_tag(input_params) or "" + model_slug = _sanitize_filename(config.completion_params.model) + effort_tag = _extract_effort_tag(completion_params) or "" effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" @@ -1073,10 +1060,10 @@ def _extract_effort_tag(params: dict) -> str | None: except Exception: pass - if threshold_of_success is not None and not passed: + if passed_threshold is not None and not passed: assert ( - agg_score >= threshold_of_success - ), f"Aggregated score {agg_score:.3f} below threshold {threshold_of_success}" + agg_score >= passed_threshold.success + ), f"Aggregated score {agg_score:.3f} below threshold {passed_threshold}" return {"summary": summary_obj, "results": all_results} except Exception: diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index 9f564ce1..1a80254b 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -8,11 +8,10 @@ from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger -from ..models import EvaluationRow, Message +from ..models import CompletionParams, EvaluationRow, Message ModelParam = str # gpt-4o, gpt-4o-mini, accounts/fireworks/models/llama-3.1-8b-instruct DatasetPathParam = str -RolloutInputParam = Dict[str, Any] InputMessagesParam = List[Message] EvaluationInputParam = Dict[str, Any] RolloutProcessorInputParam = Dict[str, Any] @@ -41,8 +40,7 @@ @dataclass class RolloutProcessorConfig: - model: ModelParam - input_params: RolloutInputParam # optional input parameters for inference + completion_params: CompletionParams # input parameters for inference mcp_config_path: str server_script_path: Optional[str] = ( None # TODO: change from server_script_path to mcp_config_path for agent rollout processor diff --git a/examples/gpqa/tests/test_gpqa.py 
b/examples/gpqa/tests/test_gpqa.py index c59d51e4..dcbf7b53 100644 --- a/examples/gpqa/tests/test_gpqa.py +++ b/examples/gpqa/tests/test_gpqa.py @@ -62,10 +62,9 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]: @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_GPQA_INPUT_MESSAGES, - rollout_input_params=[ - {"extra_body": {"reasoning_effort": "low"}} + completion_params=[ + {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ], # default to low effort; override via CLI plugin rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py index be70c261..a40c5d96 100644 --- a/examples/healthbench/tests/test_evaluation.py +++ b/examples/healthbench/tests/test_evaluation.py @@ -47,9 +47,10 @@ @evaluation_test( - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], input_messages=_HB_INPUT_MESSAGES, - rollout_input_params=[{"temperature": 0.2, "max_tokens": 512}], + completion_params=[ + {"temperature": 0.2, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} + ], rollout_processor=default_single_turn_rollout_processor, aggregation_method="mean", passed_threshold=None, diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index f9c84695..7cb976ac 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -26,8 +26,9 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio @evaluation_test( input_dataset=["tests/pytest/data/apps_sample_dataset.jsonl"], dataset_adapter=apps_dataset_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + completion_params=[ + {"temperature": 0.0, "max_tokens": 4096, "model": 
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + ], passed_threshold=0.33, rollout_processor=default_single_turn_rollout_processor, num_runs=1, diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index c96a8302..2b1c2a4a 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -28,8 +28,9 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat @evaluation_test( input_dataset=["tests/pytest/data/basic_coding_dataset.jsonl"], dataset_adapter=coding_dataset_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + completion_params=[ + {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + ], passed_threshold=0.8, rollout_processor=default_single_turn_rollout_processor, num_runs=1, diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py index 74d5e317..bea42bed 100644 --- a/tests/pytest/test_frozen_lake.py +++ b/tests/pytest/test_frozen_lake.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List -from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor @@ -38,8 +38,9 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation @evaluation_test( input_dataset=["tests/pytest/data/frozen_lake_dataset.jsonl"], dataset_adapter=frozen_lake_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + completion_params=[ + {"temperature": 0.0, "max_tokens": 4096, "model": 
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + ], rollout_processor=default_mcp_gym_rollout_processor, passed_threshold=0.66, num_runs=1, diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index 54779f09..b29fb53c 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -32,8 +32,9 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation @evaluation_test( input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"], dataset_adapter=hallucination_dataset_adapter, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 512}], + completion_params=[ + {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} + ], rollout_processor=default_single_turn_rollout_processor, passed_threshold=0.33, num_runs=1, diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py index ab4dad69..3fddac62 100644 --- a/tests/pytest/test_lunar_lander.py +++ b/tests/pytest/test_lunar_lander.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List -from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor @@ -38,8 +38,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio @evaluation_test( input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"], dataset_adapter=lunar_lander_to_evaluation_row, - model=["gpt-4.1"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + completion_params=[{"temperature": 0.0, "max_tokens": 4096, "model": "gpt-4.1"}], rollout_processor=default_mcp_gym_rollout_processor, 
passed_threshold=0.0, num_runs=1, diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py index cf32cb5f..9c70721f 100644 --- a/tests/pytest/test_markdown_highlighting.py +++ b/tests/pytest/test_markdown_highlighting.py @@ -28,8 +28,9 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/markdown_dataset.jsonl"], dataset_adapter=markdown_dataset_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.0, "max_tokens": 4096}], + completion_params=[ + {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} + ], passed_threshold=0.5, rollout_processor=default_single_turn_rollout_processor, num_runs=1, diff --git a/tests/pytest/test_pytest_async.py b/tests/pytest/test_pytest_async.py index 60f6fd6a..1cfc2db6 100644 --- a/tests/pytest/test_pytest_async.py +++ b/tests/pytest/test_pytest_async.py @@ -17,7 +17,7 @@ Message(role="user", content="What is the capital of the moon?"), ], ], - model=["accounts/fireworks/models/kimi-k2-instruct"], + completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}], ) async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" @@ -30,7 +30,7 @@ async def test_pytest_async(rows: List[EvaluationRow]) -> List[EvaluationRow]: Message(role="user", content="What is the capital of France?"), ], ], - model=["accounts/fireworks/models/kimi-k2-instruct"], + completion_params=[{"model": "accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", ) async def test_pytest_async_pointwise(row: EvaluationRow) -> EvaluationRow: diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index 06762046..8320ec8a 100644 --- 
a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -1,7 +1,7 @@ from datetime import datetime from typing import List -from eval_protocol.models import Message, EvaluationRow +from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test @@ -17,7 +17,7 @@ ] ], rollout_processor=default_agent_rollout_processor, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py new file mode 100644 index 00000000..4300e1b4 --- /dev/null +++ b/tests/pytest/test_pytest_ensure_logging.py @@ -0,0 +1,73 @@ +from typing import List +from unittest.mock import Mock, patch + +import eval_protocol.dataset_logger as dataset_logger +from eval_protocol.dataset_logger.dataset_logger import DatasetLogger +from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor +from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row + + +async def test_ensure_logging(monkeypatch): + """ + Ensure that default SQLITE logger gets called by mocking the storage and checking that the storage is called. 
+ """ + from eval_protocol.pytest.evaluation_test import evaluation_test + + # Mock the SqliteEvaluationRowStore to track calls + mock_store = Mock(spec=SqliteEvaluationRowStore) + mock_store.upsert_row = Mock() + mock_store.read_rows = Mock(return_value=[]) + mock_store.db_path = "/tmp/test.db" + + # Create a custom logger that uses our mocked store + class MockSqliteLogger(DatasetLogger): + def __init__(self, store: SqliteEvaluationRowStore): + self._store = store + + def log(self, row: EvaluationRow) -> None: + data = row.model_dump(exclude_none=True, mode="json") + self._store.upsert_row(data=data) + + def read(self, rollout_id=None) -> List[EvaluationRow]: + results = self._store.read_rows(rollout_id=rollout_id) + return [EvaluationRow(**data) for data in results] + + mock_logger = MockSqliteLogger(mock_store) + + @evaluation_test( + input_dataset=[ + "tests/pytest/data/markdown_dataset.jsonl", + ], + completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], + dataset_adapter=markdown_dataset_to_evaluation_row, + rollout_processor=default_no_op_rollout_processor, + mode="pointwise", + combine_datasets=False, + num_runs=2, + logger=mock_logger, # Use our mocked logger + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + return row + + await eval_fn( + dataset_path=["tests/pytest/data/markdown_dataset.jsonl"], + completion_params={"temperature": 0.0, "model": "dummy/local-model"}, + ) + + # Verify that the store's upsert_row method was called + assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called" + + # Check that it was called multiple times (once for each row) + call_count = mock_store.upsert_row.call_count + assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times" + + # Verify the calls were made with proper data structure + for call in mock_store.upsert_row.call_args_list: + args, kwargs = call + data = args[0] if args else kwargs.get("data") 
+ assert data is not None, "upsert_row should be called with data parameter" + assert isinstance(data, dict), "data should be a dictionary" + assert "execution_metadata" in data, "data should contain execution_metadata" + assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata" diff --git a/tests/pytest/test_pytest_flaky_sometimes.py b/tests/pytest/test_pytest_flaky_sometimes.py index cb70ec1e..65e1e63d 100644 --- a/tests/pytest/test_pytest_flaky_sometimes.py +++ b/tests/pytest/test_pytest_flaky_sometimes.py @@ -12,7 +12,7 @@ @pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping flaky test in CI") @evaluation_test( input_messages=[[Message(role="user", content="Return HEADS or TAILS at random.")]], - model=["dummy/local-model"], + completion_params=[{"model": "dummy/local-model"}], rollout_processor=default_no_op_rollout_processor, mode="pointwise", num_runs=5, diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py index 84f44fc5..63488dbe 100644 --- a/tests/pytest/test_pytest_function_calling.py +++ b/tests/pytest/test_pytest_function_calling.py @@ -20,7 +20,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( input_dataset=["tests/pytest/data/function_calling.jsonl"], - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", dataset_adapter=function_calling_to_evaluation_row, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py index 24ba3baf..045d2a19 100644 --- a/tests/pytest/test_pytest_ids.py +++ b/tests/pytest/test_pytest_ids.py @@ -28,7 +28,7 @@ async def test_evaluation_test_decorator(monkeypatch): input_dataset=[ "tests/pytest/data/markdown_dataset.jsonl", ], - model=["dummy/local-model"], + 
completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], dataset_adapter=markdown_dataset_to_evaluation_row, rollout_processor=default_no_op_rollout_processor, mode="pointwise", @@ -45,7 +45,7 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: # Manually invoke all parameter combinations within a single test for ds_path in dataset_paths: - await eval_fn(model="dummy/local-model", dataset_path=[ds_path]) + await eval_fn(dataset_path=[ds_path], completion_params={"temperature": 0.0, "model": "dummy/local-model"}) # Assertions on IDs generated by the decorator logic assert len(logger.read()) == 38 @@ -66,8 +66,10 @@ async def test_evaluation_test_decorator_ids_single(monkeypatch): "tests/pytest/data/markdown_dataset.jsonl", "tests/pytest/data/markdown_dataset.jsonl", ], - rollout_input_params=[{"temperature": 0.0}, {"temperature": 1.0}], - model=["dummy/local-model"], + completion_params=[ + {"temperature": 0.0, "model": "dummy/local-model"}, + {"temperature": 1.0, "model": "dummy/local-model"}, + ], dataset_adapter=markdown_dataset_to_evaluation_row, rollout_processor=default_no_op_rollout_processor, mode="pointwise", @@ -87,12 +89,15 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: "tests/pytest/data/markdown_dataset.jsonl", "tests/pytest/data/markdown_dataset.jsonl", ] - input_params_list = [{"temperature": 0.0}, {"temperature": 1.0}] + completion_params_list = [ + {"temperature": 0.0, "model": "dummy/local-model"}, + {"temperature": 1.0, "model": "dummy/local-model"}, + ] # Manually invoke all parameter combinations within a single test for ds_path in dataset_paths: - for params in input_params_list: - await eval_fn(model="dummy/local-model", dataset_path=[ds_path], input_params=params) + for params in completion_params_list: + await eval_fn(dataset_path=[ds_path], completion_params=params) # Assertions on IDs generated by the decorator logic assert len(unique_invocation_ids) == 1 diff --git a/tests/pytest/test_pytest_input_messages.py 
b/tests/pytest/test_pytest_input_messages.py index edb69b83..dc460aa5 100644 --- a/tests/pytest/test_pytest_input_messages.py +++ b/tests/pytest/test_pytest_input_messages.py @@ -10,7 +10,7 @@ Message(role="user", content="What is the capital of France?"), ] ], - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], rollout_processor=default_single_turn_rollout_processor, ) def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py index 3c18ff2b..158874f1 100644 --- a/tests/pytest/test_pytest_json_schema.py +++ b/tests/pytest/test_pytest_json_schema.py @@ -24,7 +24,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evaluation @evaluation_test( input_dataset=["tests/pytest/data/json_schema.jsonl"], - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", rollout_processor=default_single_turn_rollout_processor, dataset_adapter=json_schema_to_evaluation_row, diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py index afe74a4e..23010797 100644 --- a/tests/pytest/test_pytest_math_example.py +++ b/tests/pytest/test_pytest_math_example.py @@ -8,8 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_math_format_length.py 
b/tests/pytest/test_pytest_math_format_length.py index e51b062f..5bba5c0e 100644 --- a/tests/pytest/test_pytest_math_format_length.py +++ b/tests/pytest/test_pytest_math_format_length.py @@ -11,8 +11,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=gsm8k_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_pytest_mcp_config.py b/tests/pytest/test_pytest_mcp_config.py index c1b55d51..dde15aa9 100644 --- a/tests/pytest/test_pytest_mcp_config.py +++ b/tests/pytest/test_pytest_mcp_config.py @@ -20,7 +20,7 @@ ] ], rollout_processor=default_agent_rollout_processor, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-20b"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json", ) diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index 2a1c1cfc..01c06c45 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -1,4 +1,4 @@ -from eval_protocol.models import EvaluateResult, Message, EvaluationRow +from eval_protocol.models import EvaluateResult, EvaluationRow, Message from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test @@ -19,7 +19,7 @@ ] ], rollout_processor=default_agent_rollout_processor, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", ) diff --git 
a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py index b0c4850d..339c5152 100644 --- a/tests/pytest/test_pytest_word_count_example.py +++ b/tests/pytest/test_pytest_word_count_example.py @@ -8,8 +8,7 @@ @evaluation_test( input_dataset=["development/gsm8k_sample.jsonl"], dataset_adapter=word_count_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"], - rollout_input_params=[{"temperature": 0.0}], + completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.3, # Reasonable threshold for word count evaluation rollout_processor=default_single_turn_rollout_processor, diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index f5472092..0eeba626 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -10,7 +10,7 @@ from pathlib import Path from typing import Any, Dict, List -from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor from vendor.tau2.data_model.message import ( @@ -64,8 +64,14 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval @evaluation_test( input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], + completion_params=[ + { + "temperature": 0.8, + "max_tokens": 4096, + "extra_body": {"reasoning_effort": "low"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + } + ], 
rollout_processor=default_mcp_gym_rollout_processor, passed_threshold={"success": 0.4, "standard_deviation": 0.1}, num_runs=8, diff --git a/tests/test_models.py b/tests/test_models.py index 61c3b3c0..1358344b 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -4,7 +4,6 @@ import pytest from eval_protocol.models import ( # Added Message to existing import - CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, @@ -271,7 +270,7 @@ def test_evaluation_row_creation(): evaluation_result=evaluation_result, input_metadata=InputMetadata( row_id="math_001", - completion_params=CompletionParams(model="gpt-4"), + completion_params={"model": "gpt-4"}, dataset_info={"source": "math_eval"}, session_data={"timestamp": 1234567890}, ), @@ -322,7 +321,7 @@ def test_evaluation_row_serialization(): evaluation_result=evaluation_result, input_metadata=InputMetadata( row_id="test_123", - completion_params=CompletionParams(model="gpt-4"), + completion_params={"model": "gpt-4"}, dataset_info={"test": True}, session_data={"timestamp": 1234567890}, ), diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py index e96baabe..200f7ca8 100644 --- a/tests/test_tau_bench_airline_smoke.py +++ b/tests/test_tau_bench_airline_smoke.py @@ -65,8 +65,13 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> Lis @evaluation_test( input_dataset=["tests/pytest/data/airline_dataset.jsonl"], dataset_adapter=tau_bench_airline_smoke_to_evaluation_row, - model=["fireworks_ai/accounts/fireworks/models/gpt-oss-120b"], - rollout_input_params=[{"temperature": 0.8, "extra_body": {"reasoning_effort": "medium"}}], + completion_params=[ + { + "temperature": 0.8, + "extra_body": {"reasoning_effort": "medium"}, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + } + ], rollout_processor=default_mcp_gym_rollout_processor, passed_threshold=0.36, num_runs=1, # Smoke test: single run for quick feedback diff --git 
a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx index fdeaf03c..03412e61 100644 --- a/vite-app/src/components/EvaluationRow.tsx +++ b/vite-app/src/components/EvaluationRow.tsx @@ -231,7 +231,7 @@ export const EvaluationRow = observer( {/* Model */} - + {/* Score */} diff --git a/vite-app/src/types/eval-protocol.ts b/vite-app/src/types/eval-protocol.ts index b18697f1..f86ea058 100644 --- a/vite-app/src/types/eval-protocol.ts +++ b/vite-app/src/types/eval-protocol.ts @@ -54,16 +54,11 @@ export const EvaluateResultSchema = z.object({ final_control_plane_info: z.record(z.string(), z.any()).optional().describe('The final control plane state that led to termination.') }); -export const CompletionParamsSchema = z.object({ - model: z.string().describe('Model identifier (e.g., \'gpt-4.1\', \'fireworks/llama\')'), - temperature: z.number().optional().describe('Temperature setting for model generation'), - max_tokens: z.number().optional().describe('Maximum tokens to generate'), - max_tool_calls: z.number().optional().describe('Maximum tool calls per turn') -}); +export const CompletionParamsSchema = z.record(z.string(), z.any()); export const InputMetadataSchema = z.object({ row_id: z.string().optional().describe('Unique string to ID the row'), - completion_params: CompletionParamsSchema.optional().describe('Completion endpoint parameters used'), + completion_params: CompletionParamsSchema.describe('Completion endpoint parameters used'), dataset_info: z.record(z.string(), z.any()).optional().describe('Dataset row details: seed, system_prompt, environment_context, etc'), session_data: z.record(z.string(), z.any()).optional().describe('Session metadata like timestamp (input only, no duration/usage)') }).loose(); From 180417f91ced47de0a4fc0685467b3f410be821e Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 14:29:22 -0700 Subject: [PATCH 11/26] hotfix --- eval_protocol/dataset_logger/__init__.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py index 1caf6adc..ff1675f8 100644 --- a/eval_protocol/dataset_logger/__init__.py +++ b/eval_protocol/dataset_logger/__init__.py @@ -4,7 +4,7 @@ from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter # Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs. -if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() == "1": +if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1": default_logger = SqliteDatasetLoggerAdapter() else: From 1f2dadcdb64488b6adc090c5701e6c087792f26c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 14:57:36 -0700 Subject: [PATCH 12/26] vite build --- .../{index-dHlKwEPE.js => index-CpPWargc.js} | 30 +++++++++---------- ...-dHlKwEPE.js.map => index-CpPWargc.js.map} | 2 +- vite-app/dist/index.html | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) rename vite-app/dist/assets/{index-dHlKwEPE.js => index-CpPWargc.js} (84%) rename vite-app/dist/assets/{index-dHlKwEPE.js.map => index-CpPWargc.js.map} (62%) diff --git a/vite-app/dist/assets/index-dHlKwEPE.js b/vite-app/dist/assets/index-CpPWargc.js similarity index 84% rename from vite-app/dist/assets/index-dHlKwEPE.js rename to vite-app/dist/assets/index-CpPWargc.js index f29c690a..402c97cd 100644 --- a/vite-app/dist/assets/index-dHlKwEPE.js +++ b/vite-app/dist/assets/index-CpPWargc.js @@ -14,7 +14,7 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
- */var Im;function T_(){if(Im)return fe;Im=1;var n=Symbol.for("react.transitional.element"),l=Symbol.for("react.portal"),r=Symbol.for("react.fragment"),u=Symbol.for("react.strict_mode"),o=Symbol.for("react.profiler"),f=Symbol.for("react.consumer"),d=Symbol.for("react.context"),v=Symbol.for("react.forward_ref"),m=Symbol.for("react.suspense"),p=Symbol.for("react.memo"),b=Symbol.for("react.lazy"),x=Symbol.iterator;function O(S){return S===null||typeof S!="object"?null:(S=x&&S[x]||S["@@iterator"],typeof S=="function"?S:null)}var C={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},$=Object.assign,L={};function G(S,k,J){this.props=S,this.context=k,this.refs=L,this.updater=J||C}G.prototype.isReactComponent={},G.prototype.setState=function(S,k){if(typeof S!="object"&&typeof S!="function"&&S!=null)throw Error("takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,S,k,"setState")},G.prototype.forceUpdate=function(S){this.updater.enqueueForceUpdate(this,S,"forceUpdate")};function j(){}j.prototype=G.prototype;function H(S,k,J){this.props=S,this.context=k,this.refs=L,this.updater=J||C}var V=H.prototype=new j;V.constructor=H,$(V,G.prototype),V.isPureReactComponent=!0;var X=Array.isArray,K={H:null,A:null,T:null,S:null,V:null},ce=Object.prototype.hasOwnProperty;function pe(S,k,J,P,q,le){return J=le.ref,{$$typeof:n,type:S,key:k,ref:J!==void 0?J:null,props:le}}function we(S,k){return pe(S.type,k,void 0,void 0,void 0,S.props)}function ae(S){return typeof S=="object"&&S!==null&&S.$$typeof===n}function Ce(S){var k={"=":"=0",":":"=2"};return"$"+S.replace(/[=:]/g,function(J){return k[J]})}var Fe=/\/+/g;function Ve(S,k){return typeof S=="object"&&S!==null&&S.key!=null?Ce(""+S.key):k.toString(36)}function wt(){}function dn(S){switch(S.status){case"fulfilled":return S.value;case"rejected":throw S.reason;default:switch(typeof 
S.status=="string"?S.then(wt,wt):(S.status="pending",S.then(function(k){S.status==="pending"&&(S.status="fulfilled",S.value=k)},function(k){S.status==="pending"&&(S.status="rejected",S.reason=k)})),S.status){case"fulfilled":return S.value;case"rejected":throw S.reason}}throw S}function qe(S,k,J,P,q){var le=typeof S;(le==="undefined"||le==="boolean")&&(S=null);var ne=!1;if(S===null)ne=!0;else switch(le){case"bigint":case"string":case"number":ne=!0;break;case"object":switch(S.$$typeof){case n:case l:ne=!0;break;case b:return ne=S._init,qe(ne(S._payload),k,J,P,q)}}if(ne)return q=q(S),ne=P===""?"."+Ve(S,0):P,X(q)?(J="",ne!=null&&(J=ne.replace(Fe,"$&/")+"/"),qe(q,k,J,"",function(Ht){return Ht})):q!=null&&(ae(q)&&(q=we(q,J+(q.key==null||S&&S.key===q.key?"":(""+q.key).replace(Fe,"$&/")+"/")+ne)),k.push(q)),1;ne=0;var gt=P===""?".":P+":";if(X(S))for(var Ze=0;Ze>>1,S=D[be];if(0>>1;beo(P,ie))qo(le,P)?(D[be]=le,D[q]=ie,be=q):(D[be]=P,D[J]=ie,be=J);else if(qo(le,ie))D[be]=le,D[q]=ie,be=q;else break e}}return Q}function o(D,Q){var ie=D.sortIndex-Q.sortIndex;return ie!==0?ie:D.id-Q.id}if(n.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var f=performance;n.unstable_now=function(){return f.now()}}else{var d=Date,v=d.now();n.unstable_now=function(){return d.now()-v}}var m=[],p=[],b=1,x=null,O=3,C=!1,$=!1,L=!1,G=!1,j=typeof setTimeout=="function"?setTimeout:null,H=typeof clearTimeout=="function"?clearTimeout:null,V=typeof setImmediate<"u"?setImmediate:null;function X(D){for(var Q=r(p);Q!==null;){if(Q.callback===null)u(p);else if(Q.startTime<=D)u(p),Q.sortIndex=Q.expirationTime,l(m,Q);else break;Q=r(p)}}function K(D){if(L=!1,X(D),!$)if(r(m)!==null)$=!0,ce||(ce=!0,Ve());else{var Q=r(p);Q!==null&&qe(K,Q.startTime-D)}}var ce=!1,pe=-1,we=5,ae=-1;function Ce(){return G?!0:!(n.unstable_now()-aeD&&Ce());){var be=x.callback;if(typeof be=="function"){x.callback=null,O=x.priorityLevel;var S=be(x.expirationTime<=D);if(D=n.unstable_now(),typeof 
S=="function"){x.callback=S,X(D),Q=!0;break t}x===r(m)&&u(m),X(D)}else u(m);x=r(m)}if(x!==null)Q=!0;else{var k=r(p);k!==null&&qe(K,k.startTime-D),Q=!1}}break e}finally{x=null,O=ie,C=!1}Q=void 0}}finally{Q?Ve():ce=!1}}}var Ve;if(typeof V=="function")Ve=function(){V(Fe)};else if(typeof MessageChannel<"u"){var wt=new MessageChannel,dn=wt.port2;wt.port1.onmessage=Fe,Ve=function(){dn.postMessage(null)}}else Ve=function(){j(Fe,0)};function qe(D,Q){pe=j(function(){D(n.unstable_now())},Q)}n.unstable_IdlePriority=5,n.unstable_ImmediatePriority=1,n.unstable_LowPriority=4,n.unstable_NormalPriority=3,n.unstable_Profiling=null,n.unstable_UserBlockingPriority=2,n.unstable_cancelCallback=function(D){D.callback=null},n.unstable_forceFrameRate=function(D){0>D||125be?(D.sortIndex=ie,l(p,D),r(m)===null&&D===r(p)&&(L?(H(pe),pe=-1):L=!0,qe(K,ie-be))):(D.sortIndex=S,l(m,D),$||C||($=!0,ce||(ce=!0,Ve()))),D},n.unstable_shouldYield=Ce,n.unstable_wrapCallback=function(D){var Q=O;return function(){var ie=O;O=Q;try{return D.apply(this,arguments)}finally{O=ie}}}}(af)),af}var np;function z_(){return np||(np=1,nf.exports=R_()),nf.exports}var lf={exports:{}},ht={};/** + */var tp;function R_(){return tp||(tp=1,function(n){function l(D,Q){var ie=D.length;D.push(Q);e:for(;0>>1,S=D[be];if(0>>1;be<$;){var J=2*(be+1)-1,P=D[J],q=J+1,le=D[q];if(0>o(P,ie))qo(le,P)?(D[be]=le,D[q]=ie,be=q):(D[be]=P,D[J]=ie,be=J);else if(qo(le,ie))D[be]=le,D[q]=ie,be=q;else break e}}return Q}function o(D,Q){var ie=D.sortIndex-Q.sortIndex;return ie!==0?ie:D.id-Q.id}if(n.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var f=performance;n.unstable_now=function(){return f.now()}}else{var d=Date,v=d.now();n.unstable_now=function(){return d.now()-v}}var m=[],p=[],b=1,x=null,O=3,C=!1,k=!1,L=!1,G=!1,j=typeof setTimeout=="function"?setTimeout:null,H=typeof clearTimeout=="function"?clearTimeout:null,V=typeof setImmediate<"u"?setImmediate:null;function X(D){for(var 
Q=r(p);Q!==null;){if(Q.callback===null)u(p);else if(Q.startTime<=D)u(p),Q.sortIndex=Q.expirationTime,l(m,Q);else break;Q=r(p)}}function K(D){if(L=!1,X(D),!k)if(r(m)!==null)k=!0,ce||(ce=!0,He());else{var Q=r(p);Q!==null&&Ve(K,Q.startTime-D)}}var ce=!1,pe=-1,we=5,ae=-1;function Ce(){return G?!0:!(n.unstable_now()-aeD&&Ce());){var be=x.callback;if(typeof be=="function"){x.callback=null,O=x.priorityLevel;var S=be(x.expirationTime<=D);if(D=n.unstable_now(),typeof S=="function"){x.callback=S,X(D),Q=!0;break t}x===r(m)&&u(m),X(D)}else u(m);x=r(m)}if(x!==null)Q=!0;else{var $=r(p);$!==null&&Ve(K,$.startTime-D),Q=!1}}break e}finally{x=null,O=ie,C=!1}Q=void 0}}finally{Q?He():ce=!1}}}var He;if(typeof V=="function")He=function(){V(Fe)};else if(typeof MessageChannel<"u"){var At=new MessageChannel,dn=At.port2;At.port1.onmessage=Fe,He=function(){dn.postMessage(null)}}else He=function(){j(Fe,0)};function Ve(D,Q){pe=j(function(){D(n.unstable_now())},Q)}n.unstable_IdlePriority=5,n.unstable_ImmediatePriority=1,n.unstable_LowPriority=4,n.unstable_NormalPriority=3,n.unstable_Profiling=null,n.unstable_UserBlockingPriority=2,n.unstable_cancelCallback=function(D){D.callback=null},n.unstable_forceFrameRate=function(D){0>D||125be?(D.sortIndex=ie,l(p,D),r(m)===null&&D===r(p)&&(L?(H(pe),pe=-1):L=!0,Ve(K,ie-be))):(D.sortIndex=S,l(m,D),k||C||(k=!0,ce||(ce=!0,He()))),D},n.unstable_shouldYield=Ce,n.unstable_wrapCallback=function(D){var Q=O;return function(){var ie=O;O=Q;try{return D.apply(this,arguments)}finally{O=ie}}}}(af)),af}var np;function z_(){return np||(np=1,nf.exports=R_()),nf.exports}var lf={exports:{}},dt={};/** * @license React * react-dom.production.js * @@ -30,7 +30,7 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
- */var ap;function N_(){if(ap)return ht;ap=1;var n=Eo();function l(m){var p="https://react.dev/errors/"+m;if(1"u"||typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE!="function"))try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(n)}catch(l){console.error(l)}}return n(),lf.exports=N_(),lf.exports}/** + */var ap;function N_(){if(ap)return dt;ap=1;var n=Eo();function l(m){var p="https://react.dev/errors/"+m;if(1"u"||typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE!="function"))try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(n)}catch(l){console.error(l)}}return n(),lf.exports=N_(),lf.exports}/** * @license React * react-dom-client.production.js * @@ -38,15 +38,15 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. - */var ip;function j_(){if(ip)return rr;ip=1;var n=z_(),l=Eo(),r=pg();function u(e){var t="https://react.dev/errors/"+e;if(1S||(e.current=be[S],be[S]=null,S--)}function P(e,t){S++,be[S]=e.current,e.current=t}var q=k(null),le=k(null),ne=k(null),gt=k(null);function Ze(e,t){switch(P(ne,t),P(le,e),P(q,null),t.nodeType){case 9:case 11:e=(e=t.documentElement)&&(e=e.namespaceURI)?wm(e):0;break;default:if(e=t.tagName,t=t.namespaceURI)t=wm(t),e=Am(t,e);else switch(e){case"svg":e=1;break;case"math":e=2;break;default:e=0}}J(q),P(q,e)}function Ht(){J(q),J(le),J(ne)}function ri(e){e.memoizedState!==null&&P(gt,e);var t=q.current,a=Am(t,e.type);t!==a&&(P(le,e),P(q,a))}function rl(e){le.current===e&&(J(q),J(le)),gt.current===e&&(J(gt),er._currentValue=ie)}var Fn=Object.prototype.hasOwnProperty,Wn=n.unstable_scheduleCallback,ui=n.unstable_cancelCallback,ld=n.unstable_shouldYield,lb=n.unstable_requestPaint,hn=n.unstable_now,ib=n.unstable_getCurrentPriorityLevel,id=n.unstable_ImmediatePriority,rd=n.unstable_UserBlockingPriority,Rr=n.unstable_NormalPriority,rb=n.unstable_LowPriority,ud=n.unstable_IdlePriority,ub=n.log,ob=n.unstable_setDisableYieldValue,oi=null,At=null;function In(e){if(typeof 
ub=="function"&&ob(e),At&&typeof At.setStrictMode=="function")try{At.setStrictMode(oi,e)}catch{}}var Tt=Math.clz32?Math.clz32:fb,sb=Math.log,cb=Math.LN2;function fb(e){return e>>>=0,e===0?32:31-(sb(e)/cb|0)|0}var zr=256,Nr=4194304;function za(e){var t=e&42;if(t!==0)return t;switch(e&-e){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return e&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return e&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 0;default:return e}}function jr(e,t,a){var i=e.pendingLanes;if(i===0)return 0;var s=0,c=e.suspendedLanes,h=e.pingedLanes;e=e.warmLanes;var g=i&134217727;return g!==0?(i=g&~c,i!==0?s=za(i):(h&=g,h!==0?s=za(h):a||(a=g&~e,a!==0&&(s=za(a))))):(g=i&~c,g!==0?s=za(g):h!==0?s=za(h):a||(a=i&~e,a!==0&&(s=za(a)))),s===0?0:t!==0&&t!==s&&(t&c)===0&&(c=s&-s,a=t&-t,c>=a||c===32&&(a&4194048)!==0)?t:s}function si(e,t){return(e.pendingLanes&~(e.suspendedLanes&~e.pingedLanes)&t)===0}function db(e,t){switch(e){case 1:case 2:case 4:case 8:case 64:return t+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return t+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function od(){var e=zr;return zr<<=1,(zr&4194048)===0&&(zr=256),e}function sd(){var e=Nr;return Nr<<=1,(Nr&62914560)===0&&(Nr=4194304),e}function Vo(e){for(var t=[],a=0;31>a;a++)t.push(e);return t}function 
ci(e,t){e.pendingLanes|=t,t!==268435456&&(e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0)}function hb(e,t,a,i,s,c){var h=e.pendingLanes;e.pendingLanes=a,e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0,e.expiredLanes&=a,e.entangledLanes&=a,e.errorRecoveryDisabledLanes&=a,e.shellSuspendCounter=0;var g=e.entanglements,_=e.expirationTimes,R=e.hiddenUpdates;for(a=h&~a;0S||(e.current=be[S],be[S]=null,S--)}function P(e,t){S++,be[S]=e.current,e.current=t}var q=$(null),le=$(null),ne=$(null),gt=$(null);function Ze(e,t){switch(P(ne,t),P(le,e),P(q,null),t.nodeType){case 9:case 11:e=(e=t.documentElement)&&(e=e.namespaceURI)?wm(e):0;break;default:if(e=t.tagName,t=t.namespaceURI)t=wm(t),e=Am(t,e);else switch(e){case"svg":e=1;break;case"math":e=2;break;default:e=0}}J(q),P(q,e)}function Ht(){J(q),J(le),J(ne)}function ri(e){e.memoizedState!==null&&P(gt,e);var t=q.current,a=Am(t,e.type);t!==a&&(P(le,e),P(q,a))}function rl(e){le.current===e&&(J(q),J(le)),gt.current===e&&(J(gt),er._currentValue=ie)}var Fn=Object.prototype.hasOwnProperty,Wn=n.unstable_scheduleCallback,ui=n.unstable_cancelCallback,ld=n.unstable_shouldYield,lb=n.unstable_requestPaint,hn=n.unstable_now,ib=n.unstable_getCurrentPriorityLevel,id=n.unstable_ImmediatePriority,rd=n.unstable_UserBlockingPriority,Rr=n.unstable_NormalPriority,rb=n.unstable_LowPriority,ud=n.unstable_IdlePriority,ub=n.log,ob=n.unstable_setDisableYieldValue,oi=null,Tt=null;function In(e){if(typeof ub=="function"&&ob(e),Tt&&typeof Tt.setStrictMode=="function")try{Tt.setStrictMode(oi,e)}catch{}}var Rt=Math.clz32?Math.clz32:fb,sb=Math.log,cb=Math.LN2;function fb(e){return e>>>=0,e===0?32:31-(sb(e)/cb|0)|0}var zr=256,Nr=4194304;function za(e){var t=e&42;if(t!==0)return t;switch(e&-e){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 
1048576:case 2097152:return e&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return e&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 0;default:return e}}function jr(e,t,a){var i=e.pendingLanes;if(i===0)return 0;var s=0,c=e.suspendedLanes,h=e.pingedLanes;e=e.warmLanes;var g=i&134217727;return g!==0?(i=g&~c,i!==0?s=za(i):(h&=g,h!==0?s=za(h):a||(a=g&~e,a!==0&&(s=za(a))))):(g=i&~c,g!==0?s=za(g):h!==0?s=za(h):a||(a=i&~e,a!==0&&(s=za(a)))),s===0?0:t!==0&&t!==s&&(t&c)===0&&(c=s&-s,a=t&-t,c>=a||c===32&&(a&4194048)!==0)?t:s}function si(e,t){return(e.pendingLanes&~(e.suspendedLanes&~e.pingedLanes)&t)===0}function db(e,t){switch(e){case 1:case 2:case 4:case 8:case 64:return t+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return t+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function od(){var e=zr;return zr<<=1,(zr&4194048)===0&&(zr=256),e}function sd(){var e=Nr;return Nr<<=1,(Nr&62914560)===0&&(Nr=4194304),e}function Vo(e){for(var t=[],a=0;31>a;a++)t.push(e);return t}function ci(e,t){e.pendingLanes|=t,t!==268435456&&(e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0)}function hb(e,t,a,i,s,c){var h=e.pendingLanes;e.pendingLanes=a,e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0,e.expiredLanes&=a,e.entangledLanes&=a,e.errorRecoveryDisabledLanes&=a,e.shellSuspendCounter=0;var g=e.entanglements,_=e.expirationTimes,R=e.hiddenUpdates;for(a=h&~a;0)":-1s||_[i]!==R[s]){var M=` `+_[i].replace(" at new "," at ");return e.displayName&&M.includes("")&&(M=M.replace("",e.displayName)),M}while(1<=i&&0<=s);break}}}finally{Qo=!1,Error.prepareStackTrace=a}return(a=e?e.displayName||e.name:"")?dl(a):""}function 
bb(e){switch(e.tag){case 26:case 27:case 5:return dl(e.type);case 16:return dl("Lazy");case 13:return dl("Suspense");case 19:return dl("SuspenseList");case 0:case 15:return Po(e.type,!1);case 11:return Po(e.type.render,!1);case 1:return Po(e.type,!0);case 31:return dl("Activity");default:return""}}function bd(e){try{var t="";do t+=bb(e),e=e.return;while(e);return t}catch(a){return` Error generating stack: `+a.message+` -`+a.stack}}function Vt(e){switch(typeof e){case"bigint":case"boolean":case"number":case"string":case"undefined":return e;case"object":return e;default:return""}}function _d(e){var t=e.type;return(e=e.nodeName)&&e.toLowerCase()==="input"&&(t==="checkbox"||t==="radio")}function _b(e){var t=_d(e)?"checked":"value",a=Object.getOwnPropertyDescriptor(e.constructor.prototype,t),i=""+e[t];if(!e.hasOwnProperty(t)&&typeof a<"u"&&typeof a.get=="function"&&typeof a.set=="function"){var s=a.get,c=a.set;return Object.defineProperty(e,t,{configurable:!0,get:function(){return s.call(this)},set:function(h){i=""+h,c.call(this,h)}}),Object.defineProperty(e,t,{enumerable:a.enumerable}),{getValue:function(){return i},setValue:function(h){i=""+h},stopTracking:function(){e._valueTracker=null,delete e[t]}}}}function Mr(e){e._valueTracker||(e._valueTracker=_b(e))}function Sd(e){if(!e)return!1;var t=e._valueTracker;if(!t)return!0;var a=t.getValue(),i="";return e&&(i=_d(e)?e.checked?"true":"false":e.value),e=i,e!==a?(t.setValue(e),!0):!1}function Ur(e){if(e=e||(typeof document<"u"?document:void 0),typeof e>"u")return null;try{return e.activeElement||e.body}catch{return e.body}}var Sb=/[\n"\\]/g;function qt(e){return e.replace(Sb,function(t){return"\\"+t.charCodeAt(0).toString(16)+" "})}function Jo(e,t,a,i,s,c,h,g){e.name="",h!=null&&typeof h!="function"&&typeof h!="symbol"&&typeof 
h!="boolean"?e.type=h:e.removeAttribute("type"),t!=null?h==="number"?(t===0&&e.value===""||e.value!=t)&&(e.value=""+Vt(t)):e.value!==""+Vt(t)&&(e.value=""+Vt(t)):h!=="submit"&&h!=="reset"||e.removeAttribute("value"),t!=null?Fo(e,h,Vt(t)):a!=null?Fo(e,h,Vt(a)):i!=null&&e.removeAttribute("value"),s==null&&c!=null&&(e.defaultChecked=!!c),s!=null&&(e.checked=s&&typeof s!="function"&&typeof s!="symbol"),g!=null&&typeof g!="function"&&typeof g!="symbol"&&typeof g!="boolean"?e.name=""+Vt(g):e.removeAttribute("name")}function xd(e,t,a,i,s,c,h,g){if(c!=null&&typeof c!="function"&&typeof c!="symbol"&&typeof c!="boolean"&&(e.type=c),t!=null||a!=null){if(!(c!=="submit"&&c!=="reset"||t!=null))return;a=a!=null?""+Vt(a):"",t=t!=null?""+Vt(t):a,g||t===e.value||(e.value=t),e.defaultValue=t}i=i??s,i=typeof i!="function"&&typeof i!="symbol"&&!!i,e.checked=g?e.checked:!!i,e.defaultChecked=!!i,h!=null&&typeof h!="function"&&typeof h!="symbol"&&typeof h!="boolean"&&(e.name=h)}function Fo(e,t,a){t==="number"&&Ur(e.ownerDocument)===e||e.defaultValue===""+a||(e.defaultValue=""+a)}function hl(e,t,a,i){if(e=e.options,t){t={};for(var s=0;s"u"||typeof window.document>"u"||typeof window.document.createElement>"u"),ns=!1;if(zn)try{var vi={};Object.defineProperty(vi,"passive",{get:function(){ns=!0}}),window.addEventListener("test",vi,vi),window.removeEventListener("test",vi,vi)}catch{ns=!1}var ta=null,as=null,Br=null;function zd(){if(Br)return Br;var e,t=as,a=t.length,i,s="value"in ta?ta.value:ta.textContent,c=s.length;for(e=0;e=gi),Ud=" ",Zd=!1;function Bd(e,t){switch(e){case"keyup":return Pb.indexOf(t.keyCode)!==-1;case"keydown":return t.keyCode!==229;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function Ld(e){return e=e.detail,typeof e=="object"&&"data"in e?e.data:null}var gl=!1;function Fb(e,t){switch(e){case"compositionend":return Ld(t);case"keypress":return t.which!==32?null:(Zd=!0,Ud);case"textInput":return e=t.data,e===Ud&&Zd?null:e;default:return 
null}}function Wb(e,t){if(gl)return e==="compositionend"||!os&&Bd(e,t)?(e=zd(),Br=as=ta=null,gl=!1,e):null;switch(e){case"paste":return null;case"keypress":if(!(t.ctrlKey||t.altKey||t.metaKey)||t.ctrlKey&&t.altKey){if(t.char&&1=t)return{node:a,offset:t-e};e=i}e:{for(;a;){if(a.nextSibling){a=a.nextSibling;break e}a=a.parentNode}a=void 0}a=Xd(a)}}function Qd(e,t){return e&&t?e===t?!0:e&&e.nodeType===3?!1:t&&t.nodeType===3?Qd(e,t.parentNode):"contains"in e?e.contains(t):e.compareDocumentPosition?!!(e.compareDocumentPosition(t)&16):!1:!1}function Pd(e){e=e!=null&&e.ownerDocument!=null&&e.ownerDocument.defaultView!=null?e.ownerDocument.defaultView:window;for(var t=Ur(e.document);t instanceof e.HTMLIFrameElement;){try{var a=typeof t.contentWindow.location.href=="string"}catch{a=!1}if(a)e=t.contentWindow;else break;t=Ur(e.document)}return t}function fs(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return t&&(t==="input"&&(e.type==="text"||e.type==="search"||e.type==="tel"||e.type==="url"||e.type==="password")||t==="textarea"||e.contentEditable==="true")}var r0=zn&&"documentMode"in document&&11>=document.documentMode,yl=null,ds=null,Si=null,hs=!1;function Jd(e,t,a){var i=a.window===a?a.document:a.nodeType===9?a:a.ownerDocument;hs||yl==null||yl!==Ur(i)||(i=yl,"selectionStart"in i&&fs(i)?i={start:i.selectionStart,end:i.selectionEnd}:(i=(i.ownerDocument&&i.ownerDocument.defaultView||window).getSelection(),i={anchorNode:i.anchorNode,anchorOffset:i.anchorOffset,focusNode:i.focusNode,focusOffset:i.focusOffset}),Si&&_i(Si,i)||(Si=i,i=Tu(ds,"onSelect"),0>=h,s-=h,jn=1<<32-Tt(t)+s|a<c?c:8;var h=D.T,g={};D.T=g,Ws(e,!1,t,a);try{var _=s(),R=D.S;if(R!==null&&R(g,_),_!==null&&typeof _=="object"&&typeof _.then=="function"){var M=m0(_,i);Zi(e,t,M,Dt(e))}else Zi(e,t,i,Dt(e))}catch(B){Zi(e,t,{then:function(){},status:"rejected",reason:B},Dt())}finally{Q.p=c,D.T=h}}function _0(){}function Js(e,t,a,i){if(e.tag!==5)throw Error(u(476));var 
s=Fh(e).queue;Jh(e,s,t,ie,a===null?_0:function(){return Wh(e),a(i)})}function Fh(e){var t=e.memoizedState;if(t!==null)return t;t={memoizedState:ie,baseState:ie,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Un,lastRenderedState:ie},next:null};var a={};return t.next={memoizedState:a,baseState:a,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Un,lastRenderedState:a},next:null},e.memoizedState=t,e=e.alternate,e!==null&&(e.memoizedState=t),t}function Wh(e){var t=Fh(e).next.queue;Zi(e,t,{},Dt())}function Fs(){return dt(er)}function Ih(){return Ie().memoizedState}function ev(){return Ie().memoizedState}function S0(e){for(var t=e.return;t!==null;){switch(t.tag){case 24:case 3:var a=Dt();e=la(a);var i=ia(t,e,a);i!==null&&(Mt(i,t,a),Ni(i,t,a)),t={cache:Ts()},e.payload=t;return}t=t.return}}function x0(e,t,a){var i=Dt();a={lane:i,revertLane:0,action:a,hasEagerState:!1,eagerState:null,next:null},uu(e)?nv(t,a):(a=gs(e,t,a,i),a!==null&&(Mt(a,e,i),av(a,t,i)))}function tv(e,t,a){var i=Dt();Zi(e,t,a,i)}function Zi(e,t,a,i){var s={lane:i,revertLane:0,action:a,hasEagerState:!1,eagerState:null,next:null};if(uu(e))nv(t,s);else{var c=e.alternate;if(e.lanes===0&&(c===null||c.lanes===0)&&(c=t.lastRenderedReducer,c!==null))try{var h=t.lastRenderedState,g=c(h,a);if(s.hasEagerState=!0,s.eagerState=g,Rt(g,h))return Gr(e,t,s,0),De===null&&qr(),!1}catch{}finally{}if(a=gs(e,t,s,i),a!==null)return Mt(a,e,i),av(a,t,i),!0}return!1}function Ws(e,t,a,i){if(i={lane:2,revertLane:Nc(),action:i,hasEagerState:!1,eagerState:null,next:null},uu(e)){if(t)throw Error(u(479))}else t=gs(e,a,i,2),t!==null&&Mt(t,e,2)}function uu(e){var t=e.alternate;return e===de||t!==null&&t===de}function nv(e,t){Rl=tu=!0;var a=e.pending;a===null?t.next=t:(t.next=a.next,a.next=t),e.pending=t}function av(e,t,a){if((a&4194048)!==0){var i=t.lanes;i&=e.pendingLanes,a|=i,t.lanes=a,fd(e,a)}}var 
ou={readContext:dt,use:au,useCallback:Pe,useContext:Pe,useEffect:Pe,useImperativeHandle:Pe,useLayoutEffect:Pe,useInsertionEffect:Pe,useMemo:Pe,useReducer:Pe,useRef:Pe,useState:Pe,useDebugValue:Pe,useDeferredValue:Pe,useTransition:Pe,useSyncExternalStore:Pe,useId:Pe,useHostTransitionStatus:Pe,useFormState:Pe,useActionState:Pe,useOptimistic:Pe,useMemoCache:Pe,useCacheRefresh:Pe},lv={readContext:dt,use:au,useCallback:function(e,t){return _t().memoizedState=[e,t===void 0?null:t],e},useContext:dt,useEffect:Hh,useImperativeHandle:function(e,t,a){a=a!=null?a.concat([e]):null,ru(4194308,4,Yh.bind(null,t,e),a)},useLayoutEffect:function(e,t){return ru(4194308,4,e,t)},useInsertionEffect:function(e,t){ru(4,2,e,t)},useMemo:function(e,t){var a=_t();t=t===void 0?null:t;var i=e();if(Va){In(!0);try{e()}finally{In(!1)}}return a.memoizedState=[i,t],i},useReducer:function(e,t,a){var i=_t();if(a!==void 0){var s=a(t);if(Va){In(!0);try{a(t)}finally{In(!1)}}}else s=t;return i.memoizedState=i.baseState=s,e={pending:null,lanes:0,dispatch:null,lastRenderedReducer:e,lastRenderedState:s},i.queue=e,e=e.dispatch=x0.bind(null,de,e),[i.memoizedState,e]},useRef:function(e){var t=_t();return e={current:e},t.memoizedState=e},useState:function(e){e=Xs(e);var t=e.queue,a=tv.bind(null,de,t);return t.dispatch=a,[e.memoizedState,a]},useDebugValue:Qs,useDeferredValue:function(e,t){var a=_t();return Ps(a,e,t)},useTransition:function(){var e=Xs(!1);return e=Jh.bind(null,de,e.queue,!0,!1),_t().memoizedState=e,[!1,e]},useSyncExternalStore:function(e,t,a){var i=de,s=_t();if(xe){if(a===void 0)throw Error(u(407));a=a()}else{if(a=t(),De===null)throw Error(u(349));(ge&124)!==0||wh(i,t,a)}s.memoizedState=a;var c={value:a,getSnapshot:t};return s.queue=c,Hh(Th.bind(null,i,c,e),[e]),i.flags|=2048,Nl(9,iu(),Ah.bind(null,i,c,a,t),null),a},useId:function(){var e=_t(),t=De.identifierPrefix;if(xe){var a=Cn,i=jn;a=(i&~(1<<32-Tt(i)-1)).toString(32)+a,t="«"+t+"R"+a,a=nu++,0ue?(it=te,te=null):it=te.sibling;var 
_e=z(w,te,T[ue],Z);if(_e===null){te===null&&(te=it);break}e&&te&&_e.alternate===null&&t(w,te),E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e,te=it}if(ue===T.length)return a(w,te),xe&&Za(w,ue),F;if(te===null){for(;ueue?(it=te,te=null):it=te.sibling;var xa=z(w,te,_e.value,Z);if(xa===null){te===null&&(te=it);break}e&&te&&xa.alternate===null&&t(w,te),E=c(xa,E,ue),he===null?F=xa:he.sibling=xa,he=xa,te=it}if(_e.done)return a(w,te),xe&&Za(w,ue),F;if(te===null){for(;!_e.done;ue++,_e=T.next())_e=B(w,_e.value,Z),_e!==null&&(E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e);return xe&&Za(w,ue),F}for(te=i(te);!_e.done;ue++,_e=T.next())_e=N(te,w,ue,_e.value,Z),_e!==null&&(e&&_e.alternate!==null&&te.delete(_e.key===null?ue:_e.key),E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e);return e&&te.forEach(function(O_){return t(w,O_)}),xe&&Za(w,ue),F}function Re(w,E,T,Z){if(typeof T=="object"&&T!==null&&T.type===$&&T.key===null&&(T=T.props.children),typeof T=="object"&&T!==null){switch(T.$$typeof){case O:e:{for(var F=T.key;E!==null;){if(E.key===F){if(F=T.type,F===$){if(E.tag===7){a(w,E.sibling),Z=s(E,T.props.children),Z.return=w,w=Z;break e}}else if(E.elementType===F||typeof F=="object"&&F!==null&&F.$$typeof===we&&rv(F)===E.type){a(w,E.sibling),Z=s(E,T.props),Li(Z,T),Z.return=w,w=Z;break e}a(w,E);break}else t(w,E);E=E.sibling}T.type===$?(Z=Ma(T.props.children,w.mode,Z,T.key),Z.return=w,w=Z):(Z=Xr(T.type,T.key,T.props,null,w.mode,Z),Li(Z,T),Z.return=w,w=Z)}return h(w);case C:e:{for(F=T.key;E!==null;){if(E.key===F)if(E.tag===4&&E.stateNode.containerInfo===T.containerInfo&&E.stateNode.implementation===T.implementation){a(w,E.sibling),Z=s(E,T.children||[]),Z.return=w,w=Z;break e}else{a(w,E);break}else t(w,E);E=E.sibling}Z=_s(T,w.mode,Z),Z.return=w,w=Z}return h(w);case we:return F=T._init,T=F(T._payload),Re(w,E,T,Z)}if(qe(T))return oe(w,E,T,Z);if(Ve(T)){if(F=Ve(T),typeof F!="function")throw Error(u(150));return T=F.call(T),re(w,E,T,Z)}if(typeof T.then=="function")return 
Re(w,E,su(T),Z);if(T.$$typeof===V)return Re(w,E,Jr(w,T),Z);cu(w,T)}return typeof T=="string"&&T!==""||typeof T=="number"||typeof T=="bigint"?(T=""+T,E!==null&&E.tag===6?(a(w,E.sibling),Z=s(E,T),Z.return=w,w=Z):(a(w,E),Z=bs(T,w.mode,Z),Z.return=w,w=Z),h(w)):a(w,E)}return function(w,E,T,Z){try{Bi=0;var F=Re(w,E,T,Z);return jl=null,F}catch(te){if(te===Ri||te===Wr)throw te;var he=zt(29,te,null,w.mode);return he.lanes=Z,he.return=w,he}finally{}}}var Cl=uv(!0),ov=uv(!1),Qt=k(null),mn=null;function ua(e){var t=e.alternate;P(tt,tt.current&1),P(Qt,e),mn===null&&(t===null||Tl.current!==null||t.memoizedState!==null)&&(mn=e)}function sv(e){if(e.tag===22){if(P(tt,tt.current),P(Qt,e),mn===null){var t=e.alternate;t!==null&&t.memoizedState!==null&&(mn=e)}}else oa()}function oa(){P(tt,tt.current),P(Qt,Qt.current)}function Zn(e){J(Qt),mn===e&&(mn=null),J(tt)}var tt=k(0);function fu(e){for(var t=e;t!==null;){if(t.tag===13){var a=t.memoizedState;if(a!==null&&(a=a.dehydrated,a===null||a.data==="$?"||Vc(a)))return t}else if(t.tag===19&&t.memoizedProps.revealOrder!==void 0){if((t.flags&128)!==0)return t}else if(t.child!==null){t.child.return=t,t=t.child;continue}if(t===e)break;for(;t.sibling===null;){if(t.return===null||t.return===e)return null;t=t.return}t.sibling.return=t.return,t=t.sibling}return null}function Is(e,t,a,i){t=e.memoizedState,a=a(i,t),a=a==null?t:b({},t,a),e.memoizedState=a,e.lanes===0&&(e.updateQueue.baseState=a)}var ec={enqueueSetState:function(e,t,a){e=e._reactInternals;var i=Dt(),s=la(i);s.payload=t,a!=null&&(s.callback=a),t=ia(e,s,i),t!==null&&(Mt(t,e,i),Ni(t,e,i))},enqueueReplaceState:function(e,t,a){e=e._reactInternals;var i=Dt(),s=la(i);s.tag=1,s.payload=t,a!=null&&(s.callback=a),t=ia(e,s,i),t!==null&&(Mt(t,e,i),Ni(t,e,i))},enqueueForceUpdate:function(e,t){e=e._reactInternals;var a=Dt(),i=la(a);i.tag=2,t!=null&&(i.callback=t),t=ia(e,i,a),t!==null&&(Mt(t,e,a),Ni(t,e,a))}};function cv(e,t,a,i,s,c,h){return e=e.stateNode,typeof 
e.shouldComponentUpdate=="function"?e.shouldComponentUpdate(i,c,h):t.prototype&&t.prototype.isPureReactComponent?!_i(a,i)||!_i(s,c):!0}function fv(e,t,a,i){e=t.state,typeof t.componentWillReceiveProps=="function"&&t.componentWillReceiveProps(a,i),typeof t.UNSAFE_componentWillReceiveProps=="function"&&t.UNSAFE_componentWillReceiveProps(a,i),t.state!==e&&ec.enqueueReplaceState(t,t.state,null)}function qa(e,t){var a=t;if("ref"in t){a={};for(var i in t)i!=="ref"&&(a[i]=t[i])}if(e=e.defaultProps){a===t&&(a=b({},a));for(var s in e)a[s]===void 0&&(a[s]=e[s])}return a}var du=typeof reportError=="function"?reportError:function(e){if(typeof window=="object"&&typeof window.ErrorEvent=="function"){var t=new window.ErrorEvent("error",{bubbles:!0,cancelable:!0,message:typeof e=="object"&&e!==null&&typeof e.message=="string"?String(e.message):String(e),error:e});if(!window.dispatchEvent(t))return}else if(typeof process=="object"&&typeof process.emit=="function"){process.emit("uncaughtException",e);return}console.error(e)};function dv(e){du(e)}function hv(e){console.error(e)}function vv(e){du(e)}function hu(e,t){try{var a=e.onUncaughtError;a(t.value,{componentStack:t.stack})}catch(i){setTimeout(function(){throw i})}}function mv(e,t,a){try{var i=e.onCaughtError;i(a.value,{componentStack:a.stack,errorBoundary:t.tag===1?t.stateNode:null})}catch(s){setTimeout(function(){throw s})}}function tc(e,t,a){return a=la(a),a.tag=3,a.payload={element:null},a.callback=function(){hu(e,t)},a}function pv(e){return e=la(e),e.tag=3,e}function gv(e,t,a,i){var s=a.type.getDerivedStateFromError;if(typeof s=="function"){var c=i.value;e.payload=function(){return s(c)},e.callback=function(){mv(t,a,i)}}var h=a.stateNode;h!==null&&typeof h.componentDidCatch=="function"&&(e.callback=function(){mv(t,a,i),typeof s!="function"&&(va===null?va=new Set([this]):va.add(this));var g=i.stack;this.componentDidCatch(i.value,{componentStack:g!==null?g:""})})}function O0(e,t,a,i,s){if(a.flags|=32768,i!==null&&typeof 
i=="object"&&typeof i.then=="function"){if(t=a.alternate,t!==null&&wi(t,a,s,!0),a=Qt.current,a!==null){switch(a.tag){case 13:return mn===null?wc():a.alternate===null&&Ke===0&&(Ke=3),a.flags&=-257,a.flags|=65536,a.lanes=s,i===Ns?a.flags|=16384:(t=a.updateQueue,t===null?a.updateQueue=new Set([i]):t.add(i),Tc(e,i,s)),!1;case 22:return a.flags|=65536,i===Ns?a.flags|=16384:(t=a.updateQueue,t===null?(t={transitions:null,markerInstances:null,retryQueue:new Set([i])},a.updateQueue=t):(a=t.retryQueue,a===null?t.retryQueue=new Set([i]):a.add(i)),Tc(e,i,s)),!1}throw Error(u(435,a.tag))}return Tc(e,i,s),wc(),!1}if(xe)return t=Qt.current,t!==null?((t.flags&65536)===0&&(t.flags|=256),t.flags|=65536,t.lanes=s,i!==Es&&(e=Error(u(422),{cause:i}),Oi(Gt(e,a)))):(i!==Es&&(t=Error(u(423),{cause:i}),Oi(Gt(t,a))),e=e.current.alternate,e.flags|=65536,s&=-s,e.lanes|=s,i=Gt(i,a),s=tc(e.stateNode,i,s),Ds(e,s),Ke!==4&&(Ke=2)),!1;var c=Error(u(520),{cause:i});if(c=Gt(c,a),Yi===null?Yi=[c]:Yi.push(c),Ke!==4&&(Ke=2),t===null)return!0;i=Gt(i,a),a=t;do{switch(a.tag){case 3:return a.flags|=65536,e=s&-s,a.lanes|=e,e=tc(a.stateNode,i,e),Ds(a,e),!1;case 1:if(t=a.type,c=a.stateNode,(a.flags&128)===0&&(typeof t.getDerivedStateFromError=="function"||c!==null&&typeof c.componentDidCatch=="function"&&(va===null||!va.has(c))))return a.flags|=65536,s&=-s,a.lanes|=s,s=pv(s),gv(s,e,a,i),Ds(a,s),!1}a=a.return}while(a!==null);return!1}var yv=Error(u(461)),at=!1;function ut(e,t,a,i){t.child=e===null?ov(t,null,a,i):Cl(t,e.child,a,i)}function bv(e,t,a,i,s){a=a.render;var c=t.ref;if("ref"in i){var h={};for(var g in i)g!=="ref"&&(h[g]=i[g])}else h=i;return $a(t),i=Ls(e,t,a,h,c,s),g=ks(),e!==null&&!at?($s(e,t,s),Bn(e,t,s)):(xe&&g&&Ss(t),t.flags|=1,ut(e,t,i,s),t.child)}function _v(e,t,a,i,s){if(e===null){var c=a.type;return typeof c=="function"&&!ys(c)&&c.defaultProps===void 0&&a.compare===null?(t.tag=15,t.type=c,Sv(e,t,c,i,s)):(e=Xr(a.type,null,i,t,t.mode,s),e.ref=t.ref,e.return=t,t.child=e)}if(c=e.child,!sc(e,s)){var 
h=c.memoizedProps;if(a=a.compare,a=a!==null?a:_i,a(h,i)&&e.ref===t.ref)return Bn(e,t,s)}return t.flags|=1,e=Nn(c,i),e.ref=t.ref,e.return=t,t.child=e}function Sv(e,t,a,i,s){if(e!==null){var c=e.memoizedProps;if(_i(c,i)&&e.ref===t.ref)if(at=!1,t.pendingProps=i=c,sc(e,s))(e.flags&131072)!==0&&(at=!0);else return t.lanes=e.lanes,Bn(e,t,s)}return nc(e,t,a,i,s)}function xv(e,t,a){var i=t.pendingProps,s=i.children,c=e!==null?e.memoizedState:null;if(i.mode==="hidden"){if((t.flags&128)!==0){if(i=c!==null?c.baseLanes|a:a,e!==null){for(s=t.child=e.child,c=0;s!==null;)c=c|s.lanes|s.childLanes,s=s.sibling;t.childLanes=c&~i}else t.childLanes=0,t.child=null;return Ev(e,t,i,a)}if((a&536870912)!==0)t.memoizedState={baseLanes:0,cachePool:null},e!==null&&Fr(t,c!==null?c.cachePool:null),c!==null?Sh(t,c):Us(),sv(t);else return t.lanes=t.childLanes=536870912,Ev(e,t,c!==null?c.baseLanes|a:a,a)}else c!==null?(Fr(t,c.cachePool),Sh(t,c),oa(),t.memoizedState=null):(e!==null&&Fr(t,null),Us(),oa());return ut(e,t,s,a),t.child}function Ev(e,t,a,i){var s=zs();return s=s===null?null:{parent:et._currentValue,pool:s},t.memoizedState={baseLanes:a,cachePool:s},e!==null&&Fr(t,null),Us(),sv(t),e!==null&&wi(e,t,i,!0),null}function vu(e,t){var a=t.ref;if(a===null)e!==null&&e.ref!==null&&(t.flags|=4194816);else{if(typeof a!="function"&&typeof a!="object")throw Error(u(284));(e===null||e.ref!==a)&&(t.flags|=4194816)}}function nc(e,t,a,i,s){return $a(t),a=Ls(e,t,a,i,void 0,s),i=ks(),e!==null&&!at?($s(e,t,s),Bn(e,t,s)):(xe&&i&&Ss(t),t.flags|=1,ut(e,t,a,s),t.child)}function Ov(e,t,a,i,s,c){return $a(t),t.updateQueue=null,a=Eh(t,i,a,s),xh(e),i=ks(),e!==null&&!at?($s(e,t,c),Bn(e,t,c)):(xe&&i&&Ss(t),t.flags|=1,ut(e,t,a,c),t.child)}function wv(e,t,a,i,s){if($a(t),t.stateNode===null){var c=xl,h=a.contextType;typeof h=="object"&&h!==null&&(c=dt(h)),c=new a(i,c),t.memoizedState=c.state!==null&&c.state!==void 
0?c.state:null,c.updater=ec,t.stateNode=c,c._reactInternals=t,c=t.stateNode,c.props=i,c.state=t.memoizedState,c.refs={},js(t),h=a.contextType,c.context=typeof h=="object"&&h!==null?dt(h):xl,c.state=t.memoizedState,h=a.getDerivedStateFromProps,typeof h=="function"&&(Is(t,a,h,i),c.state=t.memoizedState),typeof a.getDerivedStateFromProps=="function"||typeof c.getSnapshotBeforeUpdate=="function"||typeof c.UNSAFE_componentWillMount!="function"&&typeof c.componentWillMount!="function"||(h=c.state,typeof c.componentWillMount=="function"&&c.componentWillMount(),typeof c.UNSAFE_componentWillMount=="function"&&c.UNSAFE_componentWillMount(),h!==c.state&&ec.enqueueReplaceState(c,c.state,null),Ci(t,i,c,s),ji(),c.state=t.memoizedState),typeof c.componentDidMount=="function"&&(t.flags|=4194308),i=!0}else if(e===null){c=t.stateNode;var g=t.memoizedProps,_=qa(a,g);c.props=_;var R=c.context,M=a.contextType;h=xl,typeof M=="object"&&M!==null&&(h=dt(M));var B=a.getDerivedStateFromProps;M=typeof B=="function"||typeof c.getSnapshotBeforeUpdate=="function",g=t.pendingProps!==g,M||typeof c.UNSAFE_componentWillReceiveProps!="function"&&typeof c.componentWillReceiveProps!="function"||(g||R!==h)&&fv(t,c,i,h),aa=!1;var z=t.memoizedState;c.state=z,Ci(t,i,c,s),ji(),R=t.memoizedState,g||z!==R||aa?(typeof B=="function"&&(Is(t,a,B,i),R=t.memoizedState),(_=aa||cv(t,a,_,i,z,R,h))?(M||typeof c.UNSAFE_componentWillMount!="function"&&typeof c.componentWillMount!="function"||(typeof c.componentWillMount=="function"&&c.componentWillMount(),typeof c.UNSAFE_componentWillMount=="function"&&c.UNSAFE_componentWillMount()),typeof c.componentDidMount=="function"&&(t.flags|=4194308)):(typeof c.componentDidMount=="function"&&(t.flags|=4194308),t.memoizedProps=i,t.memoizedState=R),c.props=i,c.state=R,c.context=h,i=_):(typeof c.componentDidMount=="function"&&(t.flags|=4194308),i=!1)}else{c=t.stateNode,Cs(e,t),h=t.memoizedProps,M=qa(a,h),c.props=M,B=t.pendingProps,z=c.context,R=a.contextType,_=xl,typeof 
R=="object"&&R!==null&&(_=dt(R)),g=a.getDerivedStateFromProps,(R=typeof g=="function"||typeof c.getSnapshotBeforeUpdate=="function")||typeof c.UNSAFE_componentWillReceiveProps!="function"&&typeof c.componentWillReceiveProps!="function"||(h!==B||z!==_)&&fv(t,c,i,_),aa=!1,z=t.memoizedState,c.state=z,Ci(t,i,c,s),ji();var N=t.memoizedState;h!==B||z!==N||aa||e!==null&&e.dependencies!==null&&Pr(e.dependencies)?(typeof g=="function"&&(Is(t,a,g,i),N=t.memoizedState),(M=aa||cv(t,a,M,i,z,N,_)||e!==null&&e.dependencies!==null&&Pr(e.dependencies))?(R||typeof c.UNSAFE_componentWillUpdate!="function"&&typeof c.componentWillUpdate!="function"||(typeof c.componentWillUpdate=="function"&&c.componentWillUpdate(i,N,_),typeof c.UNSAFE_componentWillUpdate=="function"&&c.UNSAFE_componentWillUpdate(i,N,_)),typeof c.componentDidUpdate=="function"&&(t.flags|=4),typeof c.getSnapshotBeforeUpdate=="function"&&(t.flags|=1024)):(typeof c.componentDidUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=4),typeof c.getSnapshotBeforeUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=1024),t.memoizedProps=i,t.memoizedState=N),c.props=i,c.state=N,c.context=_,i=M):(typeof c.componentDidUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=4),typeof c.getSnapshotBeforeUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=1024),i=!1)}return c=i,vu(e,t),i=(t.flags&128)!==0,c||i?(c=t.stateNode,a=i&&typeof a.getDerivedStateFromError!="function"?null:c.render(),t.flags|=1,e!==null&&i?(t.child=Cl(t,e.child,null,s),t.child=Cl(t,null,a,s)):ut(e,t,a,s),t.memoizedState=c.state,e=t.child):e=Bn(e,t,s),e}function Av(e,t,a,i){return Ei(),t.flags|=256,ut(e,t,a,i),t.child}var ac={dehydrated:null,treeContext:null,retryLane:0,hydrationErrors:null};function lc(e){return{baseLanes:e,cachePool:hh()}}function ic(e,t,a){return e=e!==null?e.childLanes&~a:0,t&&(e|=Pt),e}function Tv(e,t,a){var 
i=t.pendingProps,s=!1,c=(t.flags&128)!==0,h;if((h=c)||(h=e!==null&&e.memoizedState===null?!1:(tt.current&2)!==0),h&&(s=!0,t.flags&=-129),h=(t.flags&32)!==0,t.flags&=-33,e===null){if(xe){if(s?ua(t):oa(),xe){var g=Xe,_;if(_=g){e:{for(_=g,g=vn;_.nodeType!==8;){if(!g){g=null;break e}if(_=an(_.nextSibling),_===null){g=null;break e}}g=_}g!==null?(t.memoizedState={dehydrated:g,treeContext:Ua!==null?{id:jn,overflow:Cn}:null,retryLane:536870912,hydrationErrors:null},_=zt(18,null,null,0),_.stateNode=g,_.return=t,t.child=_,mt=t,Xe=null,_=!0):_=!1}_||La(t)}if(g=t.memoizedState,g!==null&&(g=g.dehydrated,g!==null))return Vc(g)?t.lanes=32:t.lanes=536870912,null;Zn(t)}return g=i.children,i=i.fallback,s?(oa(),s=t.mode,g=mu({mode:"hidden",children:g},s),i=Ma(i,s,a,null),g.return=t,i.return=t,g.sibling=i,t.child=g,s=t.child,s.memoizedState=lc(a),s.childLanes=ic(e,h,a),t.memoizedState=ac,i):(ua(t),rc(t,g))}if(_=e.memoizedState,_!==null&&(g=_.dehydrated,g!==null)){if(c)t.flags&256?(ua(t),t.flags&=-257,t=uc(e,t,a)):t.memoizedState!==null?(oa(),t.child=e.child,t.flags|=128,t=null):(oa(),s=i.fallback,g=t.mode,i=mu({mode:"visible",children:i.children},g),s=Ma(s,g,a,null),s.flags|=2,i.return=t,s.return=t,i.sibling=s,t.child=i,Cl(t,e.child,null,a),i=t.child,i.memoizedState=lc(a),i.childLanes=ic(e,h,a),t.memoizedState=ac,t=s);else if(ua(t),Vc(g)){if(h=g.nextSibling&&g.nextSibling.dataset,h)var R=h.dgst;h=R,i=Error(u(419)),i.stack="",i.digest=h,Oi({value:i,source:null,stack:null}),t=uc(e,t,a)}else if(at||wi(e,t,a,!1),h=(a&e.childLanes)!==0,at||h){if(h=De,h!==null&&(i=a&-a,i=(i&42)!==0?1:qo(i),i=(i&(h.suspendedLanes|a))!==0?0:i,i!==0&&i!==_.retryLane))throw _.retryLane=i,Sl(e,i),Mt(h,e,i),yv;g.data==="$?"||wc(),t=uc(e,t,a)}else g.data==="$?"?(t.flags|=192,t.child=e.child,t=null):(e=_.treeContext,Xe=an(g.nextSibling),mt=t,xe=!0,Ba=null,vn=!1,e!==null&&(Xt[Kt++]=jn,Xt[Kt++]=Cn,Xt[Kt++]=Ua,jn=e.id,Cn=e.overflow,Ua=t),t=rc(t,i.children),t.flags|=4096);return t}return 
s?(oa(),s=i.fallback,g=t.mode,_=e.child,R=_.sibling,i=Nn(_,{mode:"hidden",children:i.children}),i.subtreeFlags=_.subtreeFlags&65011712,R!==null?s=Nn(R,s):(s=Ma(s,g,a,null),s.flags|=2),s.return=t,i.return=t,i.sibling=s,t.child=i,i=s,s=t.child,g=e.child.memoizedState,g===null?g=lc(a):(_=g.cachePool,_!==null?(R=et._currentValue,_=_.parent!==R?{parent:R,pool:R}:_):_=hh(),g={baseLanes:g.baseLanes|a,cachePool:_}),s.memoizedState=g,s.childLanes=ic(e,h,a),t.memoizedState=ac,i):(ua(t),a=e.child,e=a.sibling,a=Nn(a,{mode:"visible",children:i.children}),a.return=t,a.sibling=null,e!==null&&(h=t.deletions,h===null?(t.deletions=[e],t.flags|=16):h.push(e)),t.child=a,t.memoizedState=null,a)}function rc(e,t){return t=mu({mode:"visible",children:t},e.mode),t.return=e,e.child=t}function mu(e,t){return e=zt(22,e,null,t),e.lanes=0,e.stateNode={_visibility:1,_pendingMarkers:null,_retryCache:null,_transitions:null},e}function uc(e,t,a){return Cl(t,e.child,null,a),e=rc(t,t.pendingProps.children),e.flags|=2,t.memoizedState=null,e}function Rv(e,t,a){e.lanes|=t;var i=e.alternate;i!==null&&(i.lanes|=t),ws(e.return,t,a)}function oc(e,t,a,i,s){var c=e.memoizedState;c===null?e.memoizedState={isBackwards:t,rendering:null,renderingStartTime:0,last:i,tail:a,tailMode:s}:(c.isBackwards=t,c.rendering=null,c.renderingStartTime=0,c.last=i,c.tail=a,c.tailMode=s)}function zv(e,t,a){var i=t.pendingProps,s=i.revealOrder,c=i.tail;if(ut(e,t,i.children,a),i=tt.current,(i&2)!==0)i=i&1|2,t.flags|=128;else{if(e!==null&&(e.flags&128)!==0)e:for(e=t.child;e!==null;){if(e.tag===13)e.memoizedState!==null&&Rv(e,a,t);else if(e.tag===19)Rv(e,a,t);else if(e.child!==null){e.child.return=e,e=e.child;continue}if(e===t)break e;for(;e.sibling===null;){if(e.return===null||e.return===t)break 
e;e=e.return}e.sibling.return=e.return,e=e.sibling}i&=1}switch(P(tt,i),s){case"forwards":for(a=t.child,s=null;a!==null;)e=a.alternate,e!==null&&fu(e)===null&&(s=a),a=a.sibling;a=s,a===null?(s=t.child,t.child=null):(s=a.sibling,a.sibling=null),oc(t,!1,s,a,c);break;case"backwards":for(a=null,s=t.child,t.child=null;s!==null;){if(e=s.alternate,e!==null&&fu(e)===null){t.child=s;break}e=s.sibling,s.sibling=a,a=s,s=e}oc(t,!0,a,null,c);break;case"together":oc(t,!1,null,null,void 0);break;default:t.memoizedState=null}return t.child}function Bn(e,t,a){if(e!==null&&(t.dependencies=e.dependencies),ha|=t.lanes,(a&t.childLanes)===0)if(e!==null){if(wi(e,t,a,!1),(a&t.childLanes)===0)return null}else return null;if(e!==null&&t.child!==e.child)throw Error(u(153));if(t.child!==null){for(e=t.child,a=Nn(e,e.pendingProps),t.child=a,a.return=t;e.sibling!==null;)e=e.sibling,a=a.sibling=Nn(e,e.pendingProps),a.return=t;a.sibling=null}return t.child}function sc(e,t){return(e.lanes&t)!==0?!0:(e=e.dependencies,!!(e!==null&&Pr(e)))}function w0(e,t,a){switch(t.tag){case 3:Ze(t,t.stateNode.containerInfo),na(t,et,e.memoizedState.cache),Ei();break;case 27:case 5:ri(t);break;case 4:Ze(t,t.stateNode.containerInfo);break;case 10:na(t,t.type,t.memoizedProps.value);break;case 13:var i=t.memoizedState;if(i!==null)return i.dehydrated!==null?(ua(t),t.flags|=128,null):(a&t.child.childLanes)!==0?Tv(e,t,a):(ua(t),e=Bn(e,t,a),e!==null?e.sibling:null);ua(t);break;case 19:var s=(e.flags&128)!==0;if(i=(a&t.childLanes)!==0,i||(wi(e,t,a,!1),i=(a&t.childLanes)!==0),s){if(i)return zv(e,t,a);t.flags|=128}if(s=t.memoizedState,s!==null&&(s.rendering=null,s.tail=null,s.lastEffect=null),P(tt,tt.current),i)break;return null;case 22:case 23:return t.lanes=0,xv(e,t,a);case 24:na(t,et,e.memoizedState.cache)}return Bn(e,t,a)}function Nv(e,t,a){if(e!==null)if(e.memoizedProps!==t.pendingProps)at=!0;else{if(!sc(e,a)&&(t.flags&128)===0)return at=!1,w0(e,t,a);at=(e.flags&131072)!==0}else 
at=!1,xe&&(t.flags&1048576)!==0&&rh(t,Qr,t.index);switch(t.lanes=0,t.tag){case 16:e:{e=t.pendingProps;var i=t.elementType,s=i._init;if(i=s(i._payload),t.type=i,typeof i=="function")ys(i)?(e=qa(i,e),t.tag=1,t=wv(null,t,i,e,a)):(t.tag=0,t=nc(null,t,i,e,a));else{if(i!=null){if(s=i.$$typeof,s===X){t.tag=11,t=bv(null,t,i,e,a);break e}else if(s===pe){t.tag=14,t=_v(null,t,i,e,a);break e}}throw t=dn(i)||i,Error(u(306,t,""))}}return t;case 0:return nc(e,t,t.type,t.pendingProps,a);case 1:return i=t.type,s=qa(i,t.pendingProps),wv(e,t,i,s,a);case 3:e:{if(Ze(t,t.stateNode.containerInfo),e===null)throw Error(u(387));i=t.pendingProps;var c=t.memoizedState;s=c.element,Cs(e,t),Ci(t,i,null,a);var h=t.memoizedState;if(i=h.cache,na(t,et,i),i!==c.cache&&As(t,[et],a,!0),ji(),i=h.element,c.isDehydrated)if(c={element:i,isDehydrated:!1,cache:h.cache},t.updateQueue.baseState=c,t.memoizedState=c,t.flags&256){t=Av(e,t,i,a);break e}else if(i!==s){s=Gt(Error(u(424)),t),Oi(s),t=Av(e,t,i,a);break e}else{switch(e=t.stateNode.containerInfo,e.nodeType){case 9:e=e.body;break;default:e=e.nodeName==="HTML"?e.ownerDocument.body:e}for(Xe=an(e.firstChild),mt=t,xe=!0,Ba=null,vn=!0,a=ov(t,null,i,a),t.child=a;a;)a.flags=a.flags&-3|4096,a=a.sibling}else{if(Ei(),i===s){t=Bn(e,t,a);break e}ut(e,t,i,a)}t=t.child}return t;case 26:return vu(e,t),e===null?(a=Mm(t.type,null,t.pendingProps,null))?t.memoizedState=a:xe||(a=t.type,e=t.pendingProps,i=zu(ne.current).createElement(a),i[ft]=t,i[yt]=e,st(i,a,e),nt(i),t.stateNode=i):t.memoizedState=Mm(t.type,e.memoizedProps,t.pendingProps,e.memoizedState),null;case 27:return ri(t),e===null&&xe&&(i=t.stateNode=jm(t.type,t.pendingProps,ne.current),mt=t,vn=!0,s=Xe,ga(t.type)?(qc=s,Xe=an(i.firstChild)):Xe=s),ut(e,t,t.pendingProps.children,a),vu(e,t),e===null&&(t.flags|=4194304),t.child;case 5:return 
e===null&&xe&&((s=i=Xe)&&(i=I0(i,t.type,t.pendingProps,vn),i!==null?(t.stateNode=i,mt=t,Xe=an(i.firstChild),vn=!1,s=!0):s=!1),s||La(t)),ri(t),s=t.type,c=t.pendingProps,h=e!==null?e.memoizedProps:null,i=c.children,kc(s,c)?i=null:h!==null&&kc(s,h)&&(t.flags|=32),t.memoizedState!==null&&(s=Ls(e,t,g0,null,null,a),er._currentValue=s),vu(e,t),ut(e,t,i,a),t.child;case 6:return e===null&&xe&&((e=a=Xe)&&(a=e_(a,t.pendingProps,vn),a!==null?(t.stateNode=a,mt=t,Xe=null,e=!0):e=!1),e||La(t)),null;case 13:return Tv(e,t,a);case 4:return Ze(t,t.stateNode.containerInfo),i=t.pendingProps,e===null?t.child=Cl(t,null,i,a):ut(e,t,i,a),t.child;case 11:return bv(e,t,t.type,t.pendingProps,a);case 7:return ut(e,t,t.pendingProps,a),t.child;case 8:return ut(e,t,t.pendingProps.children,a),t.child;case 12:return ut(e,t,t.pendingProps.children,a),t.child;case 10:return i=t.pendingProps,na(t,t.type,i.value),ut(e,t,i.children,a),t.child;case 9:return s=t.type._context,i=t.pendingProps.children,$a(t),s=dt(s),i=i(s),t.flags|=1,ut(e,t,i,a),t.child;case 14:return _v(e,t,t.type,t.pendingProps,a);case 15:return Sv(e,t,t.type,t.pendingProps,a);case 19:return zv(e,t,a);case 31:return i=t.pendingProps,a=t.mode,i={mode:i.mode,children:i.children},e===null?(a=mu(i,a),a.ref=t.ref,t.child=a,a.return=t,t=a):(a=Nn(e.child,i),a.ref=t.ref,t.child=a,a.return=t,t=a),t;case 22:return xv(e,t,a);case 24:return $a(t),i=dt(et),e===null?(s=zs(),s===null&&(s=De,c=Ts(),s.pooledCache=c,c.refCount++,c!==null&&(s.pooledCacheLanes|=a),s=c),t.memoizedState={parent:i,cache:s},js(t),na(t,et,s)):((e.lanes&a)!==0&&(Cs(e,t),Ci(t,null,null,a),ji()),s=e.memoizedState,c=t.memoizedState,s.parent!==i?(s={parent:i,cache:i},t.memoizedState=s,t.lanes===0&&(t.memoizedState=t.updateQueue.baseState=s),na(t,et,i)):(i=c.cache,na(t,et,i),i!==s.cache&&As(t,[et],a,!0))),ut(e,t,t.pendingProps.children,a),t.child;case 29:throw t.pendingProps}throw Error(u(156,t.tag))}function Ln(e){e.flags|=4}function 
jv(e,t){if(t.type!=="stylesheet"||(t.state.loading&4)!==0)e.flags&=-16777217;else if(e.flags|=16777216,!km(t)){if(t=Qt.current,t!==null&&((ge&4194048)===ge?mn!==null:(ge&62914560)!==ge&&(ge&536870912)===0||t!==mn))throw zi=Ns,vh;e.flags|=8192}}function pu(e,t){t!==null&&(e.flags|=4),e.flags&16384&&(t=e.tag!==22?sd():536870912,e.lanes|=t,Zl|=t)}function ki(e,t){if(!xe)switch(e.tailMode){case"hidden":t=e.tail;for(var a=null;t!==null;)t.alternate!==null&&(a=t),t=t.sibling;a===null?e.tail=null:a.sibling=null;break;case"collapsed":a=e.tail;for(var i=null;a!==null;)a.alternate!==null&&(i=a),a=a.sibling;i===null?t||e.tail===null?e.tail=null:e.tail.sibling=null:i.sibling=null}}function Ge(e){var t=e.alternate!==null&&e.alternate.child===e.child,a=0,i=0;if(t)for(var s=e.child;s!==null;)a|=s.lanes|s.childLanes,i|=s.subtreeFlags&65011712,i|=s.flags&65011712,s.return=e,s=s.sibling;else for(s=e.child;s!==null;)a|=s.lanes|s.childLanes,i|=s.subtreeFlags,i|=s.flags,s.return=e,s=s.sibling;return e.subtreeFlags|=i,e.childLanes=a,t}function A0(e,t,a){var i=t.pendingProps;switch(xs(t),t.tag){case 31:case 16:case 15:case 0:case 11:case 7:case 8:case 12:case 9:case 14:return Ge(t),null;case 1:return Ge(t),null;case 3:return a=t.stateNode,i=null,e!==null&&(i=e.memoizedState.cache),t.memoizedState.cache!==i&&(t.flags|=2048),Mn(et),Ht(),a.pendingContext&&(a.context=a.pendingContext,a.pendingContext=null),(e===null||e.child===null)&&(xi(t)?Ln(t):e===null||e.memoizedState.isDehydrated&&(t.flags&256)===0||(t.flags|=1024,sh())),Ge(t),null;case 26:return a=t.memoizedState,e===null?(Ln(t),a!==null?(Ge(t),jv(t,a)):(Ge(t),t.flags&=-16777217)):a?a!==e.memoizedState?(Ln(t),Ge(t),jv(t,a)):(Ge(t),t.flags&=-16777217):(e.memoizedProps!==i&&Ln(t),Ge(t),t.flags&=-16777217),null;case 27:rl(t),a=ne.current;var s=t.type;if(e!==null&&t.stateNode!=null)e.memoizedProps!==i&&Ln(t);else{if(!i){if(t.stateNode===null)throw Error(u(166));return 
Ge(t),null}e=q.current,xi(t)?uh(t):(e=jm(s,i,a),t.stateNode=e,Ln(t))}return Ge(t),null;case 5:if(rl(t),a=t.type,e!==null&&t.stateNode!=null)e.memoizedProps!==i&&Ln(t);else{if(!i){if(t.stateNode===null)throw Error(u(166));return Ge(t),null}if(e=q.current,xi(t))uh(t);else{switch(s=zu(ne.current),e){case 1:e=s.createElementNS("http://www.w3.org/2000/svg",a);break;case 2:e=s.createElementNS("http://www.w3.org/1998/Math/MathML",a);break;default:switch(a){case"svg":e=s.createElementNS("http://www.w3.org/2000/svg",a);break;case"math":e=s.createElementNS("http://www.w3.org/1998/Math/MathML",a);break;case"script":e=s.createElement("div"),e.innerHTML=" + From 3c278017f9a524c673b24be4de4b298a3fef8b77 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 15:24:23 -0700 Subject: [PATCH 13/26] fix ensure_logging test (#78) --- eval_protocol/dataset_logger/__init__.py | 33 +++++-- tests/pytest/test_pytest_ensure_logging.py | 108 +++++++++------------ 2 files changed, 71 insertions(+), 70 deletions(-) diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py index ff1675f8..4d04ce7d 100644 --- a/eval_protocol/dataset_logger/__init__.py +++ b/eval_protocol/dataset_logger/__init__.py @@ -3,16 +3,31 @@ from eval_protocol.dataset_logger.dataset_logger import DatasetLogger from eval_protocol.dataset_logger.sqlite_dataset_logger_adapter import SqliteDatasetLoggerAdapter + # Allow disabling sqlite logger to avoid environment-specific constraints in simple CLI runs. 
-if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1": - default_logger = SqliteDatasetLoggerAdapter() -else: +def _get_default_logger(): + if os.getenv("DISABLE_EP_SQLITE_LOG", "0").strip() != "1": + return SqliteDatasetLoggerAdapter() + else: + + class _NoOpLogger(DatasetLogger): + def log(self, row): + return None + + def read(self, rollout_id=None): + return [] + + return _NoOpLogger() + + +# Lazy property that creates the logger only when accessed +class _LazyLogger(DatasetLogger): + + def log(self, row): + return _get_default_logger().log(row) - class _NoOpLogger(DatasetLogger): - def log(self, row): - return None + def read(self, rollout_id=None): + return _get_default_logger().read(rollout_id) - def read(self, rollout_id=None): - return [] - default_logger = _NoOpLogger() +default_logger: DatasetLogger = _LazyLogger() diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py index 4300e1b4..c9884756 100644 --- a/tests/pytest/test_pytest_ensure_logging.py +++ b/tests/pytest/test_pytest_ensure_logging.py @@ -1,73 +1,59 @@ -from typing import List +import os from unittest.mock import Mock, patch -import eval_protocol.dataset_logger as dataset_logger -from eval_protocol.dataset_logger.dataset_logger import DatasetLogger -from eval_protocol.dataset_logger.sqlite_evaluation_row_store import SqliteEvaluationRowStore -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor -from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row - async def test_ensure_logging(monkeypatch): """ Ensure that default SQLITE logger gets called by mocking the storage and checking that the storage is called. 
""" - from eval_protocol.pytest.evaluation_test import evaluation_test - # Mock the SqliteEvaluationRowStore to track calls - mock_store = Mock(spec=SqliteEvaluationRowStore) + mock_store = Mock() mock_store.upsert_row = Mock() mock_store.read_rows = Mock(return_value=[]) mock_store.db_path = "/tmp/test.db" - # Create a custom logger that uses our mocked store - class MockSqliteLogger(DatasetLogger): - def __init__(self, store: SqliteEvaluationRowStore): - self._store = store - - def log(self, row: EvaluationRow) -> None: - data = row.model_dump(exclude_none=True, mode="json") - self._store.upsert_row(data=data) - - def read(self, rollout_id=None) -> List[EvaluationRow]: - results = self._store.read_rows(rollout_id=rollout_id) - return [EvaluationRow(**data) for data in results] - - mock_logger = MockSqliteLogger(mock_store) - - @evaluation_test( - input_dataset=[ - "tests/pytest/data/markdown_dataset.jsonl", - ], - completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], - dataset_adapter=markdown_dataset_to_evaluation_row, - rollout_processor=default_no_op_rollout_processor, - mode="pointwise", - combine_datasets=False, - num_runs=2, - logger=mock_logger, # Use our mocked logger - ) - def eval_fn(row: EvaluationRow) -> EvaluationRow: - return row - - await eval_fn( - dataset_path=["tests/pytest/data/markdown_dataset.jsonl"], - completion_params={"temperature": 0.0, "model": "dummy/local-model"}, - ) - - # Verify that the store's upsert_row method was called - assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called" - - # Check that it was called multiple times (once for each row) - call_count = mock_store.upsert_row.call_count - assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times" - - # Verify the calls were made with proper data structure - for call in mock_store.upsert_row.call_args_list: - args, kwargs = call - data = args[0] if args else 
kwargs.get("data") - assert data is not None, "upsert_row should be called with data parameter" - assert isinstance(data, dict), "data should be a dictionary" - assert "execution_metadata" in data, "data should contain execution_metadata" - assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata" + # Mock the SqliteEvaluationRowStore constructor so that when SqliteDatasetLoggerAdapter + # creates its store, it gets our mock instead + with patch( + "eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store + ): + from eval_protocol.models import EvaluationRow + from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor + from eval_protocol.pytest.evaluation_test import evaluation_test + from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row + + @evaluation_test( + input_dataset=[ + "tests/pytest/data/markdown_dataset.jsonl", + ], + completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], + dataset_adapter=markdown_dataset_to_evaluation_row, + rollout_processor=default_no_op_rollout_processor, + mode="pointwise", + combine_datasets=False, + num_runs=2, + # Don't pass logger parameter - let it use the default_logger (which we've replaced) + ) + def eval_fn(row: EvaluationRow) -> EvaluationRow: + return row + + await eval_fn( + dataset_path=["tests/pytest/data/markdown_dataset.jsonl"], + completion_params={"temperature": 0.0, "model": "dummy/local-model"}, + ) + + # Verify that the store's upsert_row method was called + assert mock_store.upsert_row.called, "SqliteEvaluationRowStore.upsert_row should have been called" + + # Check that it was called multiple times (once for each row) + call_count = mock_store.upsert_row.call_count + assert call_count > 0, f"Expected upsert_row to be called at least once, but it was called {call_count} times" + + # Verify the calls were made with proper data 
structure + for call in mock_store.upsert_row.call_args_list: + args, kwargs = call + data = args[0] if args else kwargs.get("data") + assert data is not None, "upsert_row should be called with data parameter" + assert isinstance(data, dict), "data should be a dictionary" + assert "execution_metadata" in data, "data should contain execution_metadata" + assert "rollout_id" in data["execution_metadata"], "data should contain rollout_id in execution_metadata" From 54333cf05a0e8bf3922d988ee1ce22fb5f73885c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 15:47:28 -0700 Subject: [PATCH 14/26] hotfix --- eval_protocol/pytest/default_agent_rollout_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index b3997c49..50f12231 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -58,7 +58,7 @@ async def call_agent(self) -> str: self.append_message_and_log(message) if message.tool_calls: # Create tasks for all tool calls to run them in parallel - tool_tasks = [] + tool_tasks: List[asyncio.Task[tuple[str, List[TextContent]]]] = [] for tool_call in message.tool_calls: tool_call_id = tool_call.id tool_name = tool_call.function.name @@ -70,7 +70,7 @@ async def call_agent(self) -> str: tool_tasks.append(task) # Execute all tool calls in parallel - tool_results: List[tuple[str, List[TextContent]]] = await asyncio.gather(*tool_tasks) + tool_results = await asyncio.gather(*tool_tasks) # Add all tool results to messages (they will be in the same order as tool_calls) for tool_call, (tool_call_id, content) in zip(message.tool_calls, tool_results): @@ -126,7 +126,7 @@ async def default_agent_rollout_processor( async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" agent = Agent( - 
model=config.completion_params.model, row=row, config_path=config.mcp_config_path, logger=config.logger + model=config.completion_params["model"], row=row, config_path=config.mcp_config_path, logger=config.logger ) try: await agent.setup() From cfa015d66f8b0c51718a0d9045cba6263349c36a Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Wed, 13 Aug 2025 16:35:23 -0700 Subject: [PATCH 15/26] Add invocation id to table (#79) * "Copy" button * consolidate filter configurations * filter button works * extract tooltip into its own component * vite build * Refactor AddFilterButton layout for improved styling and structure * vite build --- vite-app/dist/assets/index-CpPWargc.js | 93 -------------- vite-app/dist/assets/index-CpPWargc.js.map | 1 - vite-app/dist/assets/index-CpScNe1P.css | 1 - vite-app/dist/assets/index-D1ErODUS.js | 93 ++++++++++++++ vite-app/dist/assets/index-D1ErODUS.js.map | 1 + vite-app/dist/assets/index-D5KxcfFQ.css | 1 + vite-app/dist/index.html | 4 +- vite-app/src/GlobalState.tsx | 97 +++++++-------- vite-app/src/components/EvaluationRow.tsx | 129 +++++++++++++++++++- vite-app/src/components/EvaluationTable.tsx | 7 +- vite-app/src/components/PivotTab.tsx | 6 +- vite-app/src/components/Tooltip.tsx | 41 +++++++ vite-app/src/types/filters.ts | 11 +- vite-app/src/util/filter-utils.ts | 4 +- 14 files changed, 322 insertions(+), 167 deletions(-) delete mode 100644 vite-app/dist/assets/index-CpPWargc.js delete mode 100644 vite-app/dist/assets/index-CpPWargc.js.map delete mode 100644 vite-app/dist/assets/index-CpScNe1P.css create mode 100644 vite-app/dist/assets/index-D1ErODUS.js create mode 100644 vite-app/dist/assets/index-D1ErODUS.js.map create mode 100644 vite-app/dist/assets/index-D5KxcfFQ.css create mode 100644 vite-app/src/components/Tooltip.tsx diff --git a/vite-app/dist/assets/index-CpPWargc.js b/vite-app/dist/assets/index-CpPWargc.js deleted file mode 100644 index 402c97cd..00000000 --- a/vite-app/dist/assets/index-CpPWargc.js +++ /dev/null @@ -1,93 
+0,0 @@ -(function(){const l=document.createElement("link").relList;if(l&&l.supports&&l.supports("modulepreload"))return;for(const o of document.querySelectorAll('link[rel="modulepreload"]'))u(o);new MutationObserver(o=>{for(const f of o)if(f.type==="childList")for(const d of f.addedNodes)d.tagName==="LINK"&&d.rel==="modulepreload"&&u(d)}).observe(document,{childList:!0,subtree:!0});function r(o){const f={};return o.integrity&&(f.integrity=o.integrity),o.referrerPolicy&&(f.referrerPolicy=o.referrerPolicy),o.crossOrigin==="use-credentials"?f.credentials="include":o.crossOrigin==="anonymous"?f.credentials="omit":f.credentials="same-origin",f}function u(o){if(o.ep)return;o.ep=!0;const f=r(o);fetch(o.href,f)}})();function mg(n){return n&&n.__esModule&&Object.prototype.hasOwnProperty.call(n,"default")?n.default:n}var Ic={exports:{}},ir={};/** - * @license React - * react-jsx-runtime.production.js - * - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */var Fm;function w_(){if(Fm)return ir;Fm=1;var n=Symbol.for("react.transitional.element"),l=Symbol.for("react.fragment");function r(u,o,f){var d=null;if(f!==void 0&&(d=""+f),o.key!==void 0&&(d=""+o.key),"key"in o){f={};for(var v in o)v!=="key"&&(f[v]=o[v])}else f=o;return o=f.ref,{$$typeof:n,type:u,key:d,ref:o!==void 0?o:null,props:f}}return ir.Fragment=l,ir.jsx=r,ir.jsxs=r,ir}var Wm;function A_(){return Wm||(Wm=1,Ic.exports=w_()),Ic.exports}var y=A_(),ef={exports:{}},fe={};/** - * @license React - * react.production.js - * - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */var Im;function T_(){if(Im)return fe;Im=1;var n=Symbol.for("react.transitional.element"),l=Symbol.for("react.portal"),r=Symbol.for("react.fragment"),u=Symbol.for("react.strict_mode"),o=Symbol.for("react.profiler"),f=Symbol.for("react.consumer"),d=Symbol.for("react.context"),v=Symbol.for("react.forward_ref"),m=Symbol.for("react.suspense"),p=Symbol.for("react.memo"),b=Symbol.for("react.lazy"),x=Symbol.iterator;function O(S){return S===null||typeof S!="object"?null:(S=x&&S[x]||S["@@iterator"],typeof S=="function"?S:null)}var C={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},k=Object.assign,L={};function G(S,$,J){this.props=S,this.context=$,this.refs=L,this.updater=J||C}G.prototype.isReactComponent={},G.prototype.setState=function(S,$){if(typeof S!="object"&&typeof S!="function"&&S!=null)throw Error("takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,S,$,"setState")},G.prototype.forceUpdate=function(S){this.updater.enqueueForceUpdate(this,S,"forceUpdate")};function j(){}j.prototype=G.prototype;function H(S,$,J){this.props=S,this.context=$,this.refs=L,this.updater=J||C}var V=H.prototype=new j;V.constructor=H,k(V,G.prototype),V.isPureReactComponent=!0;var X=Array.isArray,K={H:null,A:null,T:null,S:null,V:null},ce=Object.prototype.hasOwnProperty;function pe(S,$,J,P,q,le){return J=le.ref,{$$typeof:n,type:S,key:$,ref:J!==void 0?J:null,props:le}}function we(S,$){return pe(S.type,$,void 0,void 0,void 0,S.props)}function ae(S){return typeof S=="object"&&S!==null&&S.$$typeof===n}function Ce(S){var $={"=":"=0",":":"=2"};return"$"+S.replace(/[=:]/g,function(J){return $[J]})}var Fe=/\/+/g;function He(S,$){return typeof S=="object"&&S!==null&&S.key!=null?Ce(""+S.key):$.toString(36)}function At(){}function dn(S){switch(S.status){case"fulfilled":return S.value;case"rejected":throw S.reason;default:switch(typeof 
S.status=="string"?S.then(At,At):(S.status="pending",S.then(function($){S.status==="pending"&&(S.status="fulfilled",S.value=$)},function($){S.status==="pending"&&(S.status="rejected",S.reason=$)})),S.status){case"fulfilled":return S.value;case"rejected":throw S.reason}}throw S}function Ve(S,$,J,P,q){var le=typeof S;(le==="undefined"||le==="boolean")&&(S=null);var ne=!1;if(S===null)ne=!0;else switch(le){case"bigint":case"string":case"number":ne=!0;break;case"object":switch(S.$$typeof){case n:case l:ne=!0;break;case b:return ne=S._init,Ve(ne(S._payload),$,J,P,q)}}if(ne)return q=q(S),ne=P===""?"."+He(S,0):P,X(q)?(J="",ne!=null&&(J=ne.replace(Fe,"$&/")+"/"),Ve(q,$,J,"",function(Ht){return Ht})):q!=null&&(ae(q)&&(q=we(q,J+(q.key==null||S&&S.key===q.key?"":(""+q.key).replace(Fe,"$&/")+"/")+ne)),$.push(q)),1;ne=0;var gt=P===""?".":P+":";if(X(S))for(var Ze=0;Ze>>1,S=D[be];if(0>>1;be<$;){var J=2*(be+1)-1,P=D[J],q=J+1,le=D[q];if(0>o(P,ie))qo(le,P)?(D[be]=le,D[q]=ie,be=q):(D[be]=P,D[J]=ie,be=J);else if(qo(le,ie))D[be]=le,D[q]=ie,be=q;else break e}}return Q}function o(D,Q){var ie=D.sortIndex-Q.sortIndex;return ie!==0?ie:D.id-Q.id}if(n.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var f=performance;n.unstable_now=function(){return f.now()}}else{var d=Date,v=d.now();n.unstable_now=function(){return d.now()-v}}var m=[],p=[],b=1,x=null,O=3,C=!1,k=!1,L=!1,G=!1,j=typeof setTimeout=="function"?setTimeout:null,H=typeof clearTimeout=="function"?clearTimeout:null,V=typeof setImmediate<"u"?setImmediate:null;function X(D){for(var Q=r(p);Q!==null;){if(Q.callback===null)u(p);else if(Q.startTime<=D)u(p),Q.sortIndex=Q.expirationTime,l(m,Q);else break;Q=r(p)}}function K(D){if(L=!1,X(D),!k)if(r(m)!==null)k=!0,ce||(ce=!0,He());else{var Q=r(p);Q!==null&&Ve(K,Q.startTime-D)}}var ce=!1,pe=-1,we=5,ae=-1;function Ce(){return G?!0:!(n.unstable_now()-aeD&&Ce());){var be=x.callback;if(typeof be=="function"){x.callback=null,O=x.priorityLevel;var 
S=be(x.expirationTime<=D);if(D=n.unstable_now(),typeof S=="function"){x.callback=S,X(D),Q=!0;break t}x===r(m)&&u(m),X(D)}else u(m);x=r(m)}if(x!==null)Q=!0;else{var $=r(p);$!==null&&Ve(K,$.startTime-D),Q=!1}}break e}finally{x=null,O=ie,C=!1}Q=void 0}}finally{Q?He():ce=!1}}}var He;if(typeof V=="function")He=function(){V(Fe)};else if(typeof MessageChannel<"u"){var At=new MessageChannel,dn=At.port2;At.port1.onmessage=Fe,He=function(){dn.postMessage(null)}}else He=function(){j(Fe,0)};function Ve(D,Q){pe=j(function(){D(n.unstable_now())},Q)}n.unstable_IdlePriority=5,n.unstable_ImmediatePriority=1,n.unstable_LowPriority=4,n.unstable_NormalPriority=3,n.unstable_Profiling=null,n.unstable_UserBlockingPriority=2,n.unstable_cancelCallback=function(D){D.callback=null},n.unstable_forceFrameRate=function(D){0>D||125be?(D.sortIndex=ie,l(p,D),r(m)===null&&D===r(p)&&(L?(H(pe),pe=-1):L=!0,Ve(K,ie-be))):(D.sortIndex=S,l(m,D),k||C||(k=!0,ce||(ce=!0,He()))),D},n.unstable_shouldYield=Ce,n.unstable_wrapCallback=function(D){var Q=O;return function(){var ie=O;O=Q;try{return D.apply(this,arguments)}finally{O=ie}}}}(af)),af}var np;function z_(){return np||(np=1,nf.exports=R_()),nf.exports}var lf={exports:{}},dt={};/** - * @license React - * react-dom.production.js - * - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */var ap;function N_(){if(ap)return dt;ap=1;var n=Eo();function l(m){var p="https://react.dev/errors/"+m;if(1"u"||typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE!="function"))try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(n)}catch(l){console.error(l)}}return n(),lf.exports=N_(),lf.exports}/** - * @license React - * react-dom-client.production.js - * - * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */var ip;function j_(){if(ip)return rr;ip=1;var n=z_(),l=Eo(),r=pg();function u(e){var t="https://react.dev/errors/"+e;if(1S||(e.current=be[S],be[S]=null,S--)}function P(e,t){S++,be[S]=e.current,e.current=t}var q=$(null),le=$(null),ne=$(null),gt=$(null);function Ze(e,t){switch(P(ne,t),P(le,e),P(q,null),t.nodeType){case 9:case 11:e=(e=t.documentElement)&&(e=e.namespaceURI)?wm(e):0;break;default:if(e=t.tagName,t=t.namespaceURI)t=wm(t),e=Am(t,e);else switch(e){case"svg":e=1;break;case"math":e=2;break;default:e=0}}J(q),P(q,e)}function Ht(){J(q),J(le),J(ne)}function ri(e){e.memoizedState!==null&&P(gt,e);var t=q.current,a=Am(t,e.type);t!==a&&(P(le,e),P(q,a))}function rl(e){le.current===e&&(J(q),J(le)),gt.current===e&&(J(gt),er._currentValue=ie)}var Fn=Object.prototype.hasOwnProperty,Wn=n.unstable_scheduleCallback,ui=n.unstable_cancelCallback,ld=n.unstable_shouldYield,lb=n.unstable_requestPaint,hn=n.unstable_now,ib=n.unstable_getCurrentPriorityLevel,id=n.unstable_ImmediatePriority,rd=n.unstable_UserBlockingPriority,Rr=n.unstable_NormalPriority,rb=n.unstable_LowPriority,ud=n.unstable_IdlePriority,ub=n.log,ob=n.unstable_setDisableYieldValue,oi=null,Tt=null;function In(e){if(typeof ub=="function"&&ob(e),Tt&&typeof Tt.setStrictMode=="function")try{Tt.setStrictMode(oi,e)}catch{}}var Rt=Math.clz32?Math.clz32:fb,sb=Math.log,cb=Math.LN2;function fb(e){return e>>>=0,e===0?32:31-(sb(e)/cb|0)|0}var zr=256,Nr=4194304;function za(e){var t=e&42;if(t!==0)return t;switch(e&-e){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return e&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return e&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 
1073741824:return 0;default:return e}}function jr(e,t,a){var i=e.pendingLanes;if(i===0)return 0;var s=0,c=e.suspendedLanes,h=e.pingedLanes;e=e.warmLanes;var g=i&134217727;return g!==0?(i=g&~c,i!==0?s=za(i):(h&=g,h!==0?s=za(h):a||(a=g&~e,a!==0&&(s=za(a))))):(g=i&~c,g!==0?s=za(g):h!==0?s=za(h):a||(a=i&~e,a!==0&&(s=za(a)))),s===0?0:t!==0&&t!==s&&(t&c)===0&&(c=s&-s,a=t&-t,c>=a||c===32&&(a&4194048)!==0)?t:s}function si(e,t){return(e.pendingLanes&~(e.suspendedLanes&~e.pingedLanes)&t)===0}function db(e,t){switch(e){case 1:case 2:case 4:case 8:case 64:return t+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return t+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function od(){var e=zr;return zr<<=1,(zr&4194048)===0&&(zr=256),e}function sd(){var e=Nr;return Nr<<=1,(Nr&62914560)===0&&(Nr=4194304),e}function Vo(e){for(var t=[],a=0;31>a;a++)t.push(e);return t}function ci(e,t){e.pendingLanes|=t,t!==268435456&&(e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0)}function hb(e,t,a,i,s,c){var h=e.pendingLanes;e.pendingLanes=a,e.suspendedLanes=0,e.pingedLanes=0,e.warmLanes=0,e.expiredLanes&=a,e.entangledLanes&=a,e.errorRecoveryDisabledLanes&=a,e.shellSuspendCounter=0;var g=e.entanglements,_=e.expirationTimes,R=e.hiddenUpdates;for(a=h&~a;0)":-1s||_[i]!==R[s]){var M=` -`+_[i].replace(" at new "," at ");return e.displayName&&M.includes("")&&(M=M.replace("",e.displayName)),M}while(1<=i&&0<=s);break}}}finally{Qo=!1,Error.prepareStackTrace=a}return(a=e?e.displayName||e.name:"")?dl(a):""}function bb(e){switch(e.tag){case 26:case 27:case 5:return dl(e.type);case 16:return dl("Lazy");case 13:return dl("Suspense");case 19:return dl("SuspenseList");case 0:case 15:return Po(e.type,!1);case 11:return Po(e.type.render,!1);case 1:return 
Po(e.type,!0);case 31:return dl("Activity");default:return""}}function bd(e){try{var t="";do t+=bb(e),e=e.return;while(e);return t}catch(a){return` -Error generating stack: `+a.message+` -`+a.stack}}function Vt(e){switch(typeof e){case"bigint":case"boolean":case"number":case"string":case"undefined":return e;case"object":return e;default:return""}}function _d(e){var t=e.type;return(e=e.nodeName)&&e.toLowerCase()==="input"&&(t==="checkbox"||t==="radio")}function _b(e){var t=_d(e)?"checked":"value",a=Object.getOwnPropertyDescriptor(e.constructor.prototype,t),i=""+e[t];if(!e.hasOwnProperty(t)&&typeof a<"u"&&typeof a.get=="function"&&typeof a.set=="function"){var s=a.get,c=a.set;return Object.defineProperty(e,t,{configurable:!0,get:function(){return s.call(this)},set:function(h){i=""+h,c.call(this,h)}}),Object.defineProperty(e,t,{enumerable:a.enumerable}),{getValue:function(){return i},setValue:function(h){i=""+h},stopTracking:function(){e._valueTracker=null,delete e[t]}}}}function Mr(e){e._valueTracker||(e._valueTracker=_b(e))}function Sd(e){if(!e)return!1;var t=e._valueTracker;if(!t)return!0;var a=t.getValue(),i="";return e&&(i=_d(e)?e.checked?"true":"false":e.value),e=i,e!==a?(t.setValue(e),!0):!1}function Ur(e){if(e=e||(typeof document<"u"?document:void 0),typeof e>"u")return null;try{return e.activeElement||e.body}catch{return e.body}}var Sb=/[\n"\\]/g;function qt(e){return e.replace(Sb,function(t){return"\\"+t.charCodeAt(0).toString(16)+" "})}function Jo(e,t,a,i,s,c,h,g){e.name="",h!=null&&typeof h!="function"&&typeof h!="symbol"&&typeof h!="boolean"?e.type=h:e.removeAttribute("type"),t!=null?h==="number"?(t===0&&e.value===""||e.value!=t)&&(e.value=""+Vt(t)):e.value!==""+Vt(t)&&(e.value=""+Vt(t)):h!=="submit"&&h!=="reset"||e.removeAttribute("value"),t!=null?Fo(e,h,Vt(t)):a!=null?Fo(e,h,Vt(a)):i!=null&&e.removeAttribute("value"),s==null&&c!=null&&(e.defaultChecked=!!c),s!=null&&(e.checked=s&&typeof s!="function"&&typeof s!="symbol"),g!=null&&typeof 
g!="function"&&typeof g!="symbol"&&typeof g!="boolean"?e.name=""+Vt(g):e.removeAttribute("name")}function xd(e,t,a,i,s,c,h,g){if(c!=null&&typeof c!="function"&&typeof c!="symbol"&&typeof c!="boolean"&&(e.type=c),t!=null||a!=null){if(!(c!=="submit"&&c!=="reset"||t!=null))return;a=a!=null?""+Vt(a):"",t=t!=null?""+Vt(t):a,g||t===e.value||(e.value=t),e.defaultValue=t}i=i??s,i=typeof i!="function"&&typeof i!="symbol"&&!!i,e.checked=g?e.checked:!!i,e.defaultChecked=!!i,h!=null&&typeof h!="function"&&typeof h!="symbol"&&typeof h!="boolean"&&(e.name=h)}function Fo(e,t,a){t==="number"&&Ur(e.ownerDocument)===e||e.defaultValue===""+a||(e.defaultValue=""+a)}function hl(e,t,a,i){if(e=e.options,t){t={};for(var s=0;s"u"||typeof window.document>"u"||typeof window.document.createElement>"u"),ns=!1;if(zn)try{var vi={};Object.defineProperty(vi,"passive",{get:function(){ns=!0}}),window.addEventListener("test",vi,vi),window.removeEventListener("test",vi,vi)}catch{ns=!1}var ta=null,as=null,Br=null;function zd(){if(Br)return Br;var e,t=as,a=t.length,i,s="value"in ta?ta.value:ta.textContent,c=s.length;for(e=0;e=gi),Ud=" ",Zd=!1;function Bd(e,t){switch(e){case"keyup":return Pb.indexOf(t.keyCode)!==-1;case"keydown":return t.keyCode!==229;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function Ld(e){return e=e.detail,typeof e=="object"&&"data"in e?e.data:null}var gl=!1;function Fb(e,t){switch(e){case"compositionend":return Ld(t);case"keypress":return t.which!==32?null:(Zd=!0,Ud);case"textInput":return e=t.data,e===Ud&&Zd?null:e;default:return null}}function Wb(e,t){if(gl)return e==="compositionend"||!os&&Bd(e,t)?(e=zd(),Br=as=ta=null,gl=!1,e):null;switch(e){case"paste":return null;case"keypress":if(!(t.ctrlKey||t.altKey||t.metaKey)||t.ctrlKey&&t.altKey){if(t.char&&1=t)return{node:a,offset:t-e};e=i}e:{for(;a;){if(a.nextSibling){a=a.nextSibling;break e}a=a.parentNode}a=void 0}a=Xd(a)}}function Qd(e,t){return 
e&&t?e===t?!0:e&&e.nodeType===3?!1:t&&t.nodeType===3?Qd(e,t.parentNode):"contains"in e?e.contains(t):e.compareDocumentPosition?!!(e.compareDocumentPosition(t)&16):!1:!1}function Pd(e){e=e!=null&&e.ownerDocument!=null&&e.ownerDocument.defaultView!=null?e.ownerDocument.defaultView:window;for(var t=Ur(e.document);t instanceof e.HTMLIFrameElement;){try{var a=typeof t.contentWindow.location.href=="string"}catch{a=!1}if(a)e=t.contentWindow;else break;t=Ur(e.document)}return t}function fs(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return t&&(t==="input"&&(e.type==="text"||e.type==="search"||e.type==="tel"||e.type==="url"||e.type==="password")||t==="textarea"||e.contentEditable==="true")}var r0=zn&&"documentMode"in document&&11>=document.documentMode,yl=null,ds=null,Si=null,hs=!1;function Jd(e,t,a){var i=a.window===a?a.document:a.nodeType===9?a:a.ownerDocument;hs||yl==null||yl!==Ur(i)||(i=yl,"selectionStart"in i&&fs(i)?i={start:i.selectionStart,end:i.selectionEnd}:(i=(i.ownerDocument&&i.ownerDocument.defaultView||window).getSelection(),i={anchorNode:i.anchorNode,anchorOffset:i.anchorOffset,focusNode:i.focusNode,focusOffset:i.focusOffset}),Si&&_i(Si,i)||(Si=i,i=Tu(ds,"onSelect"),0>=h,s-=h,jn=1<<32-Rt(t)+s|a<c?c:8;var h=D.T,g={};D.T=g,Ws(e,!1,t,a);try{var _=s(),R=D.S;if(R!==null&&R(g,_),_!==null&&typeof _=="object"&&typeof _.then=="function"){var M=m0(_,i);Zi(e,t,M,Mt(e))}else Zi(e,t,i,Mt(e))}catch(B){Zi(e,t,{then:function(){},status:"rejected",reason:B},Mt())}finally{Q.p=c,D.T=h}}function _0(){}function Js(e,t,a,i){if(e.tag!==5)throw Error(u(476));var s=Fh(e).queue;Jh(e,s,t,ie,a===null?_0:function(){return Wh(e),a(i)})}function Fh(e){var t=e.memoizedState;if(t!==null)return t;t={memoizedState:ie,baseState:ie,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Un,lastRenderedState:ie},next:null};var a={};return 
t.next={memoizedState:a,baseState:a,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Un,lastRenderedState:a},next:null},e.memoizedState=t,e=e.alternate,e!==null&&(e.memoizedState=t),t}function Wh(e){var t=Fh(e).next.queue;Zi(e,t,{},Mt())}function Fs(){return ft(er)}function Ih(){return Ie().memoizedState}function ev(){return Ie().memoizedState}function S0(e){for(var t=e.return;t!==null;){switch(t.tag){case 24:case 3:var a=Mt();e=la(a);var i=ia(t,e,a);i!==null&&(Ut(i,t,a),Ni(i,t,a)),t={cache:Ts()},e.payload=t;return}t=t.return}}function x0(e,t,a){var i=Mt();a={lane:i,revertLane:0,action:a,hasEagerState:!1,eagerState:null,next:null},uu(e)?nv(t,a):(a=gs(e,t,a,i),a!==null&&(Ut(a,e,i),av(a,t,i)))}function tv(e,t,a){var i=Mt();Zi(e,t,a,i)}function Zi(e,t,a,i){var s={lane:i,revertLane:0,action:a,hasEagerState:!1,eagerState:null,next:null};if(uu(e))nv(t,s);else{var c=e.alternate;if(e.lanes===0&&(c===null||c.lanes===0)&&(c=t.lastRenderedReducer,c!==null))try{var h=t.lastRenderedState,g=c(h,a);if(s.hasEagerState=!0,s.eagerState=g,zt(g,h))return Gr(e,t,s,0),De===null&&qr(),!1}catch{}finally{}if(a=gs(e,t,s,i),a!==null)return Ut(a,e,i),av(a,t,i),!0}return!1}function Ws(e,t,a,i){if(i={lane:2,revertLane:Nc(),action:i,hasEagerState:!1,eagerState:null,next:null},uu(e)){if(t)throw Error(u(479))}else t=gs(e,a,i,2),t!==null&&Ut(t,e,2)}function uu(e){var t=e.alternate;return e===de||t!==null&&t===de}function nv(e,t){Rl=tu=!0;var a=e.pending;a===null?t.next=t:(t.next=a.next,a.next=t),e.pending=t}function av(e,t,a){if((a&4194048)!==0){var i=t.lanes;i&=e.pendingLanes,a|=i,t.lanes=a,fd(e,a)}}var 
ou={readContext:ft,use:au,useCallback:Pe,useContext:Pe,useEffect:Pe,useImperativeHandle:Pe,useLayoutEffect:Pe,useInsertionEffect:Pe,useMemo:Pe,useReducer:Pe,useRef:Pe,useState:Pe,useDebugValue:Pe,useDeferredValue:Pe,useTransition:Pe,useSyncExternalStore:Pe,useId:Pe,useHostTransitionStatus:Pe,useFormState:Pe,useActionState:Pe,useOptimistic:Pe,useMemoCache:Pe,useCacheRefresh:Pe},lv={readContext:ft,use:au,useCallback:function(e,t){return _t().memoizedState=[e,t===void 0?null:t],e},useContext:ft,useEffect:Hh,useImperativeHandle:function(e,t,a){a=a!=null?a.concat([e]):null,ru(4194308,4,Yh.bind(null,t,e),a)},useLayoutEffect:function(e,t){return ru(4194308,4,e,t)},useInsertionEffect:function(e,t){ru(4,2,e,t)},useMemo:function(e,t){var a=_t();t=t===void 0?null:t;var i=e();if(Va){In(!0);try{e()}finally{In(!1)}}return a.memoizedState=[i,t],i},useReducer:function(e,t,a){var i=_t();if(a!==void 0){var s=a(t);if(Va){In(!0);try{a(t)}finally{In(!1)}}}else s=t;return i.memoizedState=i.baseState=s,e={pending:null,lanes:0,dispatch:null,lastRenderedReducer:e,lastRenderedState:s},i.queue=e,e=e.dispatch=x0.bind(null,de,e),[i.memoizedState,e]},useRef:function(e){var t=_t();return e={current:e},t.memoizedState=e},useState:function(e){e=Xs(e);var t=e.queue,a=tv.bind(null,de,t);return t.dispatch=a,[e.memoizedState,a]},useDebugValue:Qs,useDeferredValue:function(e,t){var a=_t();return Ps(a,e,t)},useTransition:function(){var e=Xs(!1);return e=Jh.bind(null,de,e.queue,!0,!1),_t().memoizedState=e,[!1,e]},useSyncExternalStore:function(e,t,a){var i=de,s=_t();if(xe){if(a===void 0)throw Error(u(407));a=a()}else{if(a=t(),De===null)throw Error(u(349));(ge&124)!==0||wh(i,t,a)}s.memoizedState=a;var c={value:a,getSnapshot:t};return s.queue=c,Hh(Th.bind(null,i,c,e),[e]),i.flags|=2048,Nl(9,iu(),Ah.bind(null,i,c,a,t),null),a},useId:function(){var e=_t(),t=De.identifierPrefix;if(xe){var a=Cn,i=jn;a=(i&~(1<<32-Rt(i)-1)).toString(32)+a,t="«"+t+"R"+a,a=nu++,0ue?(it=te,te=null):it=te.sibling;var 
_e=z(w,te,T[ue],Z);if(_e===null){te===null&&(te=it);break}e&&te&&_e.alternate===null&&t(w,te),E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e,te=it}if(ue===T.length)return a(w,te),xe&&Za(w,ue),F;if(te===null){for(;ueue?(it=te,te=null):it=te.sibling;var xa=z(w,te,_e.value,Z);if(xa===null){te===null&&(te=it);break}e&&te&&xa.alternate===null&&t(w,te),E=c(xa,E,ue),he===null?F=xa:he.sibling=xa,he=xa,te=it}if(_e.done)return a(w,te),xe&&Za(w,ue),F;if(te===null){for(;!_e.done;ue++,_e=T.next())_e=B(w,_e.value,Z),_e!==null&&(E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e);return xe&&Za(w,ue),F}for(te=i(te);!_e.done;ue++,_e=T.next())_e=N(te,w,ue,_e.value,Z),_e!==null&&(e&&_e.alternate!==null&&te.delete(_e.key===null?ue:_e.key),E=c(_e,E,ue),he===null?F=_e:he.sibling=_e,he=_e);return e&&te.forEach(function(O_){return t(w,O_)}),xe&&Za(w,ue),F}function Re(w,E,T,Z){if(typeof T=="object"&&T!==null&&T.type===k&&T.key===null&&(T=T.props.children),typeof T=="object"&&T!==null){switch(T.$$typeof){case O:e:{for(var F=T.key;E!==null;){if(E.key===F){if(F=T.type,F===k){if(E.tag===7){a(w,E.sibling),Z=s(E,T.props.children),Z.return=w,w=Z;break e}}else if(E.elementType===F||typeof F=="object"&&F!==null&&F.$$typeof===we&&rv(F)===E.type){a(w,E.sibling),Z=s(E,T.props),Li(Z,T),Z.return=w,w=Z;break e}a(w,E);break}else t(w,E);E=E.sibling}T.type===k?(Z=Ma(T.props.children,w.mode,Z,T.key),Z.return=w,w=Z):(Z=Xr(T.type,T.key,T.props,null,w.mode,Z),Li(Z,T),Z.return=w,w=Z)}return h(w);case C:e:{for(F=T.key;E!==null;){if(E.key===F)if(E.tag===4&&E.stateNode.containerInfo===T.containerInfo&&E.stateNode.implementation===T.implementation){a(w,E.sibling),Z=s(E,T.children||[]),Z.return=w,w=Z;break e}else{a(w,E);break}else t(w,E);E=E.sibling}Z=_s(T,w.mode,Z),Z.return=w,w=Z}return h(w);case we:return F=T._init,T=F(T._payload),Re(w,E,T,Z)}if(Ve(T))return oe(w,E,T,Z);if(He(T)){if(F=He(T),typeof F!="function")throw Error(u(150));return T=F.call(T),re(w,E,T,Z)}if(typeof T.then=="function")return 
Re(w,E,su(T),Z);if(T.$$typeof===V)return Re(w,E,Jr(w,T),Z);cu(w,T)}return typeof T=="string"&&T!==""||typeof T=="number"||typeof T=="bigint"?(T=""+T,E!==null&&E.tag===6?(a(w,E.sibling),Z=s(E,T),Z.return=w,w=Z):(a(w,E),Z=bs(T,w.mode,Z),Z.return=w,w=Z),h(w)):a(w,E)}return function(w,E,T,Z){try{Bi=0;var F=Re(w,E,T,Z);return jl=null,F}catch(te){if(te===Ri||te===Wr)throw te;var he=Nt(29,te,null,w.mode);return he.lanes=Z,he.return=w,he}finally{}}}var Cl=uv(!0),ov=uv(!1),Qt=$(null),mn=null;function ua(e){var t=e.alternate;P(tt,tt.current&1),P(Qt,e),mn===null&&(t===null||Tl.current!==null||t.memoizedState!==null)&&(mn=e)}function sv(e){if(e.tag===22){if(P(tt,tt.current),P(Qt,e),mn===null){var t=e.alternate;t!==null&&t.memoizedState!==null&&(mn=e)}}else oa()}function oa(){P(tt,tt.current),P(Qt,Qt.current)}function Zn(e){J(Qt),mn===e&&(mn=null),J(tt)}var tt=$(0);function fu(e){for(var t=e;t!==null;){if(t.tag===13){var a=t.memoizedState;if(a!==null&&(a=a.dehydrated,a===null||a.data==="$?"||Vc(a)))return t}else if(t.tag===19&&t.memoizedProps.revealOrder!==void 0){if((t.flags&128)!==0)return t}else if(t.child!==null){t.child.return=t,t=t.child;continue}if(t===e)break;for(;t.sibling===null;){if(t.return===null||t.return===e)return null;t=t.return}t.sibling.return=t.return,t=t.sibling}return null}function Is(e,t,a,i){t=e.memoizedState,a=a(i,t),a=a==null?t:b({},t,a),e.memoizedState=a,e.lanes===0&&(e.updateQueue.baseState=a)}var ec={enqueueSetState:function(e,t,a){e=e._reactInternals;var i=Mt(),s=la(i);s.payload=t,a!=null&&(s.callback=a),t=ia(e,s,i),t!==null&&(Ut(t,e,i),Ni(t,e,i))},enqueueReplaceState:function(e,t,a){e=e._reactInternals;var i=Mt(),s=la(i);s.tag=1,s.payload=t,a!=null&&(s.callback=a),t=ia(e,s,i),t!==null&&(Ut(t,e,i),Ni(t,e,i))},enqueueForceUpdate:function(e,t){e=e._reactInternals;var a=Mt(),i=la(a);i.tag=2,t!=null&&(i.callback=t),t=ia(e,i,a),t!==null&&(Ut(t,e,a),Ni(t,e,a))}};function cv(e,t,a,i,s,c,h){return e=e.stateNode,typeof 
e.shouldComponentUpdate=="function"?e.shouldComponentUpdate(i,c,h):t.prototype&&t.prototype.isPureReactComponent?!_i(a,i)||!_i(s,c):!0}function fv(e,t,a,i){e=t.state,typeof t.componentWillReceiveProps=="function"&&t.componentWillReceiveProps(a,i),typeof t.UNSAFE_componentWillReceiveProps=="function"&&t.UNSAFE_componentWillReceiveProps(a,i),t.state!==e&&ec.enqueueReplaceState(t,t.state,null)}function qa(e,t){var a=t;if("ref"in t){a={};for(var i in t)i!=="ref"&&(a[i]=t[i])}if(e=e.defaultProps){a===t&&(a=b({},a));for(var s in e)a[s]===void 0&&(a[s]=e[s])}return a}var du=typeof reportError=="function"?reportError:function(e){if(typeof window=="object"&&typeof window.ErrorEvent=="function"){var t=new window.ErrorEvent("error",{bubbles:!0,cancelable:!0,message:typeof e=="object"&&e!==null&&typeof e.message=="string"?String(e.message):String(e),error:e});if(!window.dispatchEvent(t))return}else if(typeof process=="object"&&typeof process.emit=="function"){process.emit("uncaughtException",e);return}console.error(e)};function dv(e){du(e)}function hv(e){console.error(e)}function vv(e){du(e)}function hu(e,t){try{var a=e.onUncaughtError;a(t.value,{componentStack:t.stack})}catch(i){setTimeout(function(){throw i})}}function mv(e,t,a){try{var i=e.onCaughtError;i(a.value,{componentStack:a.stack,errorBoundary:t.tag===1?t.stateNode:null})}catch(s){setTimeout(function(){throw s})}}function tc(e,t,a){return a=la(a),a.tag=3,a.payload={element:null},a.callback=function(){hu(e,t)},a}function pv(e){return e=la(e),e.tag=3,e}function gv(e,t,a,i){var s=a.type.getDerivedStateFromError;if(typeof s=="function"){var c=i.value;e.payload=function(){return s(c)},e.callback=function(){mv(t,a,i)}}var h=a.stateNode;h!==null&&typeof h.componentDidCatch=="function"&&(e.callback=function(){mv(t,a,i),typeof s!="function"&&(va===null?va=new Set([this]):va.add(this));var g=i.stack;this.componentDidCatch(i.value,{componentStack:g!==null?g:""})})}function O0(e,t,a,i,s){if(a.flags|=32768,i!==null&&typeof 
i=="object"&&typeof i.then=="function"){if(t=a.alternate,t!==null&&wi(t,a,s,!0),a=Qt.current,a!==null){switch(a.tag){case 13:return mn===null?wc():a.alternate===null&&Ke===0&&(Ke=3),a.flags&=-257,a.flags|=65536,a.lanes=s,i===Ns?a.flags|=16384:(t=a.updateQueue,t===null?a.updateQueue=new Set([i]):t.add(i),Tc(e,i,s)),!1;case 22:return a.flags|=65536,i===Ns?a.flags|=16384:(t=a.updateQueue,t===null?(t={transitions:null,markerInstances:null,retryQueue:new Set([i])},a.updateQueue=t):(a=t.retryQueue,a===null?t.retryQueue=new Set([i]):a.add(i)),Tc(e,i,s)),!1}throw Error(u(435,a.tag))}return Tc(e,i,s),wc(),!1}if(xe)return t=Qt.current,t!==null?((t.flags&65536)===0&&(t.flags|=256),t.flags|=65536,t.lanes=s,i!==Es&&(e=Error(u(422),{cause:i}),Oi(Gt(e,a)))):(i!==Es&&(t=Error(u(423),{cause:i}),Oi(Gt(t,a))),e=e.current.alternate,e.flags|=65536,s&=-s,e.lanes|=s,i=Gt(i,a),s=tc(e.stateNode,i,s),Ds(e,s),Ke!==4&&(Ke=2)),!1;var c=Error(u(520),{cause:i});if(c=Gt(c,a),Yi===null?Yi=[c]:Yi.push(c),Ke!==4&&(Ke=2),t===null)return!0;i=Gt(i,a),a=t;do{switch(a.tag){case 3:return a.flags|=65536,e=s&-s,a.lanes|=e,e=tc(a.stateNode,i,e),Ds(a,e),!1;case 1:if(t=a.type,c=a.stateNode,(a.flags&128)===0&&(typeof t.getDerivedStateFromError=="function"||c!==null&&typeof c.componentDidCatch=="function"&&(va===null||!va.has(c))))return a.flags|=65536,s&=-s,a.lanes|=s,s=pv(s),gv(s,e,a,i),Ds(a,s),!1}a=a.return}while(a!==null);return!1}var yv=Error(u(461)),at=!1;function ut(e,t,a,i){t.child=e===null?ov(t,null,a,i):Cl(t,e.child,a,i)}function bv(e,t,a,i,s){a=a.render;var c=t.ref;if("ref"in i){var h={};for(var g in i)g!=="ref"&&(h[g]=i[g])}else h=i;return ka(t),i=Ls(e,t,a,h,c,s),g=$s(),e!==null&&!at?(ks(e,t,s),Bn(e,t,s)):(xe&&g&&Ss(t),t.flags|=1,ut(e,t,i,s),t.child)}function _v(e,t,a,i,s){if(e===null){var c=a.type;return typeof c=="function"&&!ys(c)&&c.defaultProps===void 0&&a.compare===null?(t.tag=15,t.type=c,Sv(e,t,c,i,s)):(e=Xr(a.type,null,i,t,t.mode,s),e.ref=t.ref,e.return=t,t.child=e)}if(c=e.child,!sc(e,s)){var 
h=c.memoizedProps;if(a=a.compare,a=a!==null?a:_i,a(h,i)&&e.ref===t.ref)return Bn(e,t,s)}return t.flags|=1,e=Nn(c,i),e.ref=t.ref,e.return=t,t.child=e}function Sv(e,t,a,i,s){if(e!==null){var c=e.memoizedProps;if(_i(c,i)&&e.ref===t.ref)if(at=!1,t.pendingProps=i=c,sc(e,s))(e.flags&131072)!==0&&(at=!0);else return t.lanes=e.lanes,Bn(e,t,s)}return nc(e,t,a,i,s)}function xv(e,t,a){var i=t.pendingProps,s=i.children,c=e!==null?e.memoizedState:null;if(i.mode==="hidden"){if((t.flags&128)!==0){if(i=c!==null?c.baseLanes|a:a,e!==null){for(s=t.child=e.child,c=0;s!==null;)c=c|s.lanes|s.childLanes,s=s.sibling;t.childLanes=c&~i}else t.childLanes=0,t.child=null;return Ev(e,t,i,a)}if((a&536870912)!==0)t.memoizedState={baseLanes:0,cachePool:null},e!==null&&Fr(t,c!==null?c.cachePool:null),c!==null?Sh(t,c):Us(),sv(t);else return t.lanes=t.childLanes=536870912,Ev(e,t,c!==null?c.baseLanes|a:a,a)}else c!==null?(Fr(t,c.cachePool),Sh(t,c),oa(),t.memoizedState=null):(e!==null&&Fr(t,null),Us(),oa());return ut(e,t,s,a),t.child}function Ev(e,t,a,i){var s=zs();return s=s===null?null:{parent:et._currentValue,pool:s},t.memoizedState={baseLanes:a,cachePool:s},e!==null&&Fr(t,null),Us(),sv(t),e!==null&&wi(e,t,i,!0),null}function vu(e,t){var a=t.ref;if(a===null)e!==null&&e.ref!==null&&(t.flags|=4194816);else{if(typeof a!="function"&&typeof a!="object")throw Error(u(284));(e===null||e.ref!==a)&&(t.flags|=4194816)}}function nc(e,t,a,i,s){return ka(t),a=Ls(e,t,a,i,void 0,s),i=$s(),e!==null&&!at?(ks(e,t,s),Bn(e,t,s)):(xe&&i&&Ss(t),t.flags|=1,ut(e,t,a,s),t.child)}function Ov(e,t,a,i,s,c){return ka(t),t.updateQueue=null,a=Eh(t,i,a,s),xh(e),i=$s(),e!==null&&!at?(ks(e,t,c),Bn(e,t,c)):(xe&&i&&Ss(t),t.flags|=1,ut(e,t,a,c),t.child)}function wv(e,t,a,i,s){if(ka(t),t.stateNode===null){var c=xl,h=a.contextType;typeof h=="object"&&h!==null&&(c=ft(h)),c=new a(i,c),t.memoizedState=c.state!==null&&c.state!==void 
0?c.state:null,c.updater=ec,t.stateNode=c,c._reactInternals=t,c=t.stateNode,c.props=i,c.state=t.memoizedState,c.refs={},js(t),h=a.contextType,c.context=typeof h=="object"&&h!==null?ft(h):xl,c.state=t.memoizedState,h=a.getDerivedStateFromProps,typeof h=="function"&&(Is(t,a,h,i),c.state=t.memoizedState),typeof a.getDerivedStateFromProps=="function"||typeof c.getSnapshotBeforeUpdate=="function"||typeof c.UNSAFE_componentWillMount!="function"&&typeof c.componentWillMount!="function"||(h=c.state,typeof c.componentWillMount=="function"&&c.componentWillMount(),typeof c.UNSAFE_componentWillMount=="function"&&c.UNSAFE_componentWillMount(),h!==c.state&&ec.enqueueReplaceState(c,c.state,null),Ci(t,i,c,s),ji(),c.state=t.memoizedState),typeof c.componentDidMount=="function"&&(t.flags|=4194308),i=!0}else if(e===null){c=t.stateNode;var g=t.memoizedProps,_=qa(a,g);c.props=_;var R=c.context,M=a.contextType;h=xl,typeof M=="object"&&M!==null&&(h=ft(M));var B=a.getDerivedStateFromProps;M=typeof B=="function"||typeof c.getSnapshotBeforeUpdate=="function",g=t.pendingProps!==g,M||typeof c.UNSAFE_componentWillReceiveProps!="function"&&typeof c.componentWillReceiveProps!="function"||(g||R!==h)&&fv(t,c,i,h),aa=!1;var z=t.memoizedState;c.state=z,Ci(t,i,c,s),ji(),R=t.memoizedState,g||z!==R||aa?(typeof B=="function"&&(Is(t,a,B,i),R=t.memoizedState),(_=aa||cv(t,a,_,i,z,R,h))?(M||typeof c.UNSAFE_componentWillMount!="function"&&typeof c.componentWillMount!="function"||(typeof c.componentWillMount=="function"&&c.componentWillMount(),typeof c.UNSAFE_componentWillMount=="function"&&c.UNSAFE_componentWillMount()),typeof c.componentDidMount=="function"&&(t.flags|=4194308)):(typeof c.componentDidMount=="function"&&(t.flags|=4194308),t.memoizedProps=i,t.memoizedState=R),c.props=i,c.state=R,c.context=h,i=_):(typeof c.componentDidMount=="function"&&(t.flags|=4194308),i=!1)}else{c=t.stateNode,Cs(e,t),h=t.memoizedProps,M=qa(a,h),c.props=M,B=t.pendingProps,z=c.context,R=a.contextType,_=xl,typeof 
R=="object"&&R!==null&&(_=ft(R)),g=a.getDerivedStateFromProps,(R=typeof g=="function"||typeof c.getSnapshotBeforeUpdate=="function")||typeof c.UNSAFE_componentWillReceiveProps!="function"&&typeof c.componentWillReceiveProps!="function"||(h!==B||z!==_)&&fv(t,c,i,_),aa=!1,z=t.memoizedState,c.state=z,Ci(t,i,c,s),ji();var N=t.memoizedState;h!==B||z!==N||aa||e!==null&&e.dependencies!==null&&Pr(e.dependencies)?(typeof g=="function"&&(Is(t,a,g,i),N=t.memoizedState),(M=aa||cv(t,a,M,i,z,N,_)||e!==null&&e.dependencies!==null&&Pr(e.dependencies))?(R||typeof c.UNSAFE_componentWillUpdate!="function"&&typeof c.componentWillUpdate!="function"||(typeof c.componentWillUpdate=="function"&&c.componentWillUpdate(i,N,_),typeof c.UNSAFE_componentWillUpdate=="function"&&c.UNSAFE_componentWillUpdate(i,N,_)),typeof c.componentDidUpdate=="function"&&(t.flags|=4),typeof c.getSnapshotBeforeUpdate=="function"&&(t.flags|=1024)):(typeof c.componentDidUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=4),typeof c.getSnapshotBeforeUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=1024),t.memoizedProps=i,t.memoizedState=N),c.props=i,c.state=N,c.context=_,i=M):(typeof c.componentDidUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=4),typeof c.getSnapshotBeforeUpdate!="function"||h===e.memoizedProps&&z===e.memoizedState||(t.flags|=1024),i=!1)}return c=i,vu(e,t),i=(t.flags&128)!==0,c||i?(c=t.stateNode,a=i&&typeof a.getDerivedStateFromError!="function"?null:c.render(),t.flags|=1,e!==null&&i?(t.child=Cl(t,e.child,null,s),t.child=Cl(t,null,a,s)):ut(e,t,a,s),t.memoizedState=c.state,e=t.child):e=Bn(e,t,s),e}function Av(e,t,a,i){return Ei(),t.flags|=256,ut(e,t,a,i),t.child}var ac={dehydrated:null,treeContext:null,retryLane:0,hydrationErrors:null};function lc(e){return{baseLanes:e,cachePool:hh()}}function ic(e,t,a){return e=e!==null?e.childLanes&~a:0,t&&(e|=Pt),e}function Tv(e,t,a){var 
i=t.pendingProps,s=!1,c=(t.flags&128)!==0,h;if((h=c)||(h=e!==null&&e.memoizedState===null?!1:(tt.current&2)!==0),h&&(s=!0,t.flags&=-129),h=(t.flags&32)!==0,t.flags&=-33,e===null){if(xe){if(s?ua(t):oa(),xe){var g=Xe,_;if(_=g){e:{for(_=g,g=vn;_.nodeType!==8;){if(!g){g=null;break e}if(_=an(_.nextSibling),_===null){g=null;break e}}g=_}g!==null?(t.memoizedState={dehydrated:g,treeContext:Ua!==null?{id:jn,overflow:Cn}:null,retryLane:536870912,hydrationErrors:null},_=Nt(18,null,null,0),_.stateNode=g,_.return=t,t.child=_,vt=t,Xe=null,_=!0):_=!1}_||La(t)}if(g=t.memoizedState,g!==null&&(g=g.dehydrated,g!==null))return Vc(g)?t.lanes=32:t.lanes=536870912,null;Zn(t)}return g=i.children,i=i.fallback,s?(oa(),s=t.mode,g=mu({mode:"hidden",children:g},s),i=Ma(i,s,a,null),g.return=t,i.return=t,g.sibling=i,t.child=g,s=t.child,s.memoizedState=lc(a),s.childLanes=ic(e,h,a),t.memoizedState=ac,i):(ua(t),rc(t,g))}if(_=e.memoizedState,_!==null&&(g=_.dehydrated,g!==null)){if(c)t.flags&256?(ua(t),t.flags&=-257,t=uc(e,t,a)):t.memoizedState!==null?(oa(),t.child=e.child,t.flags|=128,t=null):(oa(),s=i.fallback,g=t.mode,i=mu({mode:"visible",children:i.children},g),s=Ma(s,g,a,null),s.flags|=2,i.return=t,s.return=t,i.sibling=s,t.child=i,Cl(t,e.child,null,a),i=t.child,i.memoizedState=lc(a),i.childLanes=ic(e,h,a),t.memoizedState=ac,t=s);else if(ua(t),Vc(g)){if(h=g.nextSibling&&g.nextSibling.dataset,h)var R=h.dgst;h=R,i=Error(u(419)),i.stack="",i.digest=h,Oi({value:i,source:null,stack:null}),t=uc(e,t,a)}else if(at||wi(e,t,a,!1),h=(a&e.childLanes)!==0,at||h){if(h=De,h!==null&&(i=a&-a,i=(i&42)!==0?1:qo(i),i=(i&(h.suspendedLanes|a))!==0?0:i,i!==0&&i!==_.retryLane))throw _.retryLane=i,Sl(e,i),Ut(h,e,i),yv;g.data==="$?"||wc(),t=uc(e,t,a)}else g.data==="$?"?(t.flags|=192,t.child=e.child,t=null):(e=_.treeContext,Xe=an(g.nextSibling),vt=t,xe=!0,Ba=null,vn=!1,e!==null&&(Xt[Kt++]=jn,Xt[Kt++]=Cn,Xt[Kt++]=Ua,jn=e.id,Cn=e.overflow,Ua=t),t=rc(t,i.children),t.flags|=4096);return t}return 
s?(oa(),s=i.fallback,g=t.mode,_=e.child,R=_.sibling,i=Nn(_,{mode:"hidden",children:i.children}),i.subtreeFlags=_.subtreeFlags&65011712,R!==null?s=Nn(R,s):(s=Ma(s,g,a,null),s.flags|=2),s.return=t,i.return=t,i.sibling=s,t.child=i,i=s,s=t.child,g=e.child.memoizedState,g===null?g=lc(a):(_=g.cachePool,_!==null?(R=et._currentValue,_=_.parent!==R?{parent:R,pool:R}:_):_=hh(),g={baseLanes:g.baseLanes|a,cachePool:_}),s.memoizedState=g,s.childLanes=ic(e,h,a),t.memoizedState=ac,i):(ua(t),a=e.child,e=a.sibling,a=Nn(a,{mode:"visible",children:i.children}),a.return=t,a.sibling=null,e!==null&&(h=t.deletions,h===null?(t.deletions=[e],t.flags|=16):h.push(e)),t.child=a,t.memoizedState=null,a)}function rc(e,t){return t=mu({mode:"visible",children:t},e.mode),t.return=e,e.child=t}function mu(e,t){return e=Nt(22,e,null,t),e.lanes=0,e.stateNode={_visibility:1,_pendingMarkers:null,_retryCache:null,_transitions:null},e}function uc(e,t,a){return Cl(t,e.child,null,a),e=rc(t,t.pendingProps.children),e.flags|=2,t.memoizedState=null,e}function Rv(e,t,a){e.lanes|=t;var i=e.alternate;i!==null&&(i.lanes|=t),ws(e.return,t,a)}function oc(e,t,a,i,s){var c=e.memoizedState;c===null?e.memoizedState={isBackwards:t,rendering:null,renderingStartTime:0,last:i,tail:a,tailMode:s}:(c.isBackwards=t,c.rendering=null,c.renderingStartTime=0,c.last=i,c.tail=a,c.tailMode=s)}function zv(e,t,a){var i=t.pendingProps,s=i.revealOrder,c=i.tail;if(ut(e,t,i.children,a),i=tt.current,(i&2)!==0)i=i&1|2,t.flags|=128;else{if(e!==null&&(e.flags&128)!==0)e:for(e=t.child;e!==null;){if(e.tag===13)e.memoizedState!==null&&Rv(e,a,t);else if(e.tag===19)Rv(e,a,t);else if(e.child!==null){e.child.return=e,e=e.child;continue}if(e===t)break e;for(;e.sibling===null;){if(e.return===null||e.return===t)break 
e;e=e.return}e.sibling.return=e.return,e=e.sibling}i&=1}switch(P(tt,i),s){case"forwards":for(a=t.child,s=null;a!==null;)e=a.alternate,e!==null&&fu(e)===null&&(s=a),a=a.sibling;a=s,a===null?(s=t.child,t.child=null):(s=a.sibling,a.sibling=null),oc(t,!1,s,a,c);break;case"backwards":for(a=null,s=t.child,t.child=null;s!==null;){if(e=s.alternate,e!==null&&fu(e)===null){t.child=s;break}e=s.sibling,s.sibling=a,a=s,s=e}oc(t,!0,a,null,c);break;case"together":oc(t,!1,null,null,void 0);break;default:t.memoizedState=null}return t.child}function Bn(e,t,a){if(e!==null&&(t.dependencies=e.dependencies),ha|=t.lanes,(a&t.childLanes)===0)if(e!==null){if(wi(e,t,a,!1),(a&t.childLanes)===0)return null}else return null;if(e!==null&&t.child!==e.child)throw Error(u(153));if(t.child!==null){for(e=t.child,a=Nn(e,e.pendingProps),t.child=a,a.return=t;e.sibling!==null;)e=e.sibling,a=a.sibling=Nn(e,e.pendingProps),a.return=t;a.sibling=null}return t.child}function sc(e,t){return(e.lanes&t)!==0?!0:(e=e.dependencies,!!(e!==null&&Pr(e)))}function w0(e,t,a){switch(t.tag){case 3:Ze(t,t.stateNode.containerInfo),na(t,et,e.memoizedState.cache),Ei();break;case 27:case 5:ri(t);break;case 4:Ze(t,t.stateNode.containerInfo);break;case 10:na(t,t.type,t.memoizedProps.value);break;case 13:var i=t.memoizedState;if(i!==null)return i.dehydrated!==null?(ua(t),t.flags|=128,null):(a&t.child.childLanes)!==0?Tv(e,t,a):(ua(t),e=Bn(e,t,a),e!==null?e.sibling:null);ua(t);break;case 19:var s=(e.flags&128)!==0;if(i=(a&t.childLanes)!==0,i||(wi(e,t,a,!1),i=(a&t.childLanes)!==0),s){if(i)return zv(e,t,a);t.flags|=128}if(s=t.memoizedState,s!==null&&(s.rendering=null,s.tail=null,s.lastEffect=null),P(tt,tt.current),i)break;return null;case 22:case 23:return t.lanes=0,xv(e,t,a);case 24:na(t,et,e.memoizedState.cache)}return Bn(e,t,a)}function Nv(e,t,a){if(e!==null)if(e.memoizedProps!==t.pendingProps)at=!0;else{if(!sc(e,a)&&(t.flags&128)===0)return at=!1,w0(e,t,a);at=(e.flags&131072)!==0}else 
at=!1,xe&&(t.flags&1048576)!==0&&rh(t,Qr,t.index);switch(t.lanes=0,t.tag){case 16:e:{e=t.pendingProps;var i=t.elementType,s=i._init;if(i=s(i._payload),t.type=i,typeof i=="function")ys(i)?(e=qa(i,e),t.tag=1,t=wv(null,t,i,e,a)):(t.tag=0,t=nc(null,t,i,e,a));else{if(i!=null){if(s=i.$$typeof,s===X){t.tag=11,t=bv(null,t,i,e,a);break e}else if(s===pe){t.tag=14,t=_v(null,t,i,e,a);break e}}throw t=dn(i)||i,Error(u(306,t,""))}}return t;case 0:return nc(e,t,t.type,t.pendingProps,a);case 1:return i=t.type,s=qa(i,t.pendingProps),wv(e,t,i,s,a);case 3:e:{if(Ze(t,t.stateNode.containerInfo),e===null)throw Error(u(387));i=t.pendingProps;var c=t.memoizedState;s=c.element,Cs(e,t),Ci(t,i,null,a);var h=t.memoizedState;if(i=h.cache,na(t,et,i),i!==c.cache&&As(t,[et],a,!0),ji(),i=h.element,c.isDehydrated)if(c={element:i,isDehydrated:!1,cache:h.cache},t.updateQueue.baseState=c,t.memoizedState=c,t.flags&256){t=Av(e,t,i,a);break e}else if(i!==s){s=Gt(Error(u(424)),t),Oi(s),t=Av(e,t,i,a);break e}else{switch(e=t.stateNode.containerInfo,e.nodeType){case 9:e=e.body;break;default:e=e.nodeName==="HTML"?e.ownerDocument.body:e}for(Xe=an(e.firstChild),vt=t,xe=!0,Ba=null,vn=!0,a=ov(t,null,i,a),t.child=a;a;)a.flags=a.flags&-3|4096,a=a.sibling}else{if(Ei(),i===s){t=Bn(e,t,a);break e}ut(e,t,i,a)}t=t.child}return t;case 26:return vu(e,t),e===null?(a=Mm(t.type,null,t.pendingProps,null))?t.memoizedState=a:xe||(a=t.type,e=t.pendingProps,i=zu(ne.current).createElement(a),i[ct]=t,i[yt]=e,st(i,a,e),nt(i),t.stateNode=i):t.memoizedState=Mm(t.type,e.memoizedProps,t.pendingProps,e.memoizedState),null;case 27:return ri(t),e===null&&xe&&(i=t.stateNode=jm(t.type,t.pendingProps,ne.current),vt=t,vn=!0,s=Xe,ga(t.type)?(qc=s,Xe=an(i.firstChild)):Xe=s),ut(e,t,t.pendingProps.children,a),vu(e,t),e===null&&(t.flags|=4194304),t.child;case 5:return 
e===null&&xe&&((s=i=Xe)&&(i=I0(i,t.type,t.pendingProps,vn),i!==null?(t.stateNode=i,vt=t,Xe=an(i.firstChild),vn=!1,s=!0):s=!1),s||La(t)),ri(t),s=t.type,c=t.pendingProps,h=e!==null?e.memoizedProps:null,i=c.children,$c(s,c)?i=null:h!==null&&$c(s,h)&&(t.flags|=32),t.memoizedState!==null&&(s=Ls(e,t,g0,null,null,a),er._currentValue=s),vu(e,t),ut(e,t,i,a),t.child;case 6:return e===null&&xe&&((e=a=Xe)&&(a=e_(a,t.pendingProps,vn),a!==null?(t.stateNode=a,vt=t,Xe=null,e=!0):e=!1),e||La(t)),null;case 13:return Tv(e,t,a);case 4:return Ze(t,t.stateNode.containerInfo),i=t.pendingProps,e===null?t.child=Cl(t,null,i,a):ut(e,t,i,a),t.child;case 11:return bv(e,t,t.type,t.pendingProps,a);case 7:return ut(e,t,t.pendingProps,a),t.child;case 8:return ut(e,t,t.pendingProps.children,a),t.child;case 12:return ut(e,t,t.pendingProps.children,a),t.child;case 10:return i=t.pendingProps,na(t,t.type,i.value),ut(e,t,i.children,a),t.child;case 9:return s=t.type._context,i=t.pendingProps.children,ka(t),s=ft(s),i=i(s),t.flags|=1,ut(e,t,i,a),t.child;case 14:return _v(e,t,t.type,t.pendingProps,a);case 15:return Sv(e,t,t.type,t.pendingProps,a);case 19:return zv(e,t,a);case 31:return i=t.pendingProps,a=t.mode,i={mode:i.mode,children:i.children},e===null?(a=mu(i,a),a.ref=t.ref,t.child=a,a.return=t,t=a):(a=Nn(e.child,i),a.ref=t.ref,t.child=a,a.return=t,t=a),t;case 22:return xv(e,t,a);case 24:return ka(t),i=ft(et),e===null?(s=zs(),s===null&&(s=De,c=Ts(),s.pooledCache=c,c.refCount++,c!==null&&(s.pooledCacheLanes|=a),s=c),t.memoizedState={parent:i,cache:s},js(t),na(t,et,s)):((e.lanes&a)!==0&&(Cs(e,t),Ci(t,null,null,a),ji()),s=e.memoizedState,c=t.memoizedState,s.parent!==i?(s={parent:i,cache:i},t.memoizedState=s,t.lanes===0&&(t.memoizedState=t.updateQueue.baseState=s),na(t,et,i)):(i=c.cache,na(t,et,i),i!==s.cache&&As(t,[et],a,!0))),ut(e,t,t.pendingProps.children,a),t.child;case 29:throw t.pendingProps}throw Error(u(156,t.tag))}function Ln(e){e.flags|=4}function 
jv(e,t){if(t.type!=="stylesheet"||(t.state.loading&4)!==0)e.flags&=-16777217;else if(e.flags|=16777216,!$m(t)){if(t=Qt.current,t!==null&&((ge&4194048)===ge?mn!==null:(ge&62914560)!==ge&&(ge&536870912)===0||t!==mn))throw zi=Ns,vh;e.flags|=8192}}function pu(e,t){t!==null&&(e.flags|=4),e.flags&16384&&(t=e.tag!==22?sd():536870912,e.lanes|=t,Zl|=t)}function $i(e,t){if(!xe)switch(e.tailMode){case"hidden":t=e.tail;for(var a=null;t!==null;)t.alternate!==null&&(a=t),t=t.sibling;a===null?e.tail=null:a.sibling=null;break;case"collapsed":a=e.tail;for(var i=null;a!==null;)a.alternate!==null&&(i=a),a=a.sibling;i===null?t||e.tail===null?e.tail=null:e.tail.sibling=null:i.sibling=null}}function qe(e){var t=e.alternate!==null&&e.alternate.child===e.child,a=0,i=0;if(t)for(var s=e.child;s!==null;)a|=s.lanes|s.childLanes,i|=s.subtreeFlags&65011712,i|=s.flags&65011712,s.return=e,s=s.sibling;else for(s=e.child;s!==null;)a|=s.lanes|s.childLanes,i|=s.subtreeFlags,i|=s.flags,s.return=e,s=s.sibling;return e.subtreeFlags|=i,e.childLanes=a,t}function A0(e,t,a){var i=t.pendingProps;switch(xs(t),t.tag){case 31:case 16:case 15:case 0:case 11:case 7:case 8:case 12:case 9:case 14:return qe(t),null;case 1:return qe(t),null;case 3:return a=t.stateNode,i=null,e!==null&&(i=e.memoizedState.cache),t.memoizedState.cache!==i&&(t.flags|=2048),Mn(et),Ht(),a.pendingContext&&(a.context=a.pendingContext,a.pendingContext=null),(e===null||e.child===null)&&(xi(t)?Ln(t):e===null||e.memoizedState.isDehydrated&&(t.flags&256)===0||(t.flags|=1024,sh())),qe(t),null;case 26:return a=t.memoizedState,e===null?(Ln(t),a!==null?(qe(t),jv(t,a)):(qe(t),t.flags&=-16777217)):a?a!==e.memoizedState?(Ln(t),qe(t),jv(t,a)):(qe(t),t.flags&=-16777217):(e.memoizedProps!==i&&Ln(t),qe(t),t.flags&=-16777217),null;case 27:rl(t),a=ne.current;var s=t.type;if(e!==null&&t.stateNode!=null)e.memoizedProps!==i&&Ln(t);else{if(!i){if(t.stateNode===null)throw Error(u(166));return 
qe(t),null}e=q.current,xi(t)?uh(t):(e=jm(s,i,a),t.stateNode=e,Ln(t))}return qe(t),null;case 5:if(rl(t),a=t.type,e!==null&&t.stateNode!=null)e.memoizedProps!==i&&Ln(t);else{if(!i){if(t.stateNode===null)throw Error(u(166));return qe(t),null}if(e=q.current,xi(t))uh(t);else{switch(s=zu(ne.current),e){case 1:e=s.createElementNS("http://www.w3.org/2000/svg",a);break;case 2:e=s.createElementNS("http://www.w3.org/1998/Math/MathML",a);break;default:switch(a){case"svg":e=s.createElementNS("http://www.w3.org/2000/svg",a);break;case"math":e=s.createElementNS("http://www.w3.org/1998/Math/MathML",a);break;case"script":e=s.createElement("div"),e.innerHTML=" - + +
diff --git a/vite-app/src/GlobalState.tsx b/vite-app/src/GlobalState.tsx index 4d7f6de1..94f760c5 100644 --- a/vite-app/src/GlobalState.tsx +++ b/vite-app/src/GlobalState.tsx @@ -11,11 +11,10 @@ const DEFAULT_PIVOT_CONFIG: PivotConfig = { selectedColumnFields: ["$.input_metadata.completion_params.model"], selectedValueField: "$.evaluation_result.score", selectedAggregator: "avg", - filters: [], }; -// Default table filter configuration -const DEFAULT_TABLE_FILTER_CONFIG: FilterGroup[] = []; +// Default filter configuration +const DEFAULT_FILTER_CONFIG: FilterGroup[] = []; // Default pagination configuration const DEFAULT_PAGINATION_CONFIG = { @@ -31,10 +30,10 @@ export class GlobalState { expandedRows: Record = {}; // Pivot configuration pivotConfig: PivotConfig; - // Table filter configuration - tableFilterConfig: FilterGroup[]; - // Debounced, actually applied table filter configuration (for performance while typing) - appliedTableFilterConfig: FilterGroup[]; + // Unified filter configuration for both pivot and table views + filterConfig: FilterGroup[]; + // Debounced, actually applied filter configuration (for performance while typing) + appliedFilterConfig: FilterGroup[]; // Pagination configuration currentPage: number; pageSize: number; @@ -49,19 +48,18 @@ export class GlobalState { // Debounce timers for localStorage saves and filter application private savePivotConfigTimer: ReturnType | null = null; - private saveTableFilterConfigTimer: ReturnType | null = - null; + private saveFilterConfigTimer: ReturnType | null = null; private savePaginationConfigTimer: ReturnType | null = null; - private applyTableFilterTimer: ReturnType | null = null; + private applyFilterTimer: ReturnType | null = null; constructor() { // Load pivot config from localStorage or use defaults this.pivotConfig = this.loadPivotConfig(); - // Load table filter config from localStorage or use defaults - this.tableFilterConfig = this.loadTableFilterConfig(); + // Load filter config from 
localStorage or use defaults + this.filterConfig = this.loadFilterConfig(); // Initialize applied filter config with current value - this.appliedTableFilterConfig = this.tableFilterConfig.slice(); + this.appliedFilterConfig = this.filterConfig.slice(); // Load pagination config from localStorage or use defaults const paginationConfig = this.loadPaginationConfig(); this.currentPage = paginationConfig.currentPage; @@ -84,21 +82,18 @@ export class GlobalState { return { ...DEFAULT_PIVOT_CONFIG }; } - // Load table filter configuration from localStorage - private loadTableFilterConfig(): FilterGroup[] { + // Load filter configuration from localStorage + private loadFilterConfig(): FilterGroup[] { try { - const stored = localStorage.getItem("tableFilterConfig"); + const stored = localStorage.getItem("filterConfig"); if (stored) { const parsed = JSON.parse(stored); - return Array.isArray(parsed) ? parsed : DEFAULT_TABLE_FILTER_CONFIG; + return Array.isArray(parsed) ? parsed : DEFAULT_FILTER_CONFIG; } } catch (error) { - console.warn( - "Failed to load table filter config from localStorage:", - error - ); + console.warn("Failed to load filter config from localStorage:", error); } - return DEFAULT_TABLE_FILTER_CONFIG; + return DEFAULT_FILTER_CONFIG; } // Load pagination configuration from localStorage @@ -116,7 +111,7 @@ export class GlobalState { error ); } - return { ...DEFAULT_PAGINATION_CONFIG }; + return DEFAULT_PAGINATION_CONFIG; } // Save pivot configuration to localStorage @@ -131,21 +126,14 @@ export class GlobalState { }, 200); } - // Save table filter configuration to localStorage - private saveTableFilterConfig() { - if (this.saveTableFilterConfigTimer) - clearTimeout(this.saveTableFilterConfigTimer); - this.saveTableFilterConfigTimer = setTimeout(() => { + // Save filter configuration to localStorage + private saveFilterConfig() { + if (this.saveFilterConfigTimer) clearTimeout(this.saveFilterConfigTimer); + this.saveFilterConfigTimer = setTimeout(() => { try { 
- localStorage.setItem( - "tableFilterConfig", - JSON.stringify(this.tableFilterConfig) - ); + localStorage.setItem("filterConfig", JSON.stringify(this.filterConfig)); } catch (error) { - console.warn( - "Failed to save table filter config to localStorage:", - error - ); + console.warn("Failed to save filter config to localStorage:", error); } }, 200); } @@ -178,15 +166,15 @@ export class GlobalState { this.savePivotConfig(); } - // Update table filter configuration and save to localStorage - updateTableFilterConfig(filters: FilterGroup[]) { - this.tableFilterConfig = filters; - this.saveTableFilterConfig(); + // Update filter configuration and save to localStorage + updateFilterConfig(filters: FilterGroup[]) { + this.filterConfig = filters; + this.saveFilterConfig(); // Debounce application of filters to avoid re-filtering on every keystroke - if (this.applyTableFilterTimer) clearTimeout(this.applyTableFilterTimer); - this.applyTableFilterTimer = setTimeout(() => { - this.appliedTableFilterConfig = this.tableFilterConfig.slice(); + if (this.applyFilterTimer) clearTimeout(this.applyFilterTimer); + this.applyFilterTimer = setTimeout(() => { + this.appliedFilterConfig = this.filterConfig.slice(); }, 150); } @@ -205,18 +193,15 @@ export class GlobalState { // Reset pivot configuration to defaults resetPivotConfig() { - this.pivotConfig = { - ...DEFAULT_PIVOT_CONFIG, - filters: [], // Ensure filters is an empty array of FilterGroups - }; + this.pivotConfig = { ...DEFAULT_PIVOT_CONFIG }; this.savePivotConfig(); } - // Reset table filter configuration to defaults - resetTableFilterConfig() { - this.tableFilterConfig = [...DEFAULT_TABLE_FILTER_CONFIG]; - this.appliedTableFilterConfig = [...DEFAULT_TABLE_FILTER_CONFIG]; - this.saveTableFilterConfig(); + // Reset filter configuration to defaults + resetFilterConfig() { + this.filterConfig = [...DEFAULT_FILTER_CONFIG]; + this.appliedFilterConfig = [...DEFAULT_FILTER_CONFIG]; + this.saveFilterConfig(); } // Reset pagination 
configuration to defaults @@ -315,20 +300,20 @@ export class GlobalState { } get filteredFlattenedDataset() { - if (this.appliedTableFilterConfig.length === 0) { + if (this.appliedFilterConfig.length === 0) { return this.flattenedDataset; } - const filterFunction = createFilterFunction(this.appliedTableFilterConfig)!; + const filterFunction = createFilterFunction(this.appliedFilterConfig)!; return this.flattenedDataset.filter(filterFunction); } get filteredOriginalDataset() { - if (this.appliedTableFilterConfig.length === 0) { + if (this.appliedFilterConfig.length === 0) { return this.sortedDataset; } - const filterFunction = createFilterFunction(this.appliedTableFilterConfig)!; + const filterFunction = createFilterFunction(this.appliedFilterConfig)!; return this.sortedIds .filter((id) => filterFunction(this.flattenedById[id])) .map((id) => this.dataset[id]); diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx index 03412e61..66712836 100644 --- a/vite-app/src/components/EvaluationRow.tsx +++ b/vite-app/src/components/EvaluationRow.tsx @@ -5,6 +5,110 @@ import { MetadataSection } from "./MetadataSection"; import StatusIndicator from "./StatusIndicator"; import { state } from "../App"; import { TableCell, TableRowInteractive } from "./TableContainer"; +import { useState } from "react"; +import type { FilterGroup, FilterConfig } from "../types/filters"; +import { Tooltip } from "./Tooltip"; + +// Add filter button component +const AddFilterButton = observer( + ({ + fieldPath, + value, + label, + }: { + fieldPath: string; + value: string; + label: string; + }) => { + const [added, setAdded] = useState(false); + + const handleClick = (e: React.MouseEvent) => { + e.stopPropagation(); // Prevent row expansion + + // Create a new filter for this field/value + const newFilter: FilterConfig = { + field: fieldPath, + operator: "==", + value: value, + type: "text", + }; + + // Add the filter to the existing filter configuration + 
const currentFilters = state.filterConfig; + let newFilters: FilterGroup[]; + + if (currentFilters.length === 0) { + // If no filters exist, create a new filter group + newFilters = [ + { + logic: "AND", + filters: [newFilter], + }, + ]; + } else { + // Add to the first filter group (assuming AND logic) + newFilters = [...currentFilters]; + newFilters[0] = { + ...newFilters[0], + filters: [...newFilters[0].filters, newFilter], + }; + } + + state.updateFilterConfig(newFilters); + setAdded(true); + + // Reset to "Add Filter" state after 2 seconds + setTimeout(() => setAdded(false), 2000); + }; + + return ( + +
+ +
+
+ ); + } +); // Small, focused components following "dereference values late" principle const ExpandIcon = observer(({ rolloutId }: { rolloutId?: string }) => { @@ -64,6 +168,22 @@ const RolloutId = observer( } ); +const InvocationId = observer(({ invocationId }: { invocationId?: string }) => { + if (!invocationId) { + return null; + } + return ( + + {invocationId} + + + ); +}); + const RowModel = observer(({ model }: { model: string | undefined }) => ( {model || "N/A"} )); @@ -224,6 +344,13 @@ export const EvaluationRow = observer( /> + {/* Invocation ID */} + + + + {/* Rollout ID */} @@ -248,7 +375,7 @@ export const EvaluationRow = observer( {/* Expanded Content Row */} {isExpanded && ( - + { }; const handleFiltersChange = (filters: any[]) => { - state.updateTableFilterConfig(filters); + state.updateFilterConfig(filters); }; return ( @@ -59,7 +59,7 @@ export const EvaluationTable = observer(() => {

Table Filters

- {state.tableFilterConfig.length > 0 ? ( + {state.filterConfig.length > 0 ? ( <> Showing {totalRows} of {state.sortedDataset.length} rows {totalRows !== state.sortedDataset.length && ( @@ -74,7 +74,7 @@ export const EvaluationTable = observer(() => {
{   Name Status + Invocation ID Rollout ID Model Score diff --git a/vite-app/src/components/PivotTab.tsx b/vite-app/src/components/PivotTab.tsx index a381022c..0daa4fc3 100644 --- a/vite-app/src/components/PivotTab.tsx +++ b/vite-app/src/components/PivotTab.tsx @@ -148,7 +148,7 @@ const PivotTab = observer(() => { }; const updateFilters = (filters: FilterGroup[]) => { - state.updatePivotConfig({ filters }); + state.updateFilterConfig(filters); }; const createFieldHandler = ( @@ -246,7 +246,7 @@ const PivotTab = observer(() => { /> { } showRowTotals showColumnTotals - filter={createFilterFunction(pivotConfig.filters)} + filter={createFilterFunction(state.filterConfig)} />
); diff --git a/vite-app/src/components/Tooltip.tsx b/vite-app/src/components/Tooltip.tsx new file mode 100644 index 00000000..96bbfc53 --- /dev/null +++ b/vite-app/src/components/Tooltip.tsx @@ -0,0 +1,41 @@ +import React from "react"; + +interface TooltipProps { + children: React.ReactNode; + content: string; + position?: "top" | "bottom" | "left" | "right"; + className?: string; +} + +export const Tooltip: React.FC = ({ + children, + content, + position = "top", + className = "", +}) => { + const getPositionClasses = () => { + switch (position) { + case "top": + return "bottom-full left-1/2 transform -translate-x-1/2 mb-2"; + case "bottom": + return "top-full left-1/2 transform -translate-x-1/2 mt-2"; + case "left": + return "right-full top-1/2 transform -translate-y-1/2 mr-2"; + case "right": + return "left-full top-1/2 transform -translate-y-1/2 ml-2"; + default: + return "bottom-full left-1/2 transform -translate-x-1/2 mb-2"; + } + }; + + return ( +
+ {children} +
+ {content} +
+
+ ); +}; diff --git a/vite-app/src/types/filters.ts b/vite-app/src/types/filters.ts index dbd3a7b7..c73bf61f 100644 --- a/vite-app/src/types/filters.ts +++ b/vite-app/src/types/filters.ts @@ -1,16 +1,18 @@ +export type Operator = "==" | "!=" | ">" | "<" | ">=" | "<=" | "contains" | "!contains" | "between"; + // Filter configuration interface export interface FilterConfig { field: string; - operator: string; + operator: Operator; value: string; value2?: string; // For filtering between dates type?: "text" | "date" | "date-range"; } -export interface FilterOperator { - value: string; +export type FilterOperator = { + value: Operator; label: string; -} +}; // Filter group interface for AND/OR logic export interface FilterGroup { @@ -24,5 +26,4 @@ export interface PivotConfig { selectedColumnFields: string[]; selectedValueField: string; selectedAggregator: string; - filters: FilterGroup[]; } diff --git a/vite-app/src/util/filter-utils.ts b/vite-app/src/util/filter-utils.ts index 2f6c90c2..7a311476 100644 --- a/vite-app/src/util/filter-utils.ts +++ b/vite-app/src/util/filter-utils.ts @@ -1,4 +1,4 @@ -import type { FilterConfig, FilterGroup } from "../types/filters"; +import type { FilterConfig, FilterGroup, FilterOperator } from "../types/filters"; // Filter utilities export const isDateField = (field: string): boolean => { @@ -14,7 +14,7 @@ export const getFieldType = (field: string): "text" | "date" | "date-range" => { return isDateField(field) ? 
"date" : "text"; }; -export const getOperatorsForField = (field: string, type?: string) => { +export const getOperatorsForField = (field: string, type?: string): FilterOperator[] => { if (type === "date" || type === "date-range" || isDateField(field)) { return [ { value: ">=", label: "on or after" }, From 4491f565998aa047f3e54e7867e467dc4c61e083 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Thu, 14 Aug 2025 11:12:19 -0700 Subject: [PATCH 16/26] single string tool response should just be string (#81) --- .../pytest/default_agent_rollout_processor.py | 21 ++++++---- .../test_tool_response_single_string.py | 40 +++++++++++++++++++ 2 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 tests/pytest/test_tool_response_single_string.py diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index 50f12231..57b3ef73 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -74,14 +74,9 @@ async def call_agent(self) -> str: # Add all tool results to messages (they will be in the same order as tool_calls) for tool_call, (tool_call_id, content) in zip(message.tool_calls, tool_results): + tool_message_content = self._format_tool_message_content(content) self.append_message_and_log( - Message( - role="tool", - content=[ - ChatCompletionContentPartTextParam(text=content.text, type="text") for content in content - ], - tool_call_id=tool_call_id, - ) + Message(role="tool", content=tool_message_content, tool_call_id=tool_call_id) ) return await self.call_agent() return message.content @@ -114,6 +109,18 @@ def _get_content_from_tool_result(self, tool_result: CallToolResult) -> List[Tex raise NotImplementedError("Non-text content is not supported yet") return tool_result.content + def _format_tool_message_content( + self, content: List[TextContent] + ) -> Union[str, 
List[ChatCompletionContentPartTextParam]]: + """Format tool result content for inclusion in a tool message. + + - If a single text item, return plain string per OpenAI semantics. + - If multiple items, return a list of text parts. + """ + if len(content) == 1 and isinstance(content[0], TextContent): + return content[0].text + return [ChatCompletionContentPartTextParam(text=c.text, type="text") for c in content] + async def default_agent_rollout_processor( rows: List[EvaluationRow], config: RolloutProcessorConfig diff --git a/tests/pytest/test_tool_response_single_string.py b/tests/pytest/test_tool_response_single_string.py new file mode 100644 index 00000000..87d1c391 --- /dev/null +++ b/tests/pytest/test_tool_response_single_string.py @@ -0,0 +1,40 @@ +import asyncio +from typing import List, Optional + +from mcp.types import TextContent +from openai.types.chat.chat_completion_message import ( + ChatCompletionMessageToolCall, + FunctionCall, +) + +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.default_agent_rollout_processor import Agent + + +class NoOpLogger: + def log(self, row: EvaluationRow) -> None: + return None + + def read(self, row_id: Optional[str] = None) -> List[EvaluationRow]: + return [] + + +def test_tool_result_single_text_becomes_string(): + # Prepare a minimal evaluation row and agent + row = EvaluationRow(messages=[Message(role="user", content="use the tool")]) + agent = Agent(model="dummy", row=row, config_path="", logger=NoOpLogger()) + + # Single text content becomes a plain string + single = [TextContent(type="text", text="single result")] + formatted = agent._format_tool_message_content(single) + assert isinstance(formatted, str) + assert formatted == "single result" + + # Multiple text contents become a list of text parts + multiple = [ + TextContent(type="text", text="first"), + TextContent(type="text", text="second"), + ] + formatted_multi = agent._format_tool_message_content(multiple) + assert 
isinstance(formatted_multi, list) + assert [part["text"] for part in formatted_multi] == ["first", "second"] From d81b1f4bf6952bd236e5bac236332c118dc0bae9 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Thu, 14 Aug 2025 12:00:09 -0700 Subject: [PATCH 17/26] SVG Bench example (#82) --- pyproject.toml | 3 + tests/pytest/data/svgbench_dataset.jsonl | 105 +++++ .../pytest/data/svgbench_sample_dataset.jsonl | 3 + tests/pytest/test_svgbench.py | 379 ++++++++++++++++++ uv.lock | 93 ++++- 5 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 tests/pytest/data/svgbench_dataset.jsonl create mode 100644 tests/pytest/data/svgbench_sample_dataset.jsonl create mode 100644 tests/pytest/test_svgbench.py diff --git a/pyproject.toml b/pyproject.toml index 8820cfc6..4026ce9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -114,6 +114,9 @@ adapters = [ "datasets>=2.0.0", "transformers>=4.0.0", ] +svgbench = [ + "selenium>=4.0.0", +] [tool.pytest.ini_options] addopts = "-q" diff --git a/tests/pytest/data/svgbench_dataset.jsonl b/tests/pytest/data/svgbench_dataset.jsonl new file mode 100644 index 00000000..5f2d0fc4 --- /dev/null +++ b/tests/pytest/data/svgbench_dataset.jsonl @@ -0,0 +1,105 @@ +{"requirements": ["Create a cow with clearly recognizable bovine features, including a body, head, four legs, tail, and udder.", "The cow must have black and white patches for its coloring.", "Add cow ears, eyes, and snout for facial recognition.", "Position the cow in a realistic plowing stance, leaning forward as if pulling.", "The cow's hooves must be colored brown, as if covered in soil from the field.", "Include a traditional wooden plow with a visible metal blade/share.", "Depict a wooden yoke across the cow's shoulders, connected to the plow by visible chains.", "The plow's blade must be partially buried in the soil, actively turning over a chunk of earth.", "Show at least three distinct, dark furrows in the soil trailing 
directly behind the plow.", "The field must be split into a plowed section and an unplowed section, with the cow and plow positioned at the boundary between them.", "The unplowed section of the field must have short green grass, which is visibly being overturned by the plow.", "Add a simple background with a clear horizon line and a blue sky containing a yellow sun."], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"} +{"requirements": ["The overall background of the SVG must be white.", "All primary elements (logo, search bar, buttons) must be horizontally centered on the canvas.", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red).", "Place a prominent search bar directly below the Google logo, with a vertical spacing equal to half the height of the logo.", "The search bar must be a rounded rectangle with a light gray border.", "The search bar must contain a gray magnifying glass icon perfectly aligned to the left side, inside the bar.", "The search bar must contain a gray microphone icon perfectly aligned to the right side, inside the bar.", "Place two distinct buttons below the search bar, horizontally centered with the search bar, and with a small, consistent gap between them.", "The left button must be labeled 'Google Search'.", "The right button must be labeled 'I'm Feeling Lucky'.", "Both buttons must have a light gray background, a thin gray border, and dark gray text.", "Create a header section at the top right of the canvas, with all its items vertically aligned with each other.", "The header must include text links for 'Gmail' and 'Images'.", "The header must include a 3x3 grid icon (Google Apps launcher) positioned between the 'Images' link and the 'Sign in' button.", "The header must include a prominent 'Sign in' button with a blue background and white text, positioned at the far right of the header."], "prompt": "Write `svg` code for a 
screenshot of the [Google homepage](https://google.com).", "id": "write_`svg`_code"} +{"requirements": ["Create an elliptical shape for the top surface of a round dinner table with a dark wood grain texture.", "Include exactly 4 sets of cutlery arranged around the table.", "Each cutlery set must consist of a recognizable fork, knife, and spoon.", "Position the 4 cutlery sets at distinct place settings (at 12, 3, 6, and 9 o'clock positions).", "Include a round dinner plate at each of the 4 place settings.", "The fork of each cutlery set must be placed to the left of its corresponding plate, and the knife and spoon to the right.", "Place exactly 3 main food dishes in the center of the table.", "First dish: A recognizable roasted turkey, golden-brown in color, showing a plump body, with one drumstick clearly carved off and missing.", "The turkey must be presented on its own large serving platter.", "Second dish: A round pizza with visible crust and toppings, cut into slices, with one slice missing from the pizza.", "The missing slice of pizza must be placed on the dinner plate at the 3 o'clock position.", "The missing turkey drumstick must be placed on the dinner plate at the 9 o'clock position.", "Third dish: A serving of at least two tacos with visible folded shells and fillings, presented in a red taco holder.", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap.", "The overall perspective must be slightly isometric."], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"} +{"requirements": ["Create a central, cylindrical rocket body colored bright blue.", "Add a pointed, red nose cone attached to the top of the rocket body.", "Include exactly three yellow stabilizer fins, symmetrically attached to the base of the rocket body.", "Incorporate a single circular window on the rocket's body.", "Add two 
red horizontal stripes on the blue rocket body, one positioned above the window and one below it.", "Apply a clean, cartoonish art style with bold black outlines for all rocket parts.", "Include a visible engine nozzle at the bottom of the rocket, between the fins.", "Position the rocket as if it is launching, with its base just above the ground.", "Add a column of stylized orange and yellow flames emerging from the nozzle, which the rocket is standing on."], "prompt": "Write `svg` code for an image of a toy rocket.", "id": "write_`svg`_code"} +{"requirements": ["Create a classic rubber ducky shape with a distinct body and head, colored bright yellow.", "The duck must have an orange beak and a simple black dot for an eye.", "Draw a white, claw-foot bathtub shape, showing the inside view with a visible rim.", "Fill the lower portion of the bathtub with light blue water.", "Ensure the water line is clearly visible across the duck's body, showing the lower third of the duck submerged.", "Position the duck so it is floating on the water's surface, creating small, concentric circular ripples in the water around its base.", "Depict soap bubbles as clusters of overlapping circles with a slight iridescence, using semi-transparent white, light pink, and light blue fills.", "Place a small cluster of bubbles on top of the duck's head.", "Place a large pile of bubbles against one side of the tub and a few floating on the water's surface around the duck."], "prompt": "Write `svg` code for an image of a rubber ducky floating on top of a soapy bathtub.", "id": "write_`svg`_code"} +{"requirements": ["Create a scene set on top of a solid-looking cloudscape that serves as the ground.", "Include a hot air balloon with a large envelope featuring vertical red and white stripes and a brown wicker basket.", "The balloon's basket must be resting firmly on the surface of a large, flat-topped cloud.", "Show ropes connecting the balloon's envelope to the basket.", "Include four human figures 
styled as a family: two adults and two children.", "Position one adult figure holding a corner of a red and white checkered picnic blanket, while one child figure holds the opposite corner, as if they are spreading it out together on the cloud.", "Place an open picnic basket on a corner of the blanket that is already spread out.", "A thermos, a bunch of grapes, and a sandwich must be visible emerging from the open picnic basket.", "Position the second adult and second child near the landed hot air balloon, with the adult pointing up at the balloon's envelope.", "The background must be a clear blue sky containing a bright yellow sun and two small, distant clouds."], "prompt": "Write `svg` code for an image of a picnic on top of the clouds, where 2 parents and 2 children have landed with a hot air ballon, and are setting up a picnic with a tarp and food items.", "id": "write_`svg`_code"} +{"requirements": ["Create a red, sporty car with a body, wheels, and windows.", "Draw a distinct circular hoop that is completely surrounded by jagged, irregular flames colored with reds, oranges, and yellows.", "Include a take-off ramp on the left side of the fiery ring and a landing ramp on the right side.", "Position the car in mid-air, with its front half having passed through the ring and its back half still inside the ring.", "The car's rear wheels must be depicted as just having left the edge of the take-off ramp.", "Add a ground surface below the entire jump setup.", "Incorporate gray speed lines trailing behind the car to convey high speed.", "Add orange sparks where the car's tires last touched the take-off ramp."], "prompt": "Write `svg` code for an image of a stunt car jumping through a circle of fire.", "id": "write_`svg`_code"} +{"requirements": ["Create a recognizable grey dolphin with a streamlined body, dorsal fin, and tail fluke.", "Show a water surface below the dolphin with a large splash effect at the point where the dolphin has exited the water.", "Position the 
dolphin in a dynamic jumping arc, with its entire body in mid-air.", "Draw a circular, multi-colored hula hoop, and position the dolphin so its mid-section is passing through the center of the hoop.", "Include a human trainer's arm and hand extending into the frame.", "The trainer's hand must be holding a small fish by the tail.", "The dolphin's mouth must be depicted as wide open, just about to bite the body of the fish being held by the trainer."], "prompt": "Write `svg` code for an image of a dolphin jumping out of the water and through a hula hoop to bite a fish out of its trainers hand.", "id": "write_`svg`_code"} +{"requirements": ["Create a standard red wine glass shape with a wide bowl, a slender stem, and a circular base.", "The glass must appear transparent, rendered with a light grey tint and low opacity.", "Add bright white highlights on the rim and along the curved side of the bowl to simulate glass reflection.", "Fill the glass with a deep burgundy colored wine.", "The wine must fill the glass to exactly the halfway point of the bowl's height.", "The top surface of the wine must be a flat ellipse, indicating a level liquid surface.", "The body of the wine must perfectly conform to the curved shape of the inside of the glass bowl.", "A single drop of red wine must be shown running down the outside of the glass bowl, starting from the rim and ending just above the stem."], "prompt": "Write `svg` code for an image of half full glass of red wine.", "id": "write_`svg`_code"} +{"requirements": ["Create a full-screen background using a modern, abstract macOS wallpaper.", "Add a horizontal, semi-transparent menu bar at the top edge of the screen.", "Place an Apple logo icon in the top-left corner of the menu bar, followed by the menu text 'Finder', 'File', 'Edit', and 'View'.", "Add Wi-Fi, battery, and date/time icons to the right side of the menu bar.", "Design a glass-like Dock with rounded corners at the bottom of the screen, hovering slightly above the 
bottom edge.", "Populate the Dock with icons for Finder, Safari, Mail, and System Settings, with a Trash icon at the far right of the Dock.", "Draw a Finder window as the main foreground element, positioned over the desktop wallpaper.", "The Finder window must have a title bar containing the three 'traffic light' control buttons (red, yellow, green) in the top-left corner.", "The main content area of the Finder window must display several generic folder icons.", "One of the folder icons in the Finder window must be identical to the Finder icon in the Dock.", "Apply a prominent drop shadow to the entire Finder window to make it appear floating above the desktop wallpaper and the Dock."], "prompt": "Write `svg` code for a screenshot of the macOS desktop, with a finder window on top.", "id": "write_`svg`_code"} +{"requirements": ["Whip must be depicted in a coiled, spiral arrangement on a flat surface.", "Include a distinct, solid brown handle (the stock) with a visible wood grain texture.", "The handle must feature a silver knob or pommel at its base.", "The handle must include a leather wrist loop (keeper) hanging from the pommel.", "The main flexible part of the whip (the thong) must be attached to the handle and made of braided black leather.", "The thong must show a clear taper, starting thicker at the handle and getting progressively thinner towards the tip.", "The coils of the thong must overlap realistically, with the handle and the thickest part of the thong on top of the outer coils.", "Include a 'fall,' which is a thinner, smooth leather piece attached to the end of the main braided thong.", "Show a frayed white 'cracker' or 'popper' at the very tip of the fall.", "Add subtle shading and highlights to the coils and handle to give the whip a three-dimensional appearance and a slight leather sheen."], "prompt": "Write `svg` code for an image of a coiled whip.", "id": "write_`svg`_code"} +{"requirements": ["Create a first-person perspective, as if looking 
through the player's eyes.", "Include a recognizable CS:GO AK-47 weapon model held by player hands in the bottom-right of the screen.", "The player's hands must be wearing the default Terrorist team gloves.", "Place a green plus-sign crosshair in the exact center of the screen.", "Display a Heads-Up Display (HUD) with game information using a font style that mimics the actual CS:GO interface.", "In the bottom-left of the HUD, show player health as '100' next to a plus icon and armor as '100' next to a shield icon.", "In the bottom-center of the HUD, show the ammunition count as '30 / 90'.", "In the top-left, include a square radar/minimap with a player indicator arrow in the middle.", "In the top-center, display the round timer as '1:45'.", "Above the timer, show the team scores with the Terrorist icon and a score of '5' on the left, and the Counter-Terrorist icon and a score of '3' on the right.", "The background must depict the 'A-long' area from the map Dust II, with the crosshair aimed at the double doors."], "prompt": "Write `svg` code for an screen of a first person view in CS:GO.", "id": "write_`svg`_code"} +{"requirements": ["Create a main structure for a wooden fruit stall, including a counter.", "Add a red and white striped canopy over the stall, supported by two vertical wooden posts.", "Display exactly four different, recognizable fruits.", "First fruit: A pile of red apples in a wicker basket on the left side of the counter.", "Second fruit: A bunch of yellow bananas placed next to the apples.", "Third fruit: A pile of oranges in another wicker basket on the right side of the counter.", "Fourth fruit: A single, large slice of watermelon resting directly on the counter in the center.", "Include a character representing the stall vendor, a smiling man with a mustache, positioned behind the counter and between the baskets.", "Add a small, hanging chalkboard sign from the canopy that reads 'Fresh Fruit'.", "Depict a cobblestone ground surface in front of 
the stall.", "Include the silhouette of another market stall in the background to suggest a larger market setting."], "prompt": "Write `svg` code for an image of a fruit stall in the market.", "id": "write_`svg`_code"} +{"requirements": ["Create a wooden barrel with visible vertical planks and two horizontal metal hoops.", "The barrel must be buried in a mound of sand, so only the top half is visible.", "Show the sand mounded up slightly around the visible base of the barrel.", "The barrel must be open at the top and filled with treasure items.", "Show treasure overflowing from the top and spilling down one side of the barrel onto the sand.", "The treasure must include a large pile of shiny gold coins, both inside and outside the barrel.", "Add a variety of colorful gemstones (red rubies, green emeralds, blue sapphires) mixed in with the coins.", "A string of white pearls must be draped over the edge of the barrel, trailing down into the spilled coins.", "A golden goblet must be visible, partially buried in the coins inside the barrel.", "Use bright highlights and glint effects on the coins, gems, and goblet to make them look shiny."], "prompt": "Write `svg` code for an image of half buried barrel of treasure.", "id": "write_`svg`_code"} +{"requirements": ["Create a vintage-style rotary telephone as the main subject, colored classic black.", "The telephone must have a main body, a handset, and a curly cord connecting them.", "The rotary dial must have ten visible finger holes with the numbers 0-9 arranged in a circle beneath them.", "Place the telephone on a small, dark wooden table with a visible wood grain texture.", "The table must have four visible, tapered legs.", "Position the phone realistically on the tabletop, with the handset resting in its cradle.", "The curly cord must hang down from the handset and connect to the main body of the phone.", "Incorporate a distinct shadow cast by the telephone onto the surface of the table.", "Add another shadow on the 
floor cast by the table itself to create depth."], "prompt": "Write `svg` code for an image of a vintage rotary telephone on a small wooden table.", "id": "write_`svg`_code"} +{"requirements": ["Draw a knight in a full suit of silver armor with a metallic sheen created using highlights and gradients.", "The knight must be holding a longsword in one hand and a shield in the other.", "The shield must have a coat of arms, such as a lion, depicted on it.", "Depict the knight in a dynamic pose, with the shield raised to block and the sword ready to strike.", "Draw a large, menacing green dragon with scales, large wings, and a spiky tail.", "The dragon must be shown actively breathing a large plume of fire directly towards the knight.", "The fire effect must be colored with bright reds, oranges, and yellows.", "The knight's shield must be positioned to intercept the fire, with the flames shown splashing against it.", "The scene must be set in a dark, rocky cavern, with the dragon's fire being the primary light source.", "The fire must cast a bright orange light on the front of the knight and the cavern walls, creating long, dark shadows behind them."], "prompt": "Write `svg` code for an image of a knight in shining armor fighting a fire-breathing dragon.", "id": "write_`svg`_code"} +{"requirements": ["Replicate the user interface of the Slack application using the recognizable Slack color scheme (purple sidebar, white main view).", "Include a left sidebar with a list of channels, with the channel '#design-team' highlighted to show it is active.", "The main view must show the message history for the '#design-team' channel.", "Display exactly three distinct messages from different fictional users.", "The first message must be from 'Alice' with a user avatar, a timestamp, and the text 'Here is the latest mockup. What do you think?'.", "The second message, below the first, must be from 'Bob' with a different avatar, a later timestamp, and the text 'Looks great! 
I love the new color palette.'.", "The third message, below Bob's, must be from 'Charlie' with a third avatar, a later timestamp, and the text 'Agreed! Ship it!'.", "Add a thumbs-up emoji reaction from two users on Alice's message.", "Show the 'user is typing...' indicator below the last message, with 'David is typing...' visible.", "Include the message input box at the bottom of the channel view, with placeholder text inside it."], "prompt": "Write `svg` code for a screenshot of a Slack channel with several messages, reactions, and a user typing.", "id": "write_`svg`_code"} +{"requirements": ["Draw a mound of dirt on a green grass surface to represent the top of an ant hill.", "Create a cutaway view showing the underground cross-section of the hill, featuring a network of tunnels and chambers.", "Depict three distinct types of chambers connected by tunnels.", "The top chamber must be a food storage area, filled with small green leaf fragments and seeds.", "The middle chamber must be a nursery, containing white ant eggs and larvae.", "The bottom chamber must be the queen's chamber, containing a single, large queen ant.", "Populate the tunnels and chambers with numerous small, black ants.", "Show some ants carrying leaf fragments from the entrance to the food storage chamber.", "Show other ants in the nursery tending to the eggs.", "The queen ant must be significantly larger than the other ants and shown laying an egg.", "Use a dark brown color for the packed earth of the chamber walls and a lighter brown for the loose soil inside the tunnels."], "prompt": "Write `svg` code for an image of a cross-section of an ant hill, showing tunnels and chambers with ants.", "id": "write_`svg`_code"} +{"requirements": ["Draw a school bus from a flat, side-on perspective, colored 'school bus yellow'.", "Include the long, rectangular body of the bus with a series of five evenly spaced passenger windows.", "Draw two visible wheels with black tires and silver hubcaps.", "Incorporate 
the characteristic black horizontal stripes running the length of the bus.", "Include the text 'SCHOOL BUS' in black, capital letters on the side panel between the black stripes.", "Show the driver's door and window at the front of the bus, with a silhouette of a person visible in the driver's seat.", "Add a red, octagonal stop sign attached to the side, fully extended outwards from the bus.", "Include side mirrors at the front and visible red lights at the front and back of the bus body."], "prompt": "Write `svg` code for an image of a classic yellow school bus from a side-on view.", "id": "write_`svg`_code"} +{"requirements": ["Create a triangular slice of pie on a white plate.", "The pie crust must be a golden-brown color with a texture suggesting it is flaky.", "Design a lattice-style top crust with interwoven strips of pastry, allowing the filling to be seen.", "The pie filling, visible through the lattice and on the cut side, must be red with small, dark red circles to represent cherries.", "Place a scoop of off-white vanilla ice cream directly on top of the pie slice, near the back corner.", "The ice cream scoop must have a slightly irregular, melting shape, with a small puddle forming at its base on the pie.", "The ice cream must have tiny dark specks to indicate vanilla bean.", "A silver fork must be resting on the plate next to the pie slice, with a small piece of cherry filling on its tines."], "prompt": "Write `svg` code for an image of a slice of cherry pie with a lattice crust and a scoop of vanilla ice cream next to it.", "id": "write_`svg`_code"} +{"requirements": ["Design a robot with a 'friendly' appearance, characterized by rounded shapes and large, circular optic sensors.", "The robot should have a polished chrome metallic texture, with highlights and shadows that give it a 3D feel and reflect the bar's lighting.", "Position the robot behind a sleek, minimalist bar counter.", "One of the robot's arms is actively pouring a vibrant, glowing green 
liquid from a cocktail shaker into a futuristic-looking glass held steady by its other hand.", "The bar setting must look futuristic, with glowing blue neon light strips running along the edges of the counter and the background shelves.", "Include shelves in the background holding uniquely shaped, futuristic bottles, one of which is half-empty and contains the same glowing green liquid as the drink being poured.", "The sleek bar counter must have a reflective surface, showing a partial, distorted reflection of the robot and the neon lights.", "Compose the scene from the perspective of a customer at the bar, with a non-human, metallic hand visible in the foreground, resting on the counter and reaching towards the drink being prepared.", "The robot and the drink preparation process should be the central focus of the image."], "prompt": "Write `svg` code for an image of a friendly robot serving drinks at a futuristic bar.", "id": "write_`svg`_code"} +{"requirements": ["Place a large, glowing Sun at the center of the diagram, emitting visible rays of light.", "Include all eight planets of the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune, arranged in the correct order from the Sun.", "The side of each planet facing the Sun must be brightly lit, while the opposite side is in shadow, demonstrating the Sun as the primary light source.", "Draw distinct elliptical paths to represent the orbit of each planet around the Sun.", "Represent the relative size differences between the planets accurately (e.g., Jupiter largest, Earth much smaller than Uranus, etc.).", "Each planet must have its key visual characteristic: Earth's continents, clouds, and its Moon orbiting it; Mars's red color and polar ice cap; Jupiter's Great Red Spot; and Saturn's prominent rings.", "Include a text label for the Sun and for each of the eight planets, connected to its corresponding celestial body with a thin, faint line.", "Draw the asteroid belt as a dense field of 
small rocks in a shared orbit between Mars and Jupiter.", "Use a dark background to represent outer space, populated with small, distant stars, and include a single comet with a visible tail that points away from the Sun."], "prompt": "Write `svg` code for an image of a detailed diagram of the solar system with all the planets orbiting the sun.", "id": "write_`svg`_code"} +{"requirements": ["Draw a beaver with recognizable features: brown fur, large front teeth, and a flat, paddle-shaped tail, positioned halfway on a dam it is building.", "The beaver should be holding a muddy stick in its paws, actively placing it onto a section of the dam.", "Construct a dam across a river, made of realistically interlocked sticks, branches, and mud.", "Illustrate a clear difference in the water level: the water on the upstream side is high, calm, and deep, forming a pond, while the water on the downstream side is low and shallow, revealing rocks on the riverbed.", "The environment must include a riverbank with a tree that has been partially gnawed through at its base, with a pile of wood chips around it. 
The stick the beaver is holding must match the wood of this tree.", "Include the beaver's lodge, a large mound-like home made of sticks and mud, on the edge of the pond created by the dam.", "The water in the newly formed pond should reflect the sky and the trees on the bank.", "The water flowing over a low point in the dam should be depicted with ripples and lines to indicate movement."], "prompt": "Write `svg` code for an image of a beaver building a dam in a river.", "id": "write_`svg`_code"} +{"requirements": ["Replicate the user interface of the Visual Studio Code editor using the Dark+ color theme.", "Include the Activity Bar on the far left with the 'Files' icon in an active state.", "Show the Side Bar with a file explorer tree, where a file named `bot_controller.py` is highlighted as active.", "The main editor pane must display the contents of this `bot_controller.py` file, containing a block of Python code.", "The Python code must have correct syntax highlighting for keywords (purple), strings (orange), comments (green), and function names (yellow). 
The code must contain a function with a descriptive comment above it.", "Display line numbers in the gutter to the left of the code.", "Include a blinking text cursor positioned on line 15, column 8, within the body of the function.", "Show editor tabs at the top, with the tab for `bot_controller.py` visually active and matching the highlighted file in the explorer.", "Include the Status Bar at the bottom, showing information that corresponds to the editor's state: the language mode ('Python'), the line and column number of the cursor ('Ln 15, Col 8'), and the active Python interpreter."], "prompt": "Write `svg` code for a screenshot of a VS Code editor with a colorful syntax-highlighted block of Python code.", "id": "write_`svg`_code"} +{"requirements": ["Draw a tall, upright grandfather clock in a room setting, placed next to a window.", "The clock case, made of dark mahogany with a visible wood grain texture, must consist of a hood with a decorative finial on top, a long trunk, and a base.", "The clock face within the hood must have Roman numerals, with the hour hand pointing directly at 'III' and the minute hand pointing at 'XII' to show the time is 3:00.", "Light from the window must cast a long shadow from the clock onto the floor, consistent with an afternoon sun.", "Show a swinging brass pendulum inside the trunk's glass panel, depicted at the far right of its arc to imply motion.", "Include three hanging brass weights on chains inside the trunk; the rightmost weight must be positioned slightly higher than the other two, as if it has just chimed the hour.", "Use shading and highlights to give the wooden case and metallic pendulum and weights a three-dimensional appearance."], "prompt": "Write `svg` code for an image of a grandfather clock with a swinging pendulum.", "id": "write_`svg`_code"} +{"requirements": ["Create a scene with a distinct 8-bit, pixelated art style and a limited, bright color palette.", "Design a main character in a side-view, mid-jump, 
with their head positioned directly beneath a floating 'question mark' block, as if about to hit it.", "A single, pixelated coin must be depicted emerging from the top of the 'question mark' block, frozen mid-air, as a result of the character hitting it.", "Include a ground level made of repeating brown square blocks, with a small gap in the ground that the character is currently jumping over.", "All blocks must have a simple 3D effect with shading on one side.", "Design a simple, 8-bit style walking mushroom enemy on the ground level, moving towards the spot where the character will land after their jump.", "Include a simple UI in the top-left corner of the screen, displaying a score ('SCORE: 005000') and a life count (a small pixelated icon of the character's head x 3).", "The background must have simple pixelated clouds and hills, reinforcing the side-scrolling video game perspective."], "prompt": "Write `svg` code for an image of an 8-bit video game level, similar to Super Mario Bros., with a character, blocks, and an enemy.", "id": "write_`svg`_code"} +{"requirements": ["Draw a large, deep ceramic bowl, viewed from a slightly angled perspective to show all ingredients clearly.", "Fill the bowl with a rich-looking, opaque broth, with highlights to give it a wet, glossy appearance.", "Depict a nest of wavy ramen noodles in the center of the bowl, with a pair of wooden chopsticks lifting a single noodle out of the broth.", "Include two slices of chashu pork with visible layers of meat and fat, with one slice partially submerged in the broth.", "Add a soft-boiled egg (ajitama) cut in half, revealing a bright orange, jammy yolk, nestled against the pork.", "Garnish with a pile of finely chopped green onions on one side of the bowl.", "Include a large, crisp sheet of nori (seaweed) standing upright behind the noodles and a small cluster of menma (bamboo shoots) next to the egg.", "The chopsticks should be resting on the rim of the bowl, with their tips pointing 
towards the noodles."], "prompt": "Write `svg` code for an image of a detailed bowl of Japanese ramen, with noodles, broth, a soft-boiled egg, and chashu pork.", "id": "write_`svg`_code"} +{"requirements": ["Draw a cat curled into a tight ball, sleeping soundly on a cushioned windowsill, with its body pressed lightly against the base of a potted plant.", "Include a window frame around the scene, viewed from inside a room with a warm and soft color palette.", "Draw vertical streaks and scattered droplets on the window pane to represent heavy rain.", "Create a small, circular patch of condensation on the glass where the cat's warm breath would be, slightly obscuring the view of the rain outside.", "The potted plant on the windowsill must have a single water droplet clinging to the tip of one of its leaves.", "The scene outside the window must be blurred and rendered in muted blues and grays to suggest a cold, overcast day, contrasting with the warm interior light.", "Add a subtle, distorted reflection of the room's interior, including the silhouette of a lamp, on the surface of the window glass."], "prompt": "Write `svg` code for an image of a cat sleeping on a windowsill next to a potted plant, with rain streaking down the window pane.", "id": "write_`svg`_code"} +{"requirements": ["Draw a Swiss Army knife with its main red casing having a glossy finish.", "The iconic white cross inside a red shield logo must be clearly visible and inlaid into the center of the casing.", "Show several tools extended from the knife's body in a fanned-out arrangement from a visible pivot point.", "The large knife blade must be fully extended.", "A pair of scissors must be included and shown slightly open.", "A corkscrew, a can opener, and a flathead screwdriver must also be included, fanned out at different angles from the blade.", "All tools must have a metallic, silver/gray appearance with sharp, specular highlights to suggest shininess.", "Place the knife on a neutral surface, 
casting a soft shadow beneath it that follows the shape of the knife and its extended tools."], "prompt": "Write `svg` code for an image of a Swiss Army knife with several tools extended.", "id": "write_`svg`_code"} +{"requirements": ["Draw a large, cone-shaped volcano, silhouetted against a dark night sky.", "The sky must be dark black and contain a crescent moon and scattered stars.", "Show a massive plume of smoke and ash billowing from the crater, where its underside is intensely illuminated with fiery reds and oranges from the eruption below.", "Depict bright, glowing red and orange lava erupting from the crater and being ejected high into the air as pyroclastic debris.", "Illustrate multiple rivers of molten lava flowing down the sides of the volcano, carving glowing paths through the dark rock and pooling at the mountain's base.", "The erupting lava must be the primary light source, casting a dramatic glow on the smoke plume and the slopes of the mountain.", "The stars near the bright smoke plume must be obscured or less visible due to the intense glow of the eruption.", "The contrast between the dark, unlit mountain/sky and the brilliant, glowing lava must be sharp and dramatic."], "prompt": "Write `svg` code for an image of a volcano erupting at night, with lava flowing down its side.", "id": "write_`svg`_code"} +{"requirements": ["Create a main rectangular board area with a light grey background color.", "Draw exactly three vertical columns with the headers 'To Do', 'In Progress', and 'Done' respectively.", "Populate the 'To Do' column with three rectangular white cards containing placeholder text.", "One card in the 'To Do' column must have a red 'Urgent' label.", "Populate the 'In Progress' column with one rectangular white card.", "The card in the 'In Progress' column must feature two circular user avatars, indicating it is assigned to two people.", "Populate the 'Done' column with two rectangular white cards.", "One card in the 'Done' column must have 
a green 'Completed' label and a paperclip icon, indicating a finished task with an attachment.", "Illustrate one card being dragged from the 'To Do' column towards the 'In Progress' column, positioned between the two columns with a slight rotation and a drop shadow to indicate it is actively being moved.", "Ensure consistent spacing and alignment between all columns and cards.", "Include the main board title 'Project Alpha' at the top of the image."], "prompt": "Write `svg` code for a screenshot of a Trello board with multiple columns and cards.", "id": "write_`svg`_code"} +{"requirements": ["Draw a cooked sausage (frankfurter) nestled inside a sliced hot dog bun.", "The bun must appear soft and lightly toasted, with its shape conforming to the sausage it holds.", "Add a wavy line of yellow mustard across the top of the sausage.", "Add a wavy line of red ketchup that intertwines and overlaps with the mustard line on top of the sausage.", "Show a small drip of yellow mustard that has fallen from the hot dog onto the paper plate below.", "Place the entire hot dog on a white paper plate with fluted/ridged edges.", "The hot dog must cast a slight shadow onto the surface of the plate to create depth.", "Use shading and highlights on the sausage and bun to give them a three-dimensional, rounded look.", "The condiments must appear as if they are sitting on top of the sausage, following its curved contour."], "prompt": "Write `svg` code for an image of a hot dog with mustard and ketchup in a bun, on a paper plate.", "id": "write_`svg`_code"} +{"requirements": ["Create an underwater scene with a blue water background that is light cyan at the top and gets progressively darker towards the sea floor.", "Draw a variety of colorful coral formations on the sea floor, including pink brain coral, orange staghorn coral, and purple sea fans.", "Include a large sea turtle as the central element, swimming towards a patch of sea grass near the coral.", "Depict a school of at least five 
small, yellow tang fish swimming in unison past the turtle.", "Show two orange-and-white striped clownfish peeking out from within the tentacles of a green sea anemone.", "Place a red starfish attached to a rock at the base of the coral formations.", "Illustrate rays of light filtering down from the water's surface, casting a dappled light pattern on the sea turtle's shell and the sea floor.", "One clownfish must be partially obscured by the anemone's tentacles.", "The overall composition must be vibrant and dense, with the turtle, fish, and coral overlapping to create a sense of depth."], "prompt": "Write `svg` code for an image of a coral reef teeming with colorful fish and a sea turtle.", "id": "write_`svg`_code"} +{"requirements": ["Draw a clear glass mason jar, complete with its characteristic screw-top threads and embossed lettering on its side.", "The jar must be transparent, with the marbles inside fully visible and their shapes slightly distorted by the curved glass.", "Add white, curved highlights and reflections on the jar's surface that follow its cylindrical shape.", "Fill the jar almost to the top with numerous overlapping and stacked spherical marbles.", "The marbles must include at least three distinct, visible patterns: solid blue, green and white swirled, and a classic 'cat's eye' with a colored vane inside.", "Place a single 'cat's eye' marble on the surface next to the jar, casting a small shadow.", "The jar itself must cast a faint, transparent shadow that is tinted by the colors of the marbles within it.", "Each marble, both inside and outside the jar, must have a small, sharp white highlight to indicate its glossy surface.", "Include dark contact shadows between the marbles where they touch each other and where they press against the inside of the jar to create a sense of volume and weight."], "prompt": "Write `svg` code for an image of a glass jar filled with colorful marbles.", "id": "write_`svg`_code"} +{"requirements": ["Depict a large, 
partially constructed pyramid with visible stone layers and an unfinished, flat top.", "Include a large earthen ramp spiraling up the side of the pyramid, leading to the current construction level.", "Show a group of at least five workers in ancient Egyptian loincloths pulling a large stone block up the ramp using thick ropes.", "The stone block must be resting on a wooden sledge.", "Depict another worker walking in front of the sledge, pouring water from a clay jug onto the sand of the ramp to reduce friction.", "On the top level of the pyramid, show two other workers using long wooden levers to pry another stone block into its final position next to an existing one.", "Several unused, rectangular stone blocks must be visible at the base of the ramp in the sand.", "The setting must be a vast desert landscape under a bright, clear blue sky with a harsh sun.", "The workers, the sledge, and the pyramid must cast long, dark shadows on the sand, consistent with the bright sun's position.", "The overall color palette must consist of sandy yellows, stone greys, and sky blues."], "prompt": "Write `svg` code for an image of an ancient Egyptian pyramid under construction, with workers moving large stone blocks.", "id": "write_`svg`_code"} +{"requirements": ["Draw a single rose flower head in full bloom, with vibrant red petals that overlap in a natural, spiral-like formation.", "Use shading and gradients on the petals to create depth and a velvety texture.", "Place a single, clear droplet of water on the edge of one of the outer petals, showing refraction of the red petal color within it and a sharp highlight.", "Include a long, slender green stem connected to the base of the flower head, with several small, sharp thorns protruding from it.", "Attach exactly two green leaves to the stem, each with serrated edges and visible veins.", "One of the leaves must have a small, irregular hole in it, as if from an insect bite.", "Show a single red petal that has fallen from the 
flower and is lying on the surface near the base of the stem.", "The rose and the fallen petal must cast a soft shadow on a simple, light-grey background."], "prompt": "Write `svg` code for an image of a single red rose with a long stem and thorns.", "id": "write_`svg`_code"} +{"requirements": ["Replicate the general layout of the YouTube homepage within a rectangular frame with a dark mode theme.", "Include a header section at the top with a dark grey background.", "The header must contain the white YouTube logo (play icon and text), a central dark search bar with a search icon, and user-related icons on the right (create, notifications, profile avatar).", "Include a collapsible sidebar on the left with navigation links and icons (Home, Shorts, Subscriptions, Library), with the 'Home' icon and text highlighted to indicate the current page.", "The main content area must be a grid of at least six video thumbnails.", "Each thumbnail must be a rectangle containing a placeholder image, with a small box in the corner indicating video length (e.g., '10:32').", "Below each thumbnail, include a circular channel avatar, a placeholder for the video title on one line, and the channel name and view count on a second line.", "One of the video thumbnails must show a progress bar at the bottom, indicating it has been partially watched.", "Use the official YouTube color scheme: red (#FF0000) for highlights like the logo and progress bar, and shades of dark grey and white for the UI."], "prompt": "Write `svg` code for a screenshot of the YouTube homepage, showing video thumbnails and a sidebar.", "id": "write_`svg`_code"} +{"requirements": ["Illustrate a stack of exactly four books, arranged vertically but slightly askew so they don't line up perfectly.", "Depict the books as old and leather-bound, using colors like dark brown, burgundy, and forest green for the covers.", "Show the spines of the books, with the top book's spine featuring raised bands and faded gold-leaf lettering 
for a title.", "The visible page edges must be a yellowish, aged color, with thin horizontal lines to represent individual pages.", "The book at the bottom of the stack must be larger and thicker than the others, forming a stable base.", "Use subtle textures and scuff marks on the leather covers and corners to indicate wear and tear.", "Render the stack in a 3D perspective, showing the top cover of the highest book and the side spines and page edges of all four.", "The entire stack must cast a soft shadow on the surface it is resting on."], "prompt": "Write `svg` code for an image of a stack of old, leather-bound books.", "id": "write_`svg`_code"} +{"requirements": ["Draw a chameleon with its characteristic features: a curled tail, a head crest, and a prominent, independently rotating eye.", "Position the chameleon on a tree branch that extends diagonally across the image.", "The branch must have a rough brown bark texture and several green leaves attached to it.", "Illustrate the camouflage effect by having the chameleon's skin pattern and color actively blending into the branch and leaves it is touching.", "The rear half of the chameleon's body and its back legs, which are on the bark, must mimic the brown, rough texture of the branch.", "The front half of the chameleon's body and its head, which are near the leaves, must mimic the green color and vein patterns of the leaves.", "Show a visible, soft gradient transition on the chameleon's torso where the brown bark pattern blends into the green leaf pattern.", "The chameleon must be in a realistic clinging pose, with its zygodactyl feet gripping the branch firmly."], "prompt": "Write `svg` code for an image of a chameleon on a branch, changing its color to match the leaves.", "id": "write_`svg`_code"} +{"requirements": ["The background must be a solid, dark 'blueprint blue' color (#000080).", "All lines and text must be white.", "Depict a top-down floor plan of a small house with at least two bedrooms and one 
bathroom.", "Use thick lines for exterior walls and thinner lines for interior walls.", "Show openings for doors and windows within the walls; doors must be indicated with a line and a quarter-circle arc showing the swing direction into the corresponding room.", "Include labels in all caps for each room: 'KITCHEN', 'BEDROOM 1', 'BEDROOM 2', 'LIVING ROOM', and 'BATH'.", "Add exterior dimension lines with measurement annotations (e.g., '30ft') along the outside of the walls.", "Add interior dimension lines to show the size of 'BEDROOM 1'.", "Include schematic outlines of key furniture: a bed and closet in each bedroom, a sofa in the living room, a toilet and shower in the bathroom, and kitchen counters with a sink.", "The kitchen counters must connect to the living room in an open-plan layout.", "Incorporate a title block in the bottom-right corner with text for 'Project Name: 'Small House'', 'Drawing: 'Floor Plan'', and 'Scale: '1/4\" = 1ft''."], "prompt": "Write `svg` code for an image of a detailed architectural blueprint of a small house.", "id": "write_`svg`_code"} +{"requirements": ["Draw a white bowl.", "Fill the bowl with a mound of spaghetti noodles, depicted with many overlapping, curved yellow lines.", "Cover the spaghetti with a generous amount of red tomato sauce that drips down the sides of the noodle mound.", "Add highlights to the sauce to give it a glossy appearance.", "Place exactly three round, brown meatballs on top of the spaghetti, nestled in the sauce.", "Add a sprinkle of green specks (parsley) over the dish, with some specks also landing on the meatballs and the rim of the bowl.", "Show a metallic fork (grey with highlights) actively twirling a small portion of spaghetti noodles, lifting them slightly from the bowl.", "The twirled noodles on the fork must be coated in sauce and have one of the meatballs caught in the twirl.", "The main mound of spaghetti must show an indentation where the fork has lifted the noodles from."], "prompt": "Write 
`svg` code for an image of a plate of spaghetti and meatballs, with a fork twirling some noodles.", "id": "write_`svg`_code"} +{"requirements": ["Design a spaceship with a sleek, futuristic aesthetic, featuring smooth curves and panel lines on its silver, metallic-looking surfaces.", "Show the spaceship with its landing gear deployed and resting on the ground, with dust and small rocks kicked up around the landing struts.", "Include a cockpit window through which a faint silhouette of a pilot is visible, engine exhausts that are still glowing faintly red from the landing, and several active glowing blue lights on the hull.", "The setting is a barren alien planet with a surface composed of red soil and scattered purple rocks.", "Depict two moons of different sizes visible in the sky.", "The sky must be a dark purple, with its color reflecting off the silver hull of the spaceship.", "The light from the larger of the two moons must cast a long, dramatic shadow of the spaceship across the red soil, with the smaller purple rocks also casting their own distinct shadows."], "prompt": "Write `svg` code for an image of a sleek, futuristic spaceship landing on a barren alien planet with two moons.", "id": "write_`svg`_code"} +{"requirements": ["The image must be a close-up (macro) view of a spiderweb.", "Draw the web with a classic orb-weaver structure: radial support lines originating from a center point, and a spiral of thinner capture threads.", "The web's threads should be thin and delicate.", "Scatter multiple small, circular dewdrops along the threads of the web.", "Each dewdrop must be rendered as translucent, showing the web lines behind them as slightly distorted or refracted.", "Add a small, white highlight to each drop, with all highlights consistently placed to indicate a single, low light source from the early morning sun.", "Include a small gnat trapped in one of the web's spiral threads, with the thread pulling taut from the insect's weight.", "Use a soft, 
blurred, out-of-focus background of green foliage to make the web and dewdrops stand out."], "prompt": "Write `svg` code for an image of a spiderweb with dewdrops on it, seen up close.", "id": "write_`svg`_code"} +{"requirements": ["Create a rectangular frame representing a smartphone screen.", "Design a clean, modern User Interface (UI) for a weather application.", "At the top, display the current location as 'San Francisco', the current temperature as '68°' in a large font, and the weather description as 'Sunny'.", "Include a large, clear sun icon next to the current conditions, matching the 'Sunny' description.", "The UI background must be a light blue gradient, reflecting the current 'Sunny' weather condition.", "Below the current conditions, display a horizontal 5-day forecast section.", "The first day of the forecast (MON) must show a sun icon with temps '72° / 55°'.", "The second day (TUE) must show a sun-and-cloud icon with temps '69° / 54°'.", "The third day (WED) must show a cloud icon with temps '65° / 52°'.", "The fourth and fifth days (THU, FRI) must show rain drop icons with temps '62° / 50°' and '60° / 49°' respectively, showing a clear progression of weather.", "Use a legible, sans-serif font throughout the UI."], "prompt": "Write `svg` code for a screenshot of a weather app UI, showing a 5-day forecast with icons for sun, clouds, and rain.", "id": "write_`svg`_code"} +{"requirements": ["Draw a three-story brick building with multiple windows on its facade.", "Show visible orange, red, and yellow flames and dark smoke billowing from a second-story window.", "Depict exactly two firefighters in full protective gear (helmet, coat, pants, boots).", "A ladder must extend from a partially visible red fire truck to a third-story window, where one firefighter is positioned, preparing to enter.", "The second firefighter must be on the ground, aiming a fire hose towards the flaming second-story window.", "A thick fire hose must connect the firefighter on the 
ground to the fire truck.", "Show a powerful stream of water spraying from the hose's nozzle, arcing up and entering the flaming window.", "The scene must be set against a dark night sky, where the orange glow from the fire illuminates the side of the building, both firefighters, the ladder, and the stream of water."], "prompt": "Write `svg` code for an image of a group of firefighters putting out a fire on a multi-story building.", "id": "write_`svg`_code"} +{"requirements": ["Draw a ceramic-style coffee cup with a handle.", "Place the cup on a matching saucer.", "Fill the cup with a two-toned liquid representing cappuccino: a dark brown coffee base and a lighter, creamy foam top.", "Create a distinct heart shape in the center of the foam using the darker coffee color, recognizable as latte art.", "Position the view from a slight angle to clearly display the heart design and the side of the cup.", "Add a small silver spoon resting on the saucer, with its reflection slightly visible on the side of the cup.", "Include subtle wisps of steam rising from the cup, with the heart art slightly distorting the path of the steam rising directly above it."], "prompt": "Write `svg` code for an image of a cup of cappuccino with latte art in the shape of a heart.", "id": "write_`svg`_code"} +{"requirements": ["Draw a thick, ancient-looking book with a decorative leather cover.", "The book must be open, showing two pages with diagrams and symbols.", "Place the open book on top of an ornate stone pedestal.", "Illustrate several runes floating in the air directly above the open pages.", "The runes must have a visible glow effect, casting a colored light down onto the pages of the book.", "One of the symbols on the book's page must match one of the glowing runes floating above it.", "The book itself must emit a faint glow that illuminates the top surface of the pedestal it rests on.", "Use a dark, atmospheric background of a stone chamber to make the glowing elements stand out."], 
"prompt": "Write `svg` code for an image of a wizard's spellbook open on a pedestal, with glowing runes floating above it.", "id": "write_`svg`_code"} +{"requirements": ["Frame the entire image with the opening of a camping tent, creating a first-person perspective from inside, with the tent's fabric, seams, and zipper visible as the frame.", "In the foreground, show the edge of a red sleeping bag and a green backpack, establishing the interior of the tent.", "Outside the tent, depict a lit campfire with visible logs and bright orange and yellow flames.", "The campfire must cast a warm, flickering glow on the ground in front of the tent and on the visible parts of the tent's opening.", "The background must be a dark night sky populated with numerous small dots representing stars.", "Include silhouettes of pine trees against the starry sky.", "The perspective must be low, as if lying down inside the tent on the sleeping bag, looking out past the campfire to the sky."], "prompt": "Write `svg` code for an image of the view from inside a tent, looking out at a campfire and a starry night sky.", "id": "write_`svg`_code"} +{"requirements": ["Draw the main base of the record player with a wood grain finish.", "Include a circular platter on the base, on which a black vinyl record is placed.", "The record must have visible concentric grooves and a red center label.", "Illustrate a tonearm with a headshell and cartridge, positioned so the stylus (needle) is resting within one of the grooves on the record's surface.", "Include control elements: a power knob that is in the 'on' position and a speed selector set to '33' rpm.", "Show an open, transparent dust cover hinged at the back of the base, with a slight reflection of the tonearm visible on its surface.", "Depict small, stylized musical notes floating up from the record to indicate that music is playing."], "prompt": "Write `svg` code for an image of a classic vinyl record player with a record on the turntable.", "id": 
"write_`svg`_code"} +{"requirements": ["Illustrate an athletic figure in a dynamic tennis serving pose, wearing a white shirt and blue shorts.", "The player's body must be arched backwards, conveying coiled power.", "One arm must be extended upwards, having just tossed a yellow tennis ball into the air.", "The other arm must be holding a tennis racquet, swung high and captured at the moment just before it strikes the ball.", "The tennis ball must be positioned in the air slightly in front of and above the player, at the peak of the toss, perfectly aligned with the center of the raised racquet.", "Use motion lines trailing the racquet to suggest the high speed of its upward swing.", "Depict a portion of a blue tennis court, including the white baseline the player is standing behind, and the net in the background.", "The bright sun must cast a sharp, dynamic shadow of the player and their raised racquet onto the court surface."], "prompt": "Write `svg` code for an image of a tennis player in the middle of a powerful serve.", "id": "write_`svg`_code"} +{"requirements": ["Create a diagram illustrating the four distinct stages of the butterfly life cycle on a single host plant.", "Stage 1: Show a cluster of small eggs on a green leaf.", "Stage 2: Show a caterpillar actively eating the edge of the same leaf where remnants of the hatched eggs are visible.", "Stage 3: Show a chrysalis (pupa) hanging from a twig directly above the leaf from Stage 2.", "Stage 4: Show a fully formed adult butterfly with patterned wings, positioned next to the now-empty chrysalis casing from which it has emerged.", "Arrange the four stages in a logical circular sequence on the plant.", "Use arrows to connect the stages in the correct order: from the eggs to the caterpillar, from the caterpillar to the chrysalis, and from the chrysalis to the butterfly.", "An arrow must go from the butterfly back towards a fresh leaf on the plant, as if to lay new eggs, visually completing the cycle.", "Each 
stage must have a clear text label pointing to the relevant part of the plant: 'Eggs', 'Caterpillar', 'Chrysalis', 'Butterfly'."], "prompt": "Write `svg` code for a diagram showing the life cycle of a butterfly, from egg to caterpillar to chrysalis to adult.", "id": "write_`svg`_code"} +{"requirements": ["Draw a research-style submersible, not a military one, exploring a deep-sea trench.", "The submersible must have a main viewport, external lights, and at least one robotic arm extended towards the seafloor.", "Show bright beams of light emanating from the submersible's lights, directly illuminating an anglerfish in the foreground.", "Create a dark, deep-sea trench environment with rocky walls and a seafloor populated with a cluster of glowing tube worms.", "The robotic arm must be positioned as if it is about to collect a rock sample from next to the tube worms.", "Include at least one anglerfish, with its characteristic glowing lure, caught within the submersible's main beam of light.", "The overall scene must be very dark, with light originating only from the submersible, the anglerfish's lure, and other bioluminescent life.", "Depict floating particles in the water to create a sense of depth and murkiness.", "Add several bioluminescent jellyfish floating in the mid-ground between the submersible and the trench wall."], "prompt": "Write `svg` code for an image of a submarine exploring a deep-sea trench with glowing anglerfish nearby.", "id": "write_`svg`_code"} +{"requirements": ["Create a UI window that resembles a desktop application, with a title bar and window controls (minimize, maximize, close).", "Display a monthly calendar view for 'October 2023' inside the window.", "Include a header with the month and year and navigation arrows. 
The 'next month' arrow must be depicted in a hovered or pressed state.", "Lay out a grid for the days of the month, with headers for the days of the week (e.g., S, M, T, W, T, F, S).", "Populate the grid cells with date numbers for October 2023.", "Fill one date cell with a colored block and the text 'Project Deadline'.", "Fill a separate three-day span of consecutive dates with a single colored block labeled 'Team Offsite'.", "Highlight the 'Project Deadline' date cell with a circular outline to represent it as the 'current day'.", "Add one other event on a different day labeled 'Team Sync'.", "The design should be clean and modern, typical of a calendar app."], "prompt": "Write `svg` code for a screenshot of a calendar application, showing a monthly view with several events scheduled.", "id": "write_`svg`_code"} +{"requirements": ["Draw a large, medieval-style catapult made of wood with visible wood grain texture and metal fittings.", "The catapult's structure must include a sturdy base frame and a long throwing arm, powered by a large counterweight.", "Depict the catapult in the middle of the launch action: the counterweight must be shown near the bottom of its downward swing, while the throwing arm has just reached the apex of its upward swing.", "A large, round boulder must be shown in mid-air, having just left the catapult's sling, following a clear trajectory.", "Use motion lines to convey the rapid movement of the throwing arm and the flight path of the boulder.", "The scene must be set on a muddy field, with disturbed ground around the catapult's base to suggest the force of the launch.", "Include a stone castle wall in the distant background, positioned as the clear target for the boulder's trajectory."], "prompt": "Write `svg` code for an image of a medieval catapult launching a boulder.", "id": "write_`svg`_code"} +{"requirements": ["Create a primary circular plate to serve as the base for the food.", "Include several pieces of nigiri sushi, 
differentiating toppings by color (red for tuna, orange for salmon). One piece of salmon nigiri must have a small dab of green wasabi on top.", "Add at least one type of maki roll (sushi roll), showing the outer layer of nori (seaweed) and the cross-section of rice and fillings.", "Include a few slices of sashimi (raw fish without rice), arranged artfully on the plate, with a visible empty space where one piece of nigiri was removed.", "Place a small, shallow bowl on the side, filled with dark brown soy sauce that has ripples on its surface.", "Include a pair of chopsticks, positioned to be actively lifting a piece of tuna nigiri from the plate. The fish on the lifted nigiri must be slightly darkened at the tip, as if it has just been dipped in the soy sauce.", "Include a small mound of green wasabi and a pile of pink pickled ginger (gari) as garnishes on the plate.", "Arrange all elements in a visually appealing composition, focusing on the action of eating.", "The overall image should have a clean and fresh aesthetic."], "prompt": "Write `svg` code for an image of a plate of sushi and sashimi, with chopsticks and a small bowl of soy sauce.", "id": "write_`svg`_code"} +{"requirements": ["Draw a tall, cylindrical lighthouse tower with red and white horizontal stripes, positioned on top of a dark, jagged, rocky cliff.", "Show the lantern room at the top of the lighthouse with a visible, glowing light source inside.", "Create a powerful beam of light, depicted as a solid, yellow, trapezoidal shape, emanating from the lantern.", "The light beam must cut across the scene and directly illuminate a specific, treacherous-looking rock jutting out of the water.", "Establish a night scene with a dark blue sky, where a faint crescent moon and a few stars are partially visible through thin fog.", "Incorporate a fog effect using semi-transparent white shapes, which is thickest around the base of the cliff and thins out towards the sky.", "Depict dark, churning water with large, 
white-capped waves shown actively crashing against both the base of the cliff and the illuminated rock."], "prompt": "Write `svg` code for an image of a lighthouse on a rocky cliff, with its light beam cutting through a foggy night.", "id": "write_`svg`_code"} +{"requirements": ["Illustrate two strands twisting around a central axis to form a right-handed double helix with clear 3D perspective.", "The two outer strands must represent the sugar-phosphate backbones, depicted as smooth, continuous helical lines.", "Connect the two backbones with horizontal rungs representing the base pairs.", "Use four distinct colors for the nucleobases: Adenine (e.g., blue), Guanine (e.g., red), Cytosine (e.g., yellow), and Thymine (e.g., green).", "The base pairing must be consistently shown, so that the color for Adenine always pairs with the color for Thymine, and the color for Cytosine always pairs with the color for Guanine.", "Clearly show the major and minor grooves created by the helical twist, and add labels with lines pointing to 'Major Groove' and 'Minor Groove'.", "Add labels with lines pointing to the 'Sugar-phosphate backbone' and a 'Base pair'.", "In one section of the helix, magnify a single C-G pair to explicitly label the 'C' and 'G' on their respective colored shapes to reinforce the pairing rule.", "Maintain a clean, scientifically recognizable, diagrammatic style."], "prompt": "Write `svg` code for an image of a DNA double helix strand.", "id": "write_`svg`_code"} +{"requirements": ["The scene's focal point must be a multi-layered birthday cake on a table, with several lit candles on top.", "Include a group of at least three stylized people around the table, all wearing colorful, conical party hats.", "One person must be shown leaning forward over the cake, with puffed cheeks, in the act of blowing out the candles. 
Faint motion lines should emanate from their mouth towards the candle flames.", "The other people must be looking at the person blowing out the candles, with expressions of excitement or cheering.", "Add a bunch of colorful balloons with strings floating in the background, with one string shown leading to the hand of one of the people.", "Decorate the scene with festive streamers hanging from the ceiling and confetti scattered on the table around the cake.", "Place a few wrapped gift boxes on the table, one of which is partially unwrapped with ribbon trailing onto the table.", "Use a bright and cheerful color palette to convey a celebratory mood in an indoor party room."], "prompt": "Write `svg` code for an image of a birthday party scene with a cake, balloons, and people wearing party hats.", "id": "write_`svg`_code"} +{"requirements": ["Draw the open case of a desktop PC tower, showing the internal components from a perspective view.", "Include a large motherboard as the main circuit board, serving as the base for other components.", "Show the CPU socket on the motherboard, covered by a large heatsink and a spinning fan assembly indicated by motion lines.", "Illustrate at least two RAM sticks slotted into the motherboard.", "Include a dedicated GPU card plugged into a PCI-e slot on the motherboard, with its own two cooling fans.", "Depict the PSU in its housing, with a bundle of colored wires extending from it and connecting to the motherboard and GPU.", "Show one rectangular HDD and one flatter SSD, both connected to the motherboard with visible SATA data cables.", "Draw power cables from the PSU connecting to the motherboard's main power connector, the CPU power connector, and the GPU.", "Add clear labels with lines pointing to each major component: 'Motherboard', 'CPU Cooler', 'GPU', 'RAM', 'PSU', 'HDD', and 'SSD'.", "Use a clear, technical diagram style with clean lines to show how the components are interconnected."], "prompt": "Write `svg` code for an image 
of a detailed diagram of the internal components of a desktop computer.", "id": "write_`svg`_code"} +{"requirements": ["Draw a large, mature oak tree with a thick, textured trunk and wide, spreading branches.", "Construct a rustic wooden treehouse, made of planks, nestled among and structurally supported by the tree's branches.", "The treehouse must have a simple roof, a window with a small flower box on its sill, and a door.", "A rope ladder with wooden rungs must hang from the treehouse entrance down to the ground.", "A tire swing must be shown hanging by a rope from a sturdy, lower branch of the same oak tree.", "A small, red flag must be attached to the peak of the treehouse roof.", "The scene must be set in a green, grassy backyard, with the base of the tree's trunk clearly visible in the grass.", "Use a bright color palette that suggests a sunny day, with a clear blue sky in the background."], "prompt": "Write `svg` code for an image of a treehouse with a rope ladder, nestled in a large oak tree.", "id": "write_`svg`_code"} +{"requirements": ["Depict a blacksmith character, shown with a strong build and wearing a work apron.", "The blacksmith must be holding a hammer in a raised position, positioned directly above a glowing sword blade on an anvil, as if about to strike.", "Place a classic-shaped, heavy anvil on a wooden stump in front of the blacksmith.", "On the anvil, place a sword blade that is glowing bright orange and yellow to indicate it is heated.", "Show sparks flying upwards from the specific point on the glowing blade where the hammer is about to make contact.", "In the background, include a forge with visible glowing coals and flames, which serves as a primary light source.", "The setting must be a dark, rustic workshop; a pair of tongs must be resting against the anvil's wooden stump.", "The forge and the glowing sword blade must be the only light sources, casting an orange glow on the side of the blacksmith and a bright yellow-orange light on 
his front and the top of the anvil, creating distinct shadows.", "The anvil must be dark and metallic, with its top surface reflecting the bright orange glow from the sword blade."], "prompt": "Write `svg` code for an image of a blacksmith at an anvil, hammering a glowing piece of metal.", "id": "write_`svg`_code"} +{"requirements": ["Create a multitude of nodes, represented as circles of varying sizes, organized into at least three distinct color-coded clusters (e.g., blue, green, red).", "Connect the nodes with a large number of lines (edges) to show interconnection.", "The graph must be arranged in a force-directed layout, creating a complex, organic, web-like structure.", "Within each cluster, a larger central hub node must be connected via thick lines to its smaller, peripheral nodes.", "Thinner lines must be used to connect the peripheral nodes to each other within the same cluster.", "A few thin, curved lines must bridge the different colored clusters, connecting peripheral nodes from one cluster to another to show cross-cluster interaction.", "The central hub nodes must be the largest in size, with many connections, while peripheral nodes are smaller with fewer connections.", "The curved lines must navigate around other nodes gracefully to avoid a messy appearance."], "prompt": "Write `svg` code for an image of a complex network graph with nodes and interconnected lines.", "id": "write_`svg`_code"} +{"requirements": ["Draw an old-fashioned steam locomotive as the main subject, viewed from a three-quarter perspective.", "The locomotive must have key features: a smokestack, a cowcatcher, large driving wheels with connecting rods, and a cab for the engineer.", "Show white steam puffing from the smokestack, trailing backward over the top of the first attached train car to indicate motion.", "Attach exactly two passenger cars behind the locomotive.", "The train must be positioned one-third of the way across a detailed wooden trestle bridge.", "The bridge's 
structure must show the crisscrossing wooden beams and supports of the trestles, and its reflection must be visible in the river below.", "The bridge must span a river flowing through a valley.", "The background must feature pine forests on rolling hills leading up to distant, snow-capped mountains.", "The three-quarter perspective must effectively show the length of the train and the scale of the bridge over the river."], "prompt": "Write `svg` code for an image of an old-fashioned steam train crossing a wooden trestle bridge.", "id": "write_`svg`_code"} +{"requirements": ["Create a rectangular frame representing a phone screen in dark mode, with a dark background and light text/icons.", "Include a large square area for the album art, which must be a graphic of a stylized sun setting over an ocean.", "Below the album art, display the song title 'Ocean Sunset' in a larger, bold font.", "Below the song title, display the artist name 'The Vectors' in a smaller font.", "Create a playback control bar at the bottom containing icon buttons for 'Previous', 'Pause', and 'Next'. The 'Pause' icon must be the most prominent, indicating the song is playing.", "Include a horizontal progress bar (scrubber) above the control buttons.", "The progress bar's handle must be positioned at the one-third mark to indicate the current playback position.", "Display the elapsed time timestamp '1:23' at the left end of the progress bar and the total duration '3:45' at the right end, corresponding to the handle's position.", "On the same line as the progress bar, include a 'Shuffle' icon on the left and a 'Repeat' icon on the right. 
The 'Shuffle' icon must be illuminated to indicate it is active."], "prompt": "Write `svg` code for a screenshot of a music player interface, like Spotify, showing album art and playback controls.", "id": "write_`svg`_code"} +{"requirements": ["Include a central figure of a chef wearing a traditional uniform (toque, jacket) with an expression of intense concentration.", "The chef must be in a dynamic, mid-action pose, bringing a chef's knife down in a chopping motion.", "The chef's knife must be positioned just above a carrot on a cutting board, at the peak of its downward chop.", "On the cutting board, show a whole carrot with several circular slices already cut and lying to its left.", "Depict at least three small, irregular pieces of the carrot in mid-air to the right of the knife, flying away from the point of impact.", "The setting must be a busy kitchen, with a metal bowl of other chopped vegetables (onions, celery) sitting on the counter next to the cutting board.", "Show a pot on a stove in the background with wavy, semi-transparent lines of steam rising from it.", "The steam must drift upwards and partially obscure a shelf of spices located behind the pot.", "The overall composition must convey a sense of action, with the chef's eyes focused directly on the point where the knife will meet the carrot."], "prompt": "Write `svg` code for an image of a chef in a busy kitchen, mid-chop, sending pieces of a carrot flying off a cutting board, with steam rising from pots in the background.", "id": "write_`svg`_code"} +{"requirements": ["Draw a scuba diver figure complete with gear: mask, regulator, air tank, and fins.", "The diver must be holding an underwater camera and aiming its lens directly at the eye of an octopus.", "The octopus must be positioned behind a piece of brain coral, with two tentacles wrapped around it and its head and one eye peeking out.", "The octopus's skin texture and color must mimic the bumpy, tan texture of the brain coral it is hiding 
behind, demonstrating camouflage.", "Create a detailed coral reef environment with varied shapes and colors of coral, rock, and a small, brightly colored clownfish swimming near the camouflaged octopus.", "The entire image must have a blue tint to simulate being underwater.", "Include light rays filtering down from the water's surface, illuminating the diver's back and casting a slight shadow over the area where the octopus is hiding.", "Show a continuous stream of bubbles rising from the diver's regulator, moving upwards and passing in front of a section of the background coral."], "prompt": "Write `svg` code for an image of a scuba diver using an underwater camera to take a picture of a shy octopus that is partially camouflaged against a coral reef.", "id": "write_`svg`_code"} +{"requirements": ["Design a visually complex Rube Goldberg machine with a cobbled-together, DIY aesthetic, using parts like ramps, levers, and scissors.", "A red marble must be shown at the end of a wooden ramp, making contact with one end of a see-saw-like lever.", "The lever must be tilted down on the side the marble has hit, and consequently tilted up on the opposite end.", "The rising end of the lever must be shown pushing a pin out from under a weight.", "The now-unsupported weight must be depicted falling downwards, pulling a cord taut.", "The cord must be attached to the handle of a pair of scissors, pulling the blades closed.", "The scissor blades must be shown halfway closed, with a taut red string positioned between them, moments from being severed.", "The chain of events—marble hitting lever, lever releasing weight, weight pulling cord, cord closing scissors—must be clearly and sequentially illustrated."], "prompt": "Write `svg` code for an image of a complex Rube Goldberg machine in action, where a falling marble has just triggered a lever, which is in the process of releasing a pair of scissors to cut a string.", "id": "write_`svg`_code"} +{"requirements": ["Depict two people, 
one young and one old, sitting opposite each other at a table.", "The young person's hand must be hovering directly over their white queen piece on an 8x8 chessboard.", "The chess pieces must be arranged in a late-game configuration where the white queen's next move results in checkmate.", "The old person's black king must be shown trapped on the board, with its potential escape squares blocked by other white pieces, such as a rook and a bishop.", "The young person must have a facial expression of confident triumph, with their eyes fixed on the opponent's trapped king.", "The old person must have a facial expression of sudden, defeated realization, with their wide eyes looking at their own trapped king.", "The perspective must be from slightly over the young person's shoulder, focusing attention on their hand, the white queen, and the checkmated black king.", "The queen piece must be clearly identifiable and the target of the player's action."], "prompt": "Write `svg` code for an image of two people, one young and one old, intensely focused on a chess game, where one player's hand is hovering over the queen to make a checkmate move.", "id": "write_`svg`_code"} +{"requirements": ["Show an alchemist's hands and forearms, wearing dark, rustic sleeves, holding a glass beaker.", "The alchemist must be tilting the beaker, pouring a stream of glowing blue liquid from it.", "The stream of glowing liquid must flow into a large, dark, metallic cauldron that is positioned over a crackling wood fire.", "The cauldron must contain a green potion, and at the point where the blue stream meets the green liquid, there must be a bright flash of white light indicating a reaction.", "The green potion must be bubbling violently, with the bubbling most intense at the point of contact with the blue liquid.", "Plumes of purple smoke must be rising from the cauldron, curling upwards to partially obscure a background shelf filled with glass jars.", "The alchemist's hands and the beaker must 
be illuminated by the blue glow of the liquid, while the front of the cauldron is illuminated by the orange fire beneath it, casting complex shadows on the stone wall behind."], "prompt": "Write `svg` code for an image of an alchemist pouring a glowing blue liquid from a beaker into a cauldron, causing the green potion inside to bubble violently and emit purple smoke.", "id": "write_`svg`_code"} +{"requirements": ["Draw a recognizable Formula 1 race car, stationary and lifted off the ground on front and rear jacks inside a pit box.", "Show multiple pit crew members in team uniforms in dynamic poses of urgent, precise action around the car.", "At the front-left wheel, depict one crew member removing the old wheel while another stands ready, holding the new wheel.", "At the rear-right wheel, depict a crew member using a pneumatic wheel gun to tighten the nut on a newly fitted wheel, with sparks flying from the gun's impact.", "A crew member on the right side of the car must have a large refueling hose firmly connected to the car's fuel port.", "The driver, wearing a helmet, must be visible in the cockpit with hands on the steering wheel, looking intently towards the pit lane exit.", "A 'lollipop man' crew member must be standing directly in front of the car, holding a sign that indicates 'Brakes On'.", "The scene must be set in a pit lane with appropriate ground markings and a pit wall gantry in the background."], "prompt": "Write `svg` code for an image of a pit crew in a Formula 1 race, simultaneously changing all four tires and refueling the car while the driver waits.", "id": "write_`svg`_code"} +{"requirements": ["Focus on a close-up view of a barista's hands, with one hand holding a ceramic coffee cup by its handle.", "The other hand must be gripping a stainless steel milk pitcher from the side, with the thumb on the handle for stability.", "The pitcher must be tilted, with its spout positioned directly over the center of the cup.", "Show a thin, controlled 
stream of white, steamed milk pouring from the pitcher's spout into the cup.", "The cup must contain dark brown liquid representing espresso, with a creamy layer of crema on top.", "On the surface of the crema, there must be a detailed latte art pattern in the shape of a fern (rosetta), which is nearly complete.", "The stream of milk must be shown connecting to the top of the rosetta, forming the final, delicate leaf of the fern pattern.", "The composition should be tightly cropped, showing parts of the barista's forearms and apron, to emphasize the action of pouring and the creation of the art."], "prompt": "Write `svg` code for an image of a barista pouring steamed milk from a metal pitcher into a cup of espresso, creating detailed latte art in the shape of a fern.", "id": "write_`svg`_code"} +{"requirements": ["Use a cutaway style to show the internal workings of a single engine cylinder.", "Include the main components: cylinder wall, piston, connecting rod, a portion of the crankshaft, and the cylinder head.", "The crankshaft must be shown rotating, causing the connecting rod to pull the piston downwards within the cylinder, representing the intake stroke.", "A downward-pointing arrow must be attached to the top of the piston to indicate its direction of motion.", "In the cylinder head, the intake valve must be shown fully open, while the exhaust valve is fully closed.", "Show a carburetor connected to the cylinder's intake port.", "Represent the fuel-air mixture as a blue-colored gas being drawn from the carburetor, flowing past the open intake valve, and filling the expanding space above the descending piston.", "A spark plug must be screwed into the top of the cylinder head, with its electrode visible inside the combustion chamber.", "The image must have the clean, technical look of a diagram with clear outlines and labels for the piston, crankshaft, and intake valve."], "prompt": "Write `svg` code for a cutaway diagram of a car engine where the piston is 
moving down during the intake stroke, drawing a fuel-air mixture in from a carburetor.", "id": "write_`svg`_code"} +{"requirements": ["Depict a glassblower's hands and forearms as the primary subject, with one hand wearing a protective glove.", "The gloved hand must be holding and rotating a long metal blowpipe.", "At the far end of the blowpipe, show a glowing, red-orange, molten glass bubble.", "The other hand must be holding a shaping tool made of a thick, wet, folded wad of newspaper.", "The newspaper tool must be pressed firmly against the side of the molten glass bubble, creating a visible indentation.", "A thick cloud of steam must be shown billowing up from the exact point of contact where the wet newspaper touches the hot glass.", "The indentation on the glass bubble must directly correspond to the shape of the newspaper tool pressing into it.", "The background must clearly show the glowing orange opening of a furnace (the glory hole), which is the source of light in the scene."], "prompt": "Write `svg` code for an image of a glassblower at the end of a blowpipe, shaping a molten glass bubble with a wet wad of newspaper, causing steam to rise.", "id": "write_`svg`_code"} +{"requirements": ["The image must be a close-up focusing on a pair of hands and forearms, covered in a texture representing wet, brown clay.", "The hands must be positioned around a lump of clay, with one hand inside the opening and the other shaping the exterior wall.", "The clay must be perfectly centered on the circular head of a potter's wheel, which is surrounded by a splash pan.", "The clay must be formed into the recognizable, in-progress shape of a vase, with a defined base, a swelling body, and a narrowing neck.", "Include concentric circular lines on the clay and the wheel head to indicate a rapid spinning motion.", "Show drips of watery clay slip running down the exterior of the vase and the potter's hands, pooling at the base of the clay on the wheel head.", "The background 
must be a simple, dark, out-of-focus wall to keep the focus on the hands and the creative process."], "prompt": "Write `svg` code for an image of a potter's hands, covered in clay, shaping a vase on a spinning potter's wheel.", "id": "write_`svg`_code"} +{"requirements": ["Include a mother bird, recognizable by adult plumage, perched on the edge of a nest.", "The nest must be depicted with a woven texture of twigs and grass.", "The nest must be securely situated in the fork of a tree branch, with green leaves framing the scene.", "There must be exactly three baby chicks inside the nest.", "The chicks should appear young and fluffy, with underdeveloped feathers.", "All three chicks must have their beaks wide open, necks stretched, and pointing upwards towards the mother bird in a hungry posture.", "The mother bird must be leaning over, with her beak positioned directly above one of the chick's open beaks.", "A pink worm must be clearly visible, held at its midpoint in the mother's beak, with one end of the worm just entering the chick's beak.", "The scene should be brightly lit to convey a sense of a sunny springtime day."], "prompt": "Write `svg` code for an image of a mother bird at her nest, placing a worm into the wide-open beak of one of her three hungry chicks.", "id": "write_`svg`_code"} +{"requirements": ["Depict an astronaut in a standard white Extravehicular Mobility Unit (spacesuit) with a golden-tinted helmet visor.", "The astronaut must be attached to the end of a multi-jointed robotic arm via a foot restraint, floating in a zero-gravity pose.", "The robotic arm must be positioned to hold the astronaut steady next to a large solar panel array.", "The astronaut must be holding a specialized repair tool in their gloved hands and actively applying it to a visible tear in the solar panel.", "The solar panel must have a distinct tear, with jagged edges.", "A portion of the truss structure of the International Space Station (ISS) must be visible, serving as 
the base for the robotic arm.", "The blue and white curve of the Earth must be prominent in the background.", "The background must be the blackness of space with a scattering of stars.", "A coiled safety tether must be clearly visible, with one end clipped to the astronaut's suit and the other end attached to the ISS structure."], "prompt": "Write `svg` code for an image of an astronaut on a spacewalk, using a robotic arm to repair a damaged solar panel on the International Space Station, with the Earth visible below.", "id": "write_`svg`_code"} +{"requirements": ["Focus on the hands of a gardener, one of which is wearing a gardening glove.", "Show a mature branch of a tree, representing the rootstock, which has a clean 'V'-shaped notch cut into it.", "Include a smaller, separate twig (the scion) with several visible buds on it.", "The base of the scion must be cut into a wedge shape that fits perfectly into the rootstock's 'V'-shaped notch.", "Depict the gardener's hands holding the scion firmly in place within the rootstock's notch, ensuring the cambium layers align.", "Show a strip of green grafting tape being wrapped tightly by the gardener's fingers around the union point, holding the two pieces together.", "Show that some grafting wax has already been applied from a small tin to seal the top cut-end of the scion.", "A sharp, clean grafting knife must be visible resting on the rootstock branch next to the graft site.", "The background must show out-of-focus rows of other trees, suggesting an orchard setting."], "prompt": "Write `svg` code for an image of a gardener carefully grafting a branch from an apple tree onto a different rootstock tree, with grafting tape and wax visible.", "id": "write_`svg`_code"} +{"requirements": ["Depict a monk in a traditional brown, hooded medieval robe, seated at a slanted wooden writing desk.", "The monk's hand must be holding a white feather quill, poised over an open manuscript.", "The tip of the quill must be positioned just 
above the manuscript page, with a single, dark drop of ink visible on the nib, about to touch the parchment.", "On the desk, there must be an open manuscript page which features a large, ornate, 'illuminated' letter 'I' decorated with gold leaf and intricate vines.", "The quill must be positioned directly after the illuminated letter, ready to write the next character on a pre-drawn ruled line.", "The setting must be a stone room, and a distinct beam of dusty light from an arched window must be shown falling across the desk, illuminating the manuscript and the monk's hands.", "Next to the open manuscript, there must be an open inkwell and a small pile of stacked, leather-bound books."], "prompt": "Write `svg` code for an image of a medieval monk in a scriptorium, dipping a quill into an inkwell, about to write on an illuminated manuscript.", "id": "write_`svg`_code"} +{"requirements": ["Include exactly three children on top of a grassy hill.", "The children must be depicted in active, cooperative poses: one child is holding the kite string reel, a second child is guiding the taut string with their hands, and the third is pointing up excitedly.", "Show a large, elaborate kite in the shape of a green dragon high in the sky.", "The dragon kite must have a long, segmented tail that is flowing and rippling in the wind.", "A single, taut kite string must be visible, connecting from the kite down to the reel held by the first child.", "The setting is the crest of a green, rolling hill.", "The wind must be visually represented by having the children's hair, their loose clothing, the kite's tail, and the blades of grass all blowing in the same direction.", "The blue sky must have several puffy white clouds that appear to be moving quickly.", "All three children's expressions must be joyful and their gaze directed upwards at the kite."], "prompt": "Write `svg` code for an image of a group of children working together to fly a large, elaborate dragon kite on a windy day.", 
"id": "write_`svg`_code"} +{"requirements": ["Create a recognizable web browser window with a frame, three tabs (with the Google Drive tab being active), and an address bar showing a 'drive.google.com' URL.", "The content of the browser window must be the Google Drive user interface, showing a grid of folders with names like 'Photos', 'Work', and 'Vacation 2023'.", "A portion of a blurred nature photograph desktop background must be visible behind the browser window.", "Include a standard image file icon labeled 'Mountain.jpg' on the desktop area.", "Show a mouse cursor (arrow pointer) positioned over the browser window.", "The cursor must be depicted as 'dragging' the 'Mountain.jpg' file icon; the icon should appear semi-transparent and be located directly beneath the cursor's tip.", "The cursor and the dragged file icon must be positioned directly over the 'Photos' folder within the Google Drive interface.", "The 'Photos' folder must be highlighted with a blue border and a slightly changed background color to indicate it is the active drop zone, a direct result of the cursor's position.", "The overall image must clearly represent the user action of dragging a file from the desktop to a specific cloud storage folder."], "prompt": "Write `svg` code for a screenshot of a user dragging and dropping a file from their desktop into a Google Drive folder in a web browser.", "id": "write_`svg`_code"} +{"requirements": ["Include a person dressed in a veterinarian's lab coat over blue scrubs.", "The veterinarian should be holding a stethoscope, with the earpieces in their ears and the chest-piece placed on a dog's chest.", "The veterinarian's free hand must be resting reassuringly on the dog's back.", "The dog must be a golden retriever, positioned on a stainless steel veterinary examination table.", "The dog must appear calm, with its head turned towards its owner.", "Include a second person, the dog's owner, standing beside the table.", "The owner's right hand must be 
extended, holding a visible dog treat, which the dog is sniffing.", "The owner's left hand must be gently stroking the dog's head.", "The setting must be identifiable as a vet's office, with a clean, tiled background and a cabinet with medical supplies visible.", "The overall mood must be calm and caring, emphasized by the physical contact between all three subjects."], "prompt": "Write `svg` code for an image of a veterinarian listening to a dog's heartbeat with a stethoscope, while the dog's owner offers it a treat.", "id": "write_`svg`_code"} +{"requirements": ["Depict a red and black bowling ball at the end of a polished wooden bowling lane.", "The image must capture the exact moment of impact between the ball and the front-most (#1) pin.", "Show the full set of ten bowling pins, with the #1 pin shattering into pieces from the powerful impact.", "A dynamic starburst effect must emanate from the point of impact between the ball and the #1 pin.", "The impact must be shown causing the adjacent #2 and #3 pins to begin tilting backwards, starting a chain reaction.", "The other seven pins must be standing but showing slight vibrations.", "Incorporate sharp speed lines trailing the bowling ball to indicate it was thrown at high speed.", "The perspective must be low and close to the pins, looking down the lane, to heighten the drama.", "Include the pin deck and dark gutters of the bowling lane, with reflections from the polished wood."], "prompt": "Write `svg` code for an image of a bowling ball just as it makes impact with the front pin, sending it flying back into the others.", "id": "write_`svg`_code"} +{"requirements": ["The image must be a close-up on a bomb-like device composed of a bundle of dynamite sticks wrapped in tape.", "The device must feature a bundle of multi-colored wires (red, blue, green, yellow) leading to a timer.", "A prominent red digital timer must be part of the device, clearly displaying the numbers '0:07'.", "Show a pair of hands wearing 
thick, black, protective bomb-disposal gloves.", "One hand must be holding a pair of wire cutters, with its blades actively cutting the red wire.", "The red wire must be shown partially severed, with a small white and yellow spark at the point of the cut.", "The other gloved hand must be steadying the bundle, with its fingers separating the red wire from the adjacent blue and green wires.", "The scene must be tense and focused, with a dark, out-of-focus background to isolate the action.", "The composition must create a tight focal point on the interaction between the wire cutters, the sparking red wire, and the timer."], "prompt": "Write `svg` code for an image of a bomb disposal expert cutting the red wire on a complex-looking bomb with a timer that reads \"0:07\".", "id": "write_`svg`_code"} +{"requirements": ["The image must be a diagrammatic cross-section of a plant, bisected by a horizontal line representing the ground.", "The below-ground section must show a root system spreading into dark brown soil containing blue water particles and brown nutrient particles.", "Blue arrows must originate at the water particles, enter the root tips, and travel up a channel (xylem) in the stem.", "The above-ground section must show the plant's stem, green leaves, and a yellow flower.", "Include a bright yellow sun in the top-left corner.", "Yellow arrows must represent sunlight traveling from the sun and pointing to the surfaces of the leaves.", "Gray arrows labeled 'CO2' must point from the air into the leaves.", "Light blue arrows labeled 'O2' must point from the leaves out into the air.", "The leaves must contain small green dots representing chloroplasts, where the sunlight arrows terminate.", "A second set of orange arrows labeled 'Sugars' must originate in the leaves and travel down a channel (phloem) in the stem towards the roots, showing the distribution of energy from photosynthesis."], "prompt": "Write `svg` code for a cross-section of a plant, showing the roots 
absorbing water from the soil and the leaves using sunlight for photosynthesis.", "id": "write_`svg`_code"} +{"requirements": ["Include the head and upper torso of a watchmaker, with a wrinkled brow to show intense concentration.", "A brass watchmaker's loupe must be fitted over the watchmaker's right eye.", "The reflection of the watch mechanism and tweezers must be visible on the lens of the loupe.", "The watchmaker's left hand must be steadying the casing of an open mechanical watch.", "The right hand must be holding a pair of fine-tipped tweezers, which are gripping a single, tiny brass watch gear.", "The open mechanical watch must reveal an intricate interior of interlocking silver and brass gears, springs, and red jewel bearings.", "The gear held by the tweezers must be positioned directly above an empty axle in the mechanism, fractions of a millimeter from being seated.", "A focused cone of light from an overhead desk lamp must illuminate the watch, the hands, and the tools.", "The background must be a dark, out-of-focus workshop to draw all attention to the detailed foreground action."], "prompt": "Write `svg` code for an image of a watchmaker using a loupe and fine tweezers to place a tiny gear into the intricate mechanism of a mechanical watch.", "id": "write_`svg`_code"} +{"requirements": ["Depict a child's face in three-quarter view, with puffed cheeks and puckered lips.", "The child must be holding a plastic bubble wand, with the ring held to their lips.", "A single, large bubble must be emerging from the wand, still connected to the soapy film in the wand's loop.", "The surface of the half-formed bubble must be transparent and show swirling, iridescent, rainbow-like colors.", "The distorted reflection of the child's face must be visible on the surface of the bubble.", "Use varying levels of transparency and opacity to make the bubble look delicate.", "The background must be a bright, sunny day in a grassy field with a clear blue sky.", "Include two 
fully-formed, iridescent bubbles floating away in the background, with distorted reflections of the sky and grass on their surfaces."], "prompt": "Write `svg` code for an image of a child blowing a bubble with a wand, with the bubble half-formed and showing iridescent reflections.", "id": "write_`svg`_code"} +{"requirements": ["Illustrate a blacksmith figure with muscular arms, wearing a heavy leather apron over a simple shirt.", "The blacksmith must be holding a long pair of tongs, gripping a sword by its tang (the part that goes into the hilt).", "The entire blade of the sword must be glowing a bright orange-yellow, indicating it is white-hot.", "The tip of the glowing sword is just clearing the opening of a brick forge, which is filled with glowing red and orange embers.", "An anvil must be positioned in front of the blacksmith, who is turned towards it, ready for the next action.", "Visual effects must include sparks flying from the sword, and a heat-haze shimmer effect around the blade.", "The surrounding workshop must be dimly lit, with the intense light from the forge and hot sword casting strong orange highlights on the blacksmith's face, arms, apron, and the face of the anvil.", "The blacksmith's posture must convey the effort of holding the hot metal, with tense muscles and a focused expression."], "prompt": "Write `svg` code for an image of a blacksmith pulling a glowing orange sword from a forge with a pair of tongs, ready to place it on an anvil.", "id": "write_`svg`_code"} +{"requirements": ["Draw a detailed, fuzzy bumblebee with black and yellow stripes and transparent, veined wings.", "The bumblebee must be positioned on a large pink flower petal, its weight causing the petal to bend downwards slightly.", "Draw the detailed pink flower, showing all its petals, a yellow pistil, and multiple stamens.", "The bee's proboscis (tongue) must be extended and physically touching the pistil in the center of the flower to collect nectar.", "Visible yellow 
pollen grains must be stuck to the bee's fuzzy legs and abdomen.", "The flower's stamens must also be covered in yellow pollen, and the bee's legs must be brushing against them, dislodging a few grains that are falling onto the petal below.", "Use a macro perspective to highlight the interaction between the bee and the flower's reproductive parts.", "Include the top of the flower's green stem and a single green leaf for context.", "Use vibrant, saturated colors for both the flower and the bee to create a lively scene."], "prompt": "Write `svg` code for an image of a bee pollinating a flower, with pollen grains visibly stuck to its legs as it collects nectar.", "id": "write_`svg`_code"} +{"requirements": ["Depict a person in a white lab coat, positioned at a lab bench.", "The scientist's right eye must be pressed to the eyepiece of a microscope.", "The scientist's left hand must be turning a focus knob on the side of the microscope.", "A clear petri dish must be on the microscope's stage, held by stage clips, directly under the objective lens.", "The petri dish must contain a yellow culture medium with several visible white colonies of bacteria.", "The microscope must be clearly drawn with an eyepiece, body, a turret with three objective lenses, a stage, and a light source at the base that is switched on.", "Include a circular inset view in the top-right corner, representing the microscope's view, which shows a magnified, stylized image of the bacteria from one of the colonies in the petri dish.", "A line must connect the inset view to the microscope's eyepiece to clarify the relationship.", "The background must show other laboratory equipment, such as beakers and test tube racks, to establish the setting."], "prompt": "Write `svg` code for an image of a scientist adjusting the focus on a microscope to look at bacteria in a petri dish.", "id": "write_`svg`_code"} +{"requirements": ["The central focus must be a large, fully decorated Christmas tree, topped with a 
glowing yellow star.", "Show a family of three people of different ages actively decorating the tree.", "An adult must be lifting a small child up, so the child can reach out and hang a red bauble ornament on a high branch.", "The third person, a teenager, must be kneeling down to arrange a string of colored lights around the lower part of the tree.", "The tree must be heavily decorated with a variety of ornaments, garlands, and the aforementioned lights, which are visibly plugged into a wall socket.", "The scene is set in a cozy living room with a fireplace in the background, in which a fire is burning.", "Include a pile of colorfully wrapped presents under the tree, some of which are being nudged by the teenager's feet.", "All characters must have joyful, smiling expressions, focused on their shared activity.", "The lighting must be warm and inviting, with the primary light source being the glow from the tree's lights and the fireplace, casting soft shadows in the room."], "prompt": "Write `svg` code for an image of a family decorating a Christmas tree together, with one person placing the star on top while others hang ornaments.", "id": "write_`svg`_code"} +{"requirements": ["Illustrate a large, vertical rock cliff face with texture and deep cracks.", "Show a climber figure halfway up the cliff, with their body twisted towards the rock.", "The climber must be wearing a red harness, a white helmet, and dark climbing shoes.", "The climber is in a dynamic pose: their left foot is on a small foothold, their right hand is gripping a side-pull handhold, and their left arm is fully extended, reaching for a chalk-dusted handhold just out of reach.", "A bright green climbing rope must be attached to the climber's harness via a visible figure-eight knot.", "The rope must run from the climber's harness, up through a quickdraw attached to the rock just above them, and then down in a straight, taut line to the belayer at the bottom of the cliff.", "The belayer must be 
anchored to the base of the cliff, looking up at the climber, with both hands on the rope as it feeds through a belay device attached to their harness.", "The handhold the climber is reaching for should be visibly smaller than the ones they are currently using.", "The background should be a simple blue sky with a few clouds below the climber's position to emphasize the height and exposure."], "prompt": "Write `svg` code for an image of a rock climber halfway up a cliff face, reaching for a handhold while their belayer manages the rope below.", "id": "write_`svg`_code"} +{"requirements": ["Use a cutaway, cross-sectional view to show the internal mechanism of a pin-tumbler lock, including the outer housing and the inner rotating plug.", "A brass key must be fully inserted into the lock's keyway.", "The key's bitting must be clearly visible, with five distinct cuts of varying depths.", "Depict exactly five vertical pin stacks inside the lock cylinder, passing through both the plug and the housing.", "Each stack must consist of a lower 'key pin' and an upper 'driver pin', with the key pins having different lengths corresponding to the key's cuts.", "Show compressed springs above each driver pin, pushing the entire stack downwards.", "The five cuts on the key's bitting must be shown lifting each corresponding key pin, causing the five gaps between the key pins and driver pins to align perfectly with the 'shear line'.", "The shear line (the gap between the inner rotating plug and the outer housing) must be a clearly defined, continuous horizontal line across all five pin channels.", "Include a rotational arrow to show that the key and the now-unobstructed plug are turning clockwise.", "Show a cam on the back of the rotating plug making contact with the lock's bolt mechanism, causing the bolt to retract into the lock housing."], "prompt": "Write `svg` code for a diagram showing a lock-and-key mechanism, with the key inserted and turning the tumblers to align them and 
unlock the bolt.", "id": "write_`svg`_code"} +{"requirements": ["Draw a large, heavy, spherical wrecking ball, slightly flattened on the side making contact with the wall.", "The ball must be attached to a thick, taut steel cable, which leads up and off-screen along a clear swing arc.", "Use motion blur lines that follow the arc of the swing to show the ball is in powerful motion from left to right.", "Depict a red brick wall that the ball is striking.", "The image must capture the exact moment of impact, with the ball embedded slightly into the wall.", "Show a concave crater forming on the wall that matches the curvature of the wrecking ball.", "Illustrate an explosion of debris flying outward from the impact point, with the trajectory of the fragments moving away from the ball's point of contact. The debris must include dust clouds, small brick fragments, and at least three whole bricks.", "The flying brick fragments must be shown frozen in mid-air, with some rotating.", "The rest of the wall must have cracks radiating out from the perimeter of the crater.", "The composition must convey a strong sense of force, with the motion lines of the ball and the trajectory of the debris creating a focused point of action."], "prompt": "Write `svg` code for an image of a wrecking ball in mid-swing, just making contact with a brick wall and sending debris flying.", "id": "write_`svg`_code"} +{"requirements": ["Show a person in a dynamic fishing stance on the grassy bank of a river.", "The fisherman should be holding a fishing rod, with their body twisted as if having just completed a cast.", "The fishing rod must be bent in a slight arc, as if it is un-flexing after being whipped forward.", "A thin fishing line must be shown unspooling from the reel, going through the guides on the rod, and extending from the very tip of the rod.", "The fishing line should be drawn in a long, graceful arc that starts at the rod tip and extends over the water.", "A red and white fishing lure 
must be clearly visible at the end of the line, positioned at the apex of the arc in mid-flight.", "The river should have gentle ripples, with a small disturbance in the water at the bank where the fisherman is standing.", "The background should be a natural outdoor scene with trees on the distant shore and a clear sky, establishing the direction of the cast.", "The fisherman's posture, the recovering bend of the rod, and the arc of the line must all work together to illustrate the single, fluid action of casting."], "prompt": "Write `svg` code for an image of a fisherman casting a line into a river, with the lure flying through the air at the end of the line.", "id": "write_`svg`_code"} +{"requirements": ["Depict a child, identifiable by youthful features, kneeling on wet sand.", "The child should be looking towards an incoming wave with a surprised expression, with one hand raised from their work.", "In front of the child, show a sandcastle with at least two towers and a connecting wall. 
A small plastic shovel should be stuck in the top of one tower.", "Include a large ocean wave, distinct from the calmer water, that is in the process of breaking.", "The white foam from the breaking wave must be shown actively washing over the base of the sandcastle, dissolving the lower part of the walls.", "Show the leading edge of the water and foam beginning to surround the child's knees.", "The setting must be a beach, with the sand around the castle being visibly darker and wet from the approaching water.", "Include a horizon line separating the sea and sky, with the wave rising above it to show its size.", "The motion of the wave, the dissolving sandcastle, and the child's reaction must be clearly linked to convey the moment of destruction."], "prompt": "Write `svg` code for an image of a child building a sandcastle at the beach, just as a wave is beginning to crash and wash it away.", "id": "write_`svg`_code"} +{"requirements": ["Include a person representing a DJ, positioned behind a table with equipment.", "Show two turntables, with a vinyl record on the one closer to the viewer.", "The record on the primary turntable must have radial motion lines to indicate it is spinning.", "One of the DJ's hands must be on the spinning record, with fingers curled, in a pose that suggests they are actively scratching the record back and forth.", "Include a DJ mixer positioned between the two turntables, with visible knobs and faders.", "The DJ's other hand must be shown gripping and moving the main horizontal crossfader on the mixer.", "The DJ should be wearing headphones, but with one earcup pushed back off their ear, allowing them to listen to the room.", "The overall scene should be dimly lit, with a single spotlight illuminating the DJ and their equipment, clearly distinguishing the turntables and the central mixer."], "prompt": "Write `svg` code for an image of a DJ at a turntable, with one hand on a spinning record and the other adjusting a slider on the mixer.", 
"id": "write_`svg`_code"} +{"requirements": ["Depict two figures: a tailor and a customer, central to the image.", "The tailor should be shown standing slightly to the side of the customer, focused intently on their work.", "The customer should be standing straight with arms held slightly out to their sides, wearing a form-fitting white dress shirt.", "A yellow, flexible measuring tape must be wrapped snugly around the customer's chest, under their arms.", "The tailor must be holding the measuring tape where the end overlaps the numbered scale, pinching it with their thumb and forefinger to mark the measurement. The tailor's eyes should be looking down at this exact point on the tape.", "The numbers on the measuring tape should be visible where the tailor is holding it.", "The background must be a tailor's shop, with a large three-way mirror behind the customer, reflecting the back of the customer and the tailor's action.", "The focused gaze of the tailor and the precise hold on the tape must clearly communicate the act of taking a measurement."], "prompt": "Write `svg` code for an image of a tailor taking measurements for a suit, wrapping a measuring tape around a customer's chest.", "id": "write_`svg`_code"} +{"requirements": ["The image must be a network diagram, not a realistic scene.", "Include a single, central circular node clearly labeled 'Server'. 
The server icon itself should appear cracked or fractured.", "Include at least five source nodes positioned in an arc around the server node.", "The source nodes must be styled to look malicious, each containing a skull and crossbones icon and colored dark grey.", "Draw a dense flood of lines representing traffic packets, so numerous that they almost merge into solid beams of light.", "All traffic packet lines must originate from the malicious nodes and converge on the central server, creating a visual bottleneck at the server's edge.", "Use arrows on the lines to indicate the unidirectional flow of data towards the server.", "The central server node must have a prominent circular status indicator on it.", "The server's status indicator must be glowing bright red to signify a critical overload or 'down' state, in stark contrast to the dark attacker nodes.", "The visual effect must be one of the central server being completely overwhelmed, with the incoming lines obscuring parts of the server node itself."], "prompt": "Write `svg` code for a diagram of a computer network under a DDoS attack, showing multiple malicious source nodes flooding a central server node with traffic packets, causing its status indicator to turn red.", "id": "write_`svg`_code"} +{"requirements": ["Depict a person in a stable archer's stance, positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their hand is anchored firmly under their chin, and the string touches their lips.", "Show a longbow that is visibly and deeply bent under the tension of the draw.", "The archer's left arm should be fully extended towards the target, holding the bow steady, with visible tension in the shoulder and arm muscles.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the archer's extended hand.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The 
composition must create a strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance.", "id": "write_`svg`_code"} +{"requirements": ["Depict a person in a stable archer's stance, with their body positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their drawing hand is anchored firmly at the corner of their mouth.", "Show a longbow that is visibly and deeply bent under the tension from the drawn string.", "The archer's other arm must be fully extended towards the target, holding the bow steady.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the bow, with its tip pointing directly at the target.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The composition must create a strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance.", "id": "write_`svg`_code"} +{"requirements": ["Show a person leaning into their work while holding and operating a leaf blower, bracing against its force.", "The leaf blower must be a recognizable shape with a main body, a handle the person is gripping, and a long nozzle aimed directly at a pile of leaves.", "Depict a large, dense pile of leaves on a suburban lawn. 
The side of the pile facing the leaf blower must be visibly caved in from the force of the air.", "The leaves must have a mix of autumn colors (red, orange, yellow, and brown).", "Show a powerful, visible stream of air, represented by transparent white motion lines, emanating from the blower's nozzle and directly hitting the caved-in side of the leaf pile.", "A cloud of individual leaves must be shown being lifted from the pile and propelled through the air, following the path of the air stream away from the nozzle.", "The setting must be a suburban lawn. A patch of grass where the leaves have been blown from must be clear, contrasting with the area still covered by the main pile.", "The distinction between the static, dense pile of leaves and the individual, airborne leaves must be clear and show a direct cause-and-effect relationship with the air stream from the blower."], "prompt": "Write `svg` code for an image of a person using a leaf blower to clear a large pile of autumn leaves from a suburban lawn.", "id": "write_`svg`_code"} +{"requirements": ["Depict a person in a camping environment, kneeling on the ground and leaning forward over a fire pit.", "The person must be holding a dark grey flint in one hand and a steel striker in the other.", "Show the flint and steel positioned directly over a tinder bundle, in the act of being struck together.", "A shower of bright yellow sparks must be visibly emanating from the point of contact, directed downwards towards the tinder.", "Include a tinder bundle made of fine wood shavings and dry grass, placed at the center of a stone fire pit on the ground.", "Show exactly two sparks landing on the top of the tinder bundle.", "Depict a small, bright orange glow and a thin wisp of white smoke rising from the exact spot where the two sparks have landed on the tinder.", "The person's head should be tilted down, with their gaze fixed on the glowing spot on the tinder.", "The setting must include the stone fire pit on dirt 
ground, with the dark silhouettes of several pine trees visible in the background."], "prompt": "Write `svg` code for an image of a camper starting a fire with a flint and steel, with the first sparks just catching on a tinder bundle.", "id": "write_`svg`_code"} +{"requirements": ["The image must be a diagram illustrating the process of nuclear fission with clear labels and arrows.", "Show a small blue circle labeled 'Neutron' with a solid black arrow indicating its trajectory towards a large, purple nucleus.", "The large nucleus must be labeled 'Uranium-235 Nucleus' and be depicted in the process of splitting into two smaller, unequal-sized nuclei.", "The two new nuclei, labeled 'Fission Fragment', must be shown moving in opposite directions away from the point of fission, each with a directional arrow.", "Show exactly three new blue circles, identical to the first and labeled 'Neutron', being ejected from the splitting nucleus.", "Each of the three released neutrons must have its own arrow indicating its outward trajectory, with one pointing towards the edge of the frame to suggest a continuing chain reaction.", "A bright yellow flash must emanate from the center of the splitting Uranium-235 Nucleus.", "Wavy red lines, representing energy, must radiate outwards from the yellow flash, passing between the departing fission fragments and neutrons."], "prompt": "Write `svg` code for an image of a nuclear fission reaction, showing a neutron striking a uranium nucleus, causing it to split into smaller elements and release more neutrons.", "id": "write_`svg`_code"} +{"requirements": ["Depict a person in professional sommelier attire, including a tastevin necklace.", "The sommelier must be holding a dark green wine bottle, tilted so that a thin stream of red wine flows from its mouth directly into the opening of a glass decanter positioned on a table below it.", "With their other hand, the sommelier must hold a single, lit white candle, positioning its yellow flame 
directly behind the neck of the wine bottle.", "The flame from the candle must cast a bright, glowing light through the bottle's neck.", "Inside the bottle, illuminated by the candle's flame, a small collection of dark specks representing sediment must be visible, gathered at the bottle's shoulder, prevented from being poured.", "The stream of wine flowing into the decanter must be clear and free of any sediment.", "The background must depict a wine cellar, with the curved tops of wooden wine barrels and a stone archway visible behind the sommelier."], "prompt": "Write `svg` code for an image of a sommelier pouring a small amount of wine from a bottle into a decanter, holding a candle behind the bottle's neck to check for sediment.", "id": "write_`svg`_code"} +{"requirements": ["Depict a ginger tabby cat lying on its back on a wooden floor in a playful pose.", "The cat must have its front paws actively batting a ball of blue yarn that is positioned directly above its chest.", "The ball of yarn must be partially unraveled, with a long, continuous strand of yarn trailing away from it.", "This loose strand of yarn must be tangled around one of the cat's rear legs before continuing to spread in a chaotic, looping mess across the floor.", "The cat must have wide, focused eyes and a slightly open mouth, indicating playful excitement directed at the yarn.", "The scene must take place on a light-colored wooden floor, with the parallel lines of the floorboards clearly visible beneath the cat and the yarn.", "At least one of the cat's claws must be visible, slightly snagged in the ball of yarn."], "prompt": "Write `svg` code for an image of a cat playfully batting at a ball of yarn, causing it to unravel across a wooden floor.", "id": "write_`svg`_code"} diff --git a/tests/pytest/data/svgbench_sample_dataset.jsonl b/tests/pytest/data/svgbench_sample_dataset.jsonl new file mode 100644 index 00000000..d5c38370 --- /dev/null +++ b/tests/pytest/data/svgbench_sample_dataset.jsonl 
@@ -0,0 +1,3 @@ +{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"} +{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot 
of the [Google homepage](https://google.com).", "id": "google_homepage"} +{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"} diff --git a/tests/pytest/test_svgbench.py b/tests/pytest/test_svgbench.py new file mode 100644 index 00000000..364db365 --- /dev/null +++ b/tests/pytest/test_svgbench.py @@ -0,0 +1,379 @@ +""" +SVGBench evaluation test for EvalProtocol.io. + +This test evaluates LLM ability to generate SVG code that meets specific visual requirements. +The evaluation process includes: +1. SVG code generation from text prompts +2. SVG to PNG rendering using Selenium +3. LLM judge evaluation of requirement fulfillment +4. 
Scoring based on fulfilled requirements ratio +""" + +import base64 +import json +import logging +import os +import re +import tempfile +from typing import Any, Dict, List, Optional + +import litellm +from pydantic import BaseModel + +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import default_single_turn_rollout_processor + +logger = logging.getLogger(__name__) + + +class SVGBenchResponse(BaseModel): + number_of_fulfilled_requirements: int + + +def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert SVGBench dataset entries to EvaluationRow objects. + + Args: + data: List of dictionaries containing prompt and requirements + + Returns: + List of EvaluationRow objects + """ + rows = [] + + for row in data: + # Format requirements as numbered list + requirements = "\n".join([f"{i+1}. {req}" for i, req in enumerate(row["requirements"])]) + + # Create the generation prompt following SVGBench format + prompt = f"""{row['prompt']} Wrap the SVG code in an SVG code block following the example below. + +Example: +```svg + + + +``` + +Requirements: +{requirements}""" + + eval_row = EvaluationRow( + messages=[Message(role="user", content=prompt)], + input_metadata=InputMetadata( + row_id=row["id"], + dataset_info={ + "original_prompt": row["prompt"], + "requirements": row["requirements"], + "total_requirements": len(row["requirements"]), + "formatted_prompt": prompt, + }, + ), + ) + + rows.append(eval_row) + + return rows + + +def extract_svg_code(text: str) -> Optional[str]: + """ + Extract SVG code from model response using SVGBench's extraction logic. 
+ + Args: + text: Raw model response text + + Returns: + Extracted SVG code or None if not found + """ + # First try: Look for ```svg code blocks + if "```svg" in text: + svg_parts = text.split("```svg") + if len(svg_parts) > 1: + svg_code = svg_parts[1].split("```")[0].strip() + return svg_code + + # Second try: Look for ... tags + if "" in text: + start = text.find("") + 6 + svg_code = text[start:end].strip() + return svg_code + + return None + + +def render_svg_to_png(svg_code: str, output_path: str) -> bool: + """ + Render SVG code to PNG using Selenium WebDriver. + + Args: + svg_code: Valid SVG code + output_path: Path where PNG should be saved + + Returns: + True if successful, False otherwise + """ + try: + # Check if selenium and webdriver are available + try: + from selenium import webdriver + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.common.by import By + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.support.ui import WebDriverWait + except ImportError: + logger.error("Selenium not available. 
Install with: pip install selenium") + return False + + # Parse SVG dimensions + width, height = 800, 600 # Default dimensions + + # Try to extract dimensions from SVG + width_match = re.search(r'width="(\d+)"', svg_code) + height_match = re.search(r'height="(\d+)"', svg_code) + viewbox_match = re.search(r'viewBox="[^"]*?(\d+)\s+(\d+)"', svg_code) + + if width_match and height_match: + width, height = int(width_match.group(1)), int(height_match.group(1)) + elif viewbox_match: + width, height = int(viewbox_match.group(1)), int(viewbox_match.group(2)) + + # Create HTML wrapper + html_content = f""" + + + + + + + + {svg_code} + + + """ + + # Set up Chrome options + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument(f"--window-size={width+40},{height+40}") + + # Create temporary HTML file + with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f: + f.write(html_content) + html_path = f.name + + try: + # Initialize WebDriver + driver = webdriver.Chrome(options=chrome_options) + driver.get(f"file://{html_path}") + + # Wait for SVG to load + WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "svg"))) + + # Take screenshot + driver.save_screenshot(output_path) + driver.quit() + + return True + + finally: + # Clean up temporary file + os.unlink(html_path) + + except Exception as e: + logger.error(f"SVG rendering failed: {e}") + return False + + +def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]: + """ + Use LLM judge to evaluate how many requirements are fulfilled. + Uses GPT-4o for vision capabilities to match project's model preferences. 
(note original repo uses Gemini 2.5 flashs) + + Args: + image_path: Path to rendered PNG image + requirements: List of requirements to evaluate + + Returns: + Dictionary with evaluation results + """ + # Format requirements for evaluation (exactly as in original) + requirements_text = "\n".join([f"{i+1}. {req}" for i, req in enumerate(requirements)]) + + # Create evaluation prompt with JSON response format + evaluate_prompt = f"""Examine the generated image. How many of the following {len(requirements)} requirements were fulfilled? + +Be strict about the requirements and respond ONLY with a JSON object in this exact format: +{{"number_of_fulfilled_requirements": }} + +Where is a number between 0 and {len(requirements)}. + +Requirements: +{requirements_text}""" + + # Read and encode image + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode("utf-8") + + # Prepare messages with image + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": evaluate_prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}, + ], + } + ] + + # Use GPT-4o for vision capabilities to match project's OpenAI model preference + response = litellm.completion( + model="gpt-4o", + messages=messages, + temperature=0.0, + max_tokens=200, + response_format={ + "type": "json_schema", + "json_schema": {"name": "SVGBenchResponse", "schema": SVGBenchResponse.model_json_schema()}, + }, + ) + + # Parse response + response_content = response.choices[0].message.content + + # Handle empty response + if not response_content or response_content.strip() == "": + raise ValueError("Empty response from LLM judge") + + result = json.loads(response_content) + + # Validate the result + if "number_of_fulfilled_requirements" in result: + return result + else: + raise ValueError("Missing required field in response") + + +@evaluation_test( + input_dataset=["tests/pytest/data/svgbench_dataset.jsonl"], + 
dataset_adapter=svgbench_to_evaluation_row, + completion_params=[ + {"temperature": 0.0, "max_tokens": 4096, "model": "gpt-4.1"}, + { + "temperature": 0.8, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "extra_body": {"reasoning_effort": "high"}, + }, + ], + rollout_processor=default_single_turn_rollout_processor, + passed_threshold=0.5, # 50% average score to pass + num_runs=1, + mode="pointwise", + max_concurrent_rollouts=3, +) +def test_svg_generation_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Test SVG generation and evaluation using SVGBench methodology. + + This test: + 1. Extracts SVG code from the model's response + 2. Renders SVG to PNG using Selenium + 3. Uses LLM judge to evaluate requirement fulfillment + 4. Calculates score based on fulfilled requirements ratio + + Args: + row: EvaluationRow with model's SVG generation response + + Returns: + EvaluationRow with evaluation results + """ + # Extract dataset info + requirements = row.input_metadata.dataset_info["requirements"] + total_requirements = row.input_metadata.dataset_info["total_requirements"] + original_prompt = row.input_metadata.dataset_info["original_prompt"] + row_id = row.input_metadata.row_id + + # Check if we should save debug files + save_debug_files = os.environ.get("SVGBENCH_SAVE_DEBUG_FILES", "false").lower() == "true" + + # Get model response + if not row.messages or len(row.messages) < 2: + row.evaluation_result = EvaluateResult(score=0.0, reason="No model response found") + return row + + model_response = row.messages[-1].content + + # Extract SVG code with better error reporting (matching original) + try: + svg_code = extract_svg_code(model_response) + if not svg_code: + raise ValueError("No valid SVG code found in response") + except Exception as e: + logger.error(f"Error extracting SVG code for question {row_id}: {e}") + if save_debug_files: + logger.error(f"Full response: {model_response}") + + row.evaluation_result = EvaluateResult(score=0.0, 
reason=f"SVG extraction failed: {str(e)}") + return row + + # Setup file paths + if save_debug_files: + # Create debug directory + model = row.input_metadata.completion_params["model"] + # Sanitize model name for filesystem (replace slashes with underscores) + safe_model_name = model.replace("/", "_").replace(":", "_") + debug_dir = "svgbench_debug" + os.makedirs(debug_dir, exist_ok=True) + png_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.png") + svg_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.svg") + # Save SVG file for debugging + with open(svg_path, "w") as f: + f.write(svg_code) + else: + # Use temporary file + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + png_path = f.name + + try: + # Render SVG to PNG + if not render_svg_to_png(svg_code, png_path): + row.evaluation_result = EvaluateResult(score=0.0, reason="Failed to render SVG to PNG") + return row + + # Evaluate with LLM judge + judge_result = evaluate_with_llm_judge(png_path, requirements) + + # Calculate score + fulfilled_count = judge_result.get("number_of_fulfilled_requirements", 0) + fulfilled_count = max(0, min(fulfilled_count, total_requirements)) # Clamp to valid range + score = fulfilled_count / total_requirements + + row.evaluation_result = EvaluateResult( + score=score, + reason=f"Fulfilled {fulfilled_count}/{total_requirements} requirements ({score:.1%}) for prompt: '{original_prompt}'", + ) + + return row + + except Exception as e: + logger.error(f"Evaluation failed for question {row_id}: {e}") + row.evaluation_result = EvaluateResult(score=0.0, reason=f"Evaluation error: {str(e)}") + return row + + finally: + # Clean up temporary PNG file (only if not saving debug files) + if not save_debug_files: + try: + if os.path.exists(png_path): + os.unlink(png_path) + except Exception: + pass diff --git a/uv.lock b/uv.lock index 4a9008a2..d439a47a 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 
+revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -1207,6 +1207,9 @@ langfuse = [ openevals = [ { name = "openevals" }, ] +svgbench = [ + { name = "selenium" }, +] trl = [ { name = "accelerate" }, { name = "peft" }, @@ -1282,6 +1285,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=5.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "rich", specifier = ">=12.0.0" }, + { name = "selenium", marker = "extra == 'svgbench'", specifier = ">=4.0.0" }, { name = "swig", marker = "extra == 'box2d'" }, { name = "toml", specifier = ">=0.10.0" }, { name = "torch", marker = "extra == 'trl'", specifier = ">=1.9" }, @@ -1300,7 +1304,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "svgbench"] [package.metadata.requires-dev] dev = [ @@ -3798,6 +3802,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/43/0c/f75015669d7817d222df1bb207f402277b77d22c4833950c8c8c7cf2d325/orjson-3.11.0-cp313-cp313-win_arm64.whl", hash = "sha256:51cdca2f36e923126d0734efaf72ddbb5d6da01dbd20eab898bdc50de80d7b5a", size = 126349, upload-time = "2025-07-15T16:08:00.322Z" }, ] +[[package]] +name = "outcome" +version = "1.3.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/98/df/77698abfac98571e65ffeb0c1fba8ffd692ab8458d617a0eed7d9a8d38f2/outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8", size = 21060, upload-time = "2023-10-26T04:26:04.361Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/55/8b/5ab7257531a5d830fc8000c476e63c935488d74609b50f9384a643ec0a62/outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b", size = 10692, upload-time = "2023-10-26T04:26:02.532Z" }, +] + [[package]] name = "overrides" version = "7.7.0" @@ -4569,6 +4585,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216, upload-time = "2024-09-29T09:24:11.978Z" }, ] +[[package]] +name = "pysocks" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/11/293dd436aea955d45fc4e8a35b6ae7270f5b8e00b53cf6c024c83b657a11/PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0", size = 284429, upload-time = "2019-09-20T02:07:35.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/59/b4572118e098ac8e46e399a1dd0f2d85403ce8bbaad9ec79373ed6badaf9/PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5", size = 16725, upload-time = "2019-09-20T02:06:22.938Z" }, +] + [[package]] name = "pytest" version = "8.4.1" @@ -5351,6 +5376,23 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/24/b4293291fa1dd830f353d2cb163295742fa87f179fcc8a20a306a81978b7/SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99", size = 15221, upload-time = "2022-08-13T16:22:44.457Z" }, ] +[[package]] +name = "selenium" +version = "4.35.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "trio" }, + { name = "trio-websocket" }, + { name = "typing-extensions" }, + { name = 
"urllib3", extra = ["socks"] }, + { name = "websocket-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/75/67/9016942b5781843cfea6f5bc1383cea852d9fa08f85f55a0547874525b5c/selenium-4.35.0.tar.gz", hash = "sha256:83937a538afb40ef01e384c1405c0863fa184c26c759d34a1ebbe7b925d3481c", size = 907991, upload-time = "2025-08-12T15:46:40.822Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/ef/d0e033e1b3f19a0325ce03863b68d709780908381135fc0f9436dea76a7b/selenium-4.35.0-py3-none-any.whl", hash = "sha256:90bb6c6091fa55805785cf1660fa1e2176220475ccdb466190f654ef8eef6114", size = 9602106, upload-time = "2025-08-12T15:46:38.244Z" }, +] + [[package]] name = "send2trash" version = "1.8.3" @@ -5409,6 +5451,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "sortedcontainers" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" }, +] + [[package]] name = "soupsieve" version = "2.7" @@ -5794,6 +5845,39 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/41/b1/d7520cc5cb69c825599042eb3a7c986fa9baa8a8d2dea9acd78e152c81e2/transformers-4.53.3-py3-none-any.whl", hash = "sha256:5aba81c92095806b6baf12df35d756cf23b66c356975fb2a7fa9e536138d7c75", size = 10826382, upload-time = "2025-07-22T07:30:48.458Z" }, ] +[[package]] +name = "trio" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "cffi", marker = "implementation_name != 'pypy' and os_name == 'nt'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "idna" }, + { name = "outcome" }, + { name = "sniffio" }, + { name = "sortedcontainers" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/01/c1/68d582b4d3a1c1f8118e18042464bb12a7c1b75d64d75111b297687041e3/trio-0.30.0.tar.gz", hash = "sha256:0781c857c0c81f8f51e0089929a26b5bb63d57f927728a5586f7e36171f064df", size = 593776, upload-time = "2025-04-21T00:48:19.507Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/8e/3f6dfda475ecd940e786defe6df6c500734e686c9cd0a0f8ef6821e9b2f2/trio-0.30.0-py3-none-any.whl", hash = "sha256:3bf4f06b8decf8d3cf00af85f40a89824669e2d033bb32469d34840edcfc22a5", size = 499194, upload-time = "2025-04-21T00:48:17.167Z" }, +] + +[[package]] +name = "trio-websocket" +version = "0.12.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "outcome" }, + { name = "trio" }, + { name = "wsproto" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d1/3c/8b4358e81f2f2cfe71b66a267f023a91db20a817b9425dd964873796980a/trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae", size = 33549, upload-time = "2025-02-25T05:16:58.947Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c7/19/eb640a397bba49ba49ef9dbe2e7e5c04202ba045b6ce2ec36e9cadc51e04/trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6", size = 21221, upload-time = "2025-02-25T05:16:57.545Z" }, +] + [[package]] name = "triton" version = "3.3.1" @@ -5971,6 +6055,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] +[package.optional-dependencies] +socks = [ + { name = "pysocks" }, +] + [[package]] name = "uvicorn" version = "0.35.0" From 3318f22bb822ad5c8e8be764fe24737812db4e05 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:35:14 -0700 Subject: [PATCH 18/26] Checkpointing + Error Retry for Rollout Processor (#80) * Finished Error Handling * Address comments * Changing the rollout processors * cleaning up mcp gym * remove import * Update * failing test * fixing flaky test * update comments --- .github/workflows/ci.yml | 1 + eval_protocol/benchmarks/suites/aime25.py | 4 +- eval_protocol/benchmarks/suites/gpqa.py | 43 ++-- .../suites/livebench_data_analysis.py | 8 +- .../benchmarks/suites/tau_bench_retail.py | 4 +- eval_protocol/mcp/execution/manager.py | 21 +- eval_protocol/mcp_env.py | 24 +- eval_protocol/pytest/__init__.py | 21 +- .../pytest/default_agent_rollout_processor.py | 69 +++--- .../default_mcp_gym_rollout_processor.py | 93 ++++--- .../pytest/default_no_op_rollout_process.py | 15 -- .../pytest/default_no_op_rollout_processor.py | 27 ++ .../default_single_turn_rollout_process.py | 212 ++++++++-------- eval_protocol/pytest/evaluation_test.py | 218 ++++------------ eval_protocol/pytest/plugin.py | 25 ++ eval_protocol/pytest/rollout_processor.py | 21 ++ 
eval_protocol/pytest/types.py | 6 +- eval_protocol/pytest/utils.py | 233 +++++++++++++++++- examples/gpqa/tests/test_gpqa.py | 4 +- examples/healthbench/tests/test_evaluation.py | 4 +- tests/pytest/test_apps_coding.py | 4 +- tests/pytest/test_basic_coding.py | 4 +- tests/pytest/test_frozen_lake.py | 4 +- tests/pytest/test_hallucination.py | 4 +- tests/pytest/test_lunar_lander.py | 4 +- tests/pytest/test_markdown_highlighting.py | 4 +- ..._pytest_default_agent_rollout_processor.py | 4 +- tests/pytest/test_pytest_ensure_logging.py | 4 +- tests/pytest/test_pytest_flaky_sometimes.py | 4 +- tests/pytest/test_pytest_function_calling.py | 4 +- tests/pytest/test_pytest_ids.py | 6 +- tests/pytest/test_pytest_input_messages.py | 4 +- tests/pytest/test_pytest_json_schema.py | 4 +- tests/pytest/test_pytest_math_example.py | 4 +- .../pytest/test_pytest_math_format_length.py | 4 +- tests/pytest/test_pytest_mcp_config.py | 4 +- tests/pytest/test_pytest_mcp_url.py | 4 +- .../pytest/test_pytest_word_count_example.py | 4 +- tests/pytest/test_tau_bench_airline.py | 4 +- tests/test_retry_mechanism.py | 157 ++++++++++++ .../test_rollout_control_plane_integration.py | 28 ++- tests/test_tau_bench_airline_smoke.py | 4 +- 42 files changed, 831 insertions(+), 489 deletions(-) delete mode 100644 eval_protocol/pytest/default_no_op_rollout_process.py create mode 100644 eval_protocol/pytest/default_no_op_rollout_processor.py create mode 100644 eval_protocol/pytest/rollout_processor.py create mode 100644 tests/test_retry_mechanism.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1cf6aec..a0184b62 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,7 @@ jobs: --ignore=tests/pytest/test_frozen_lake.py \ --ignore=tests/pytest/test_lunar_lander.py \ --ignore=tests/pytest/test_tau_bench_airline.py \ + --ignore=tests/pytest/test_apps_coding.py \ --ignore=tests/test_tau_bench_airline_smoke.py \ --cov=eval_protocol --cov-append --cov-report=xml 
--cov-report=term-missing -v --durations=10 diff --git a/eval_protocol/benchmarks/suites/aime25.py b/eval_protocol/benchmarks/suites/aime25.py index 3558eaa1..92d7bedc 100644 --- a/eval_protocol/benchmarks/suites/aime25.py +++ b/eval_protocol/benchmarks/suites/aime25.py @@ -3,7 +3,7 @@ from eval_protocol.benchmarks.registry import export_benchmark from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( - default_single_turn_rollout_processor, + SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test @@ -72,7 +72,7 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=8, diff --git a/eval_protocol/benchmarks/suites/gpqa.py b/eval_protocol/benchmarks/suites/gpqa.py index 76967beb..ced8ac9f 100644 --- a/eval_protocol/benchmarks/suites/gpqa.py +++ b/eval_protocol/benchmarks/suites/gpqa.py @@ -1,3 +1,4 @@ +import asyncio import csv import io import re @@ -8,9 +9,11 @@ from eval_protocol.benchmarks.registry import export_benchmark from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( - default_single_turn_rollout_processor, + SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.rollout_processor import RolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig SYSTEM_PROMPT = ( "You are a helpful assistant. Read the question and options carefully. 
" @@ -60,19 +63,31 @@ def _strip_gt_messages(msgs: List[Message]) -> List[Message]: return [m for m in msgs if not (m.role == "system" and (m.content or "").startswith("__GT__:"))] -async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> List[EvaluationRow]: - """Preprocess rows to set ground_truth and remove __GT__ messages, then delegate to default processor.""" - processed: List[EvaluationRow] = [] - for r in rows: - gt_tokens = [m.content for m in r.messages if m.role == "system" and (m.content or "").startswith("__GT__:")] - if gt_tokens: - gt_val = gt_tokens[-1].split(":", 1)[1].strip() - r.ground_truth = gt_val - r.messages = [ - m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:")) +class GPQAStripGTRolloutProcessor(RolloutProcessor): + """Preprocess rows to set ground_truth and remove __GT__ messages, then delegate to SingleTurnRolloutProcessor.""" + + def __init__(self): + super().__init__() + self.single_turn_processor = SingleTurnRolloutProcessor() + + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Preprocess rows and delegate to SingleTurnRolloutProcessor.""" + processed: List[EvaluationRow] = [] + + for r in rows: + gt_tokens = [ + m.content for m in r.messages if m.role == "system" and (m.content or "").startswith("__GT__:") ] - processed.append(r) - return await default_single_turn_rollout_processor(processed, config) + if gt_tokens: + gt_val = gt_tokens[-1].split(":", 1)[1].strip() + r.ground_truth = gt_val + r.messages = [ + m for m in r.messages if not (m.role == "system" and (m.content or "").startswith("__GT__:")) + ] + processed.append(r) + + # Delegate to SingleTurnRolloutProcessor + return self.single_turn_processor(processed, config) @export_benchmark("gpqa") @@ -81,7 +96,7 @@ async def gpqa_strip_gt_rollout_processor(rows: List[EvaluationRow], config) -> completion_params=[ {"extra_body": 
{"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ], - rollout_processor=gpqa_strip_gt_rollout_processor, + rollout_processor=GPQAStripGTRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=8, diff --git a/eval_protocol/benchmarks/suites/livebench_data_analysis.py b/eval_protocol/benchmarks/suites/livebench_data_analysis.py index fc5abb4e..da384439 100644 --- a/eval_protocol/benchmarks/suites/livebench_data_analysis.py +++ b/eval_protocol/benchmarks/suites/livebench_data_analysis.py @@ -5,7 +5,7 @@ from eval_protocol.benchmarks.registry import export_benchmark, register_composite_benchmark from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( - default_single_turn_rollout_processor, + SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test @@ -375,7 +375,7 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]: completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _CTA_ROWS], rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=4, @@ -418,7 +418,7 @@ def livebench_cta_pointwise(row: EvaluationRow) -> EvaluationRow: completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _TABLEJOIN_ROWS], rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=4, @@ -462,7 +462,7 @@ def livebench_tablejoin_pointwise(row: EvaluationRow) -> EvaluationRow: 
completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], input_messages=[[m for m in r.messages] for r in _TABLEREFORMAT_ROWS], rollout_processor_kwargs=[{"extra_body": {"reasoning_effort": "low"}}], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=4, diff --git a/eval_protocol/benchmarks/suites/tau_bench_retail.py b/eval_protocol/benchmarks/suites/tau_bench_retail.py index 8e8aaea0..6c0a8a36 100644 --- a/eval_protocol/benchmarks/suites/tau_bench_retail.py +++ b/eval_protocol/benchmarks/suites/tau_bench_retail.py @@ -13,7 +13,7 @@ from eval_protocol.benchmarks.registry import export_benchmark from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor from vendor.tau2.data_model.message import ( AssistantMessage, SystemMessage, @@ -73,7 +73,7 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], - rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor=MCPGymRolloutProcessor(), rollout_processor_kwargs={"domain": "retail"}, num_runs=8, mode="pointwise", diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 405e72b4..b0359d79 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -35,7 +35,7 @@ class ExecutionManager: Manage rollout for MCP environments. 
""" - async def execute_rollouts( + def execute_rollouts( self, envs: "GeneralMCPVectorEnv", policy: Union["LLMBasePolicy", Callable], @@ -43,7 +43,7 @@ async def execute_rollouts( openai_format_log_file: Optional[str] = None, max_concurrent_rollouts: int = 8, evaluation_rows: Optional[List[EvaluationRow]] = None, - ) -> AsyncIterator[EvaluationRow]: + ) -> List[asyncio.Task[EvaluationRow]]: """ Execute general rollouts using tool calling interface with automatic record/playback. @@ -66,7 +66,7 @@ async def execute_rollouts( - Set and file exists: Playback mode (uses recorded data) Returns: - AsyncIterator of EvaluationRow objects with unified evaluation data format + List of asyncio.Task objects for external handling """ start_time = time.time() @@ -138,7 +138,7 @@ async def _execute_with_semaphore(idx): if trajectory.terminated: if trajectory.termination_reason == TerminationReason.ERROR: evaluation_row.rollout_status.status = "error" - evaluation_row.rollout_status.error_message = trajectory.control_plane_summary.get( + evaluation_row.rollout_status.termination_reason = trajectory.control_plane_summary.get( "error_message", None ) else: @@ -151,18 +151,7 @@ async def _execute_with_semaphore(idx): # Create all tasks tasks = [asyncio.create_task(_execute_with_semaphore(i)) for i in range(envs.n)] - - # Yield results as they complete (note that they're not necessarily in original order) - try: - for task in asyncio.as_completed(tasks): - try: - yield await task - except Exception: - logger.exception("Error processing rollout") - finally: - for t in tasks: - t.cancel() - await asyncio.gather(*tasks, return_exceptions=True) + return tasks async def _execute_rollout( self, diff --git a/eval_protocol/mcp_env.py b/eval_protocol/mcp_env.py index 5d930a4e..f5d09ba0 100644 --- a/eval_protocol/mcp_env.py +++ b/eval_protocol/mcp_env.py @@ -236,7 +236,7 @@ def make( return mcp_envs -async def rollout( +def rollout( envs: GeneralMCPVectorEnv, policy: Union[FireworksPolicy, 
LLMBasePolicy, Callable], *, @@ -246,7 +246,7 @@ async def rollout( steps: int = 512, openai_format_log_file: Optional[str] = None, max_concurrent_rollouts: int = 8, -) -> AsyncIterator[EvaluationRow]: +) -> List[asyncio.Task[EvaluationRow]]: """ Execute general rollouts using tool calling interface with automatic record/playback. @@ -274,14 +274,14 @@ async def rollout( - Set and file exists: Playback mode (uses recorded data) Returns: - List of EvaluationRow objects + List of asyncio.Task objects for external handling Example: # Live mode - evaluation_rows = await ep.rollout(envs, policy) + tasks = ep.rollout(envs, policy) # Create environments automatically - trajectories = await ep.rollout( + tasks = ep.rollout( "http://localhost:8000/mcp/", policy, evaluation_rows=my_evaluation_rows, @@ -290,10 +290,10 @@ async def rollout( # Recording mode os.environ["EP_PLAYBACK_FILE"] = "record.jsonl" - evaluation_rows = await ep.rollout(envs, policy, openai_format_log_file="sft_data.jsonl") + tasks = ep.rollout(envs, policy, openai_format_log_file="sft_data.jsonl") # Playback mode (after recording file exists) - evaluation_rows = await ep.rollout(envs, policy) + tasks = ep.rollout(envs, policy) """ # Automatically create environments if a base URL is provided if isinstance(envs, str): @@ -301,15 +301,15 @@ async def rollout( raise ValueError("Either 'evaluation_rows' or 'dataset' must be provided when envs is a URL") auto_model_id = model_id or getattr(policy, "model_id", "unknown") - envs = await make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id) + envs = make(envs, evaluation_rows=evaluation_rows, dataset=dataset, model_id=auto_model_id) # Use the new ExecutionManager for execution execution_manager = ExecutionManager() - async for evaluation_row in execution_manager.execute_rollouts( + tasks = execution_manager.execute_rollouts( envs, policy, steps, openai_format_log_file, max_concurrent_rollouts, evaluation_rows - ): - yield 
evaluation_row + ) + return tasks async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]: @@ -336,7 +336,7 @@ async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]: policy = FireworksPolicy("test-model") # Run short rollout - evaluation_rows = await rollout(envs, policy=policy, steps=10) + evaluation_rows = rollout(envs, policy=policy, steps=10) if evaluation_rows and len(evaluation_rows[0].messages) > 1: results["successful"] += 1 diff --git a/eval_protocol/pytest/__init__.py b/eval_protocol/pytest/__init__.py index 2d2576d6..171fa3dc 100644 --- a/eval_protocol/pytest/__init__.py +++ b/eval_protocol/pytest/__init__.py @@ -1,18 +1,19 @@ -from .default_agent_rollout_processor import default_agent_rollout_processor +from .default_agent_rollout_processor import AgentRolloutProcessor from .default_dataset_adapter import default_dataset_adapter -from .default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor -from .default_no_op_rollout_process import default_no_op_rollout_processor -from .default_single_turn_rollout_process import default_single_turn_rollout_processor +from .default_mcp_gym_rollout_processor import MCPGymRolloutProcessor +from .default_no_op_rollout_processor import NoOpRolloutProcessor +from .default_single_turn_rollout_process import SingleTurnRolloutProcessor from .evaluation_test import evaluation_test -from .types import RolloutProcessor, RolloutProcessorConfig +from .rollout_processor import RolloutProcessor +from .types import RolloutProcessorConfig __all__ = [ - "default_agent_rollout_processor", - "default_mcp_gym_rollout_processor", - "default_no_op_rollout_processor", - "default_single_turn_rollout_processor", - "default_dataset_adapter", + "AgentRolloutProcessor", + "MCPGymRolloutProcessor", "RolloutProcessor", + "SingleTurnRolloutProcessor", + "NoOpRolloutProcessor", + "default_dataset_adapter", "RolloutProcessorConfig", "evaluation_test", ] diff --git 
a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index 57b3ef73..f87e4c31 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -13,6 +13,7 @@ from eval_protocol.mcp.execution.policy import LiteLLMPolicy from eval_protocol.mcp.mcp_multi_client import MCPMultiClient from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import Dataset, RolloutProcessorConfig logger = logging.getLogger(__name__) @@ -122,46 +123,36 @@ def _format_tool_message_content( return [ChatCompletionContentPartTextParam(text=c.text, type="text") for c in content] -async def default_agent_rollout_processor( - rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> AsyncIterator[EvaluationRow]: - """Process agent rollouts with bounded concurrency and yield as they complete.""" +class AgentRolloutProcessor(RolloutProcessor): + """Agent rollout processor for tool-calling agents.""" - max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 - semaphore = asyncio.Semaphore(max_concurrent) + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Create agent rollout tasks and return them for external handling.""" - async def process_row(row: EvaluationRow) -> EvaluationRow: - """Process a single row with agent rollout.""" - agent = Agent( - model=config.completion_params["model"], row=row, config_path=config.mcp_config_path, logger=config.logger - ) - try: - await agent.setup() - await agent.call_agent() - return agent.evaluation_row - finally: - if agent.mcp_client: - await agent.mcp_client.cleanup() - - async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: - async with semaphore: - try: - return await process_row(r) - except Exception as e: - logger.exception(f"Error 
processing row {r.input_metadata.row_id}: {e}") - return r - - # Create all tasks - tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] + max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 + semaphore = asyncio.Semaphore(max_concurrent) - # Yield results as they complete (note that they're not necessarily in original order) - try: - for task in asyncio.as_completed(tasks): + async def process_row(row: EvaluationRow) -> EvaluationRow: + """Process a single row with agent rollout.""" + agent = Agent( + model=config.completion_params["model"], + row=row, + config_path=config.mcp_config_path, + logger=config.logger, + ) try: - yield await task - except Exception: - logger.exception("Error processing row") - finally: - for t in tasks: - t.cancel() - await asyncio.gather(*tasks, return_exceptions=True) + await agent.setup() + await agent.call_agent() + return agent.evaluation_row + finally: + if agent.mcp_client: + await agent.mcp_client.cleanup() + + async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: + async with semaphore: + result = await process_row(r) + return result + + # Create and return tasks for external handling + tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] + return tasks diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index 2b90239d..b7376e9c 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -6,10 +6,11 @@ import subprocess import time from pathlib import Path -from typing import AsyncIterator, List, Optional +from typing import List, Optional import eval_protocol as ep -from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import RolloutProcessorConfig @@ -192,53 +193,73 @@ def 
__exit__(self, exc_type, exc_val, exc_tb): return False # Don't suppress exceptions -async def default_mcp_gym_rollout_processor( - rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> AsyncIterator[EvaluationRow]: +class MCPGymRolloutProcessor(RolloutProcessor): """ Rollout processor for tau bench environments. - This processor starts an MCP server, creates tau bench environments, and runs rollouts - using the eval_protocol framework, yielding results as they complete. + This processor starts an MCP server, creates tau bench environments, and returns rollout tasks + using the eval_protocol framework with proper cleanup handling. + """ - Args: - rows: List of EvaluationRow objects containing messages and dataset info in input_metadata - config: RolloutProcessorConfig with model and other parameters + def __init__(self): + self.server = None + self.policy = None - Returns: - AsyncIterator of EvaluationRow objects with completed conversations - """ - if config.server_script_path is None: - raise ValueError("server_script_path is required for default_mcp_gym_rollout_processor") - server = MCPServerManager(config.server_script_path, port=9700, **(config.kwargs or {})) - - try: - server.start() - - policy = ep.LiteLLMPolicy( - model_id=config.completion_params.model, - temperature=config.completion_params.get("temperature", 0.0), - max_tokens=config.completion_params.get("max_tokens", 4096), - reasoning_effort=config.completion_params.get("reasoning_effort", None), - ) + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Process evaluation rows with MCP gym environments.""" + start_server = config.kwargs.get("start_server", True) if config.kwargs else True + + if start_server: + # Create fresh MCP server and environments for this run + if config.server_script_path is None: + raise ValueError("server_script_path is required for MCPGymRolloutProcessor") + + self.server = 
MCPServerManager(config.server_script_path, port=9700, **(config.kwargs or {})) + + try: + self.server.start() + + self.policy = ep.LiteLLMPolicy( + model_id=config.completion_params.get("model", None), + temperature=config.completion_params.get("temperature", 0.0), + max_tokens=config.completion_params.get("max_tokens", 4096), + reasoning_effort=config.completion_params.get("reasoning_effort", None), + ) + + except Exception as e: + if self.server: + self.server.stop() + self.server = None + self.policy = None + raise e + + else: + # Reuse existing MCP environments for retry + if not self.server or not self.policy: + raise RuntimeError( + "Cannot retry without existing server/environments. Call with start_server=True first." + ) # Create MCP environments directly from evaluation_rows envs = ep.make( "http://localhost:9700/mcp/", evaluation_rows=rows, - model_id=policy.model_id, + model_id=self.policy.model_id, ) - # Run rollout with environments and policy - async for evaluation_row in ep.rollout( + # Get rollout tasks from ep.rollout + tasks = ep.rollout( envs, - policy=policy, + policy=self.policy, evaluation_rows=rows, steps=config.steps, max_concurrent_rollouts=config.max_concurrent_rollouts, - ): - yield evaluation_row - - finally: - # Always clean up the server - server.stop() + ) + return tasks + + def cleanup(self) -> None: + """Cleanup MCP server and environments.""" + if self.server: + self.server.stop() + self.server = None + self.policy = None diff --git a/eval_protocol/pytest/default_no_op_rollout_process.py b/eval_protocol/pytest/default_no_op_rollout_process.py deleted file mode 100644 index 47cb17be..00000000 --- a/eval_protocol/pytest/default_no_op_rollout_process.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import AsyncIterator, List - -from eval_protocol.models import EvaluationRow -from eval_protocol.pytest.types import RolloutProcessorConfig - - -async def default_no_op_rollout_processor( - rows: List[EvaluationRow], config: 
RolloutProcessorConfig -) -> AsyncIterator[EvaluationRow]: - """ - Simply passes input dataset through to the test function. This can be useful - if you want to run the rollout yourself. - """ - for row in rows: - yield row diff --git a/eval_protocol/pytest/default_no_op_rollout_processor.py b/eval_protocol/pytest/default_no_op_rollout_processor.py new file mode 100644 index 00000000..973d6083 --- /dev/null +++ b/eval_protocol/pytest/default_no_op_rollout_processor.py @@ -0,0 +1,27 @@ +import asyncio +from typing import List + +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.rollout_processor import RolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig + + +class NoOpRolloutProcessor(RolloutProcessor): + """ + No-op rollout processor that passes input dataset through unchanged. + + Simply returns the input rows as completed tasks. This is useful for testing + or when you want to handle rollout processing manually. + """ + + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Process rows by returning them unchanged (no-op implementation).""" + + async def return_row(row: EvaluationRow) -> EvaluationRow: + return row + + # Create tasks that immediately return the rows (no-op) + tasks = [asyncio.create_task(return_row(row)) for row in rows] + return tasks + + # Inherits cleanup() from RolloutProcessor - no override needed diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index ef2ad48b..bf43b7da 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -2,129 +2,117 @@ import logging import os import time -from typing import AsyncIterator, List +from typing import List -import litellm from litellm import acompletion from openai.types.chat.chat_completion_message import 
ChatCompletionMessageToolCall from eval_protocol.dataset_logger import default_logger from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import RolloutProcessorConfig logger = logging.getLogger(__name__) -async def default_single_turn_rollout_processor( - rows: List[EvaluationRow], config: RolloutProcessorConfig -) -> AsyncIterator[EvaluationRow]: - """Generate a single response from any supported model provider using LiteLLM.""" - - # Quiet LiteLLM logs in test runs unless user overrode - try: - if os.environ.get("LITELLM_LOG") is None: - os.environ["LITELLM_LOG"] = "ERROR" - _llog = logging.getLogger("LiteLLM") - _llog.setLevel(logging.CRITICAL) - _llog.propagate = False - for _h in list(_llog.handlers): - _llog.removeHandler(_h) - except Exception: - pass - - # Do not modify global LiteLLM cache. Disable caching per-request instead. - - async def process_row(row: EvaluationRow) -> EvaluationRow: - """Process a single row asynchronously.""" - if len(row.messages) == 0: - raise ValueError("Messages is empty. 
Please provide a non-empty dataset") - - messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] - - request_params = {"messages": messages_payload, **config.completion_params} - # Ensure caching is disabled only for this request (review feedback) - request_params["cache"] = {"no-cache": True} - # Single-level reasoning effort: expect `reasoning_effort` only - effort_val = None - - if "reasoning_effort" in config.completion_params: - effort_val = str(config.completion_params["reasoning_effort"]) # flat shape - elif ( - isinstance(config.completion_params.get("extra_body"), dict) - and "reasoning_effort" in config.completion_params["extra_body"] - ): - # Accept if user passed it directly inside extra_body - effort_val = str(config.completion_params["extra_body"]["reasoning_effort"]) # already in extra_body - - if effort_val: - # Always under extra_body so LiteLLM forwards to provider-specific param set - request_params.setdefault("extra_body", {}) - request_params["extra_body"]["reasoning_effort"] = effort_val - # Ensure unsupported top-level keys are not present - if "reasoning_effort" in request_params: - request_params.pop("reasoning_effort", None) - - if row.tools is not None: - request_params["tools"] = row.tools - - # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet - import importlib - - _litellm = importlib.import_module("litellm") - acompletion = getattr(_litellm, "acompletion") - response = await acompletion(**request_params) - - assistant_content = response.choices[0].message.content or "" - tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None - - converted_tool_calls = None - if tool_calls: - converted_tool_calls = [ - ChatCompletionMessageToolCall( - id=tool_call.id, - type=tool_call.type, - function={ - "name": tool_call.function.name, - "arguments": tool_call.function.arguments, - }, +class SingleTurnRolloutProcessor(RolloutProcessor): + 
"""Single turn rollout processor for direct LLM calls.""" + + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Generate single turn rollout tasks and return them for external handling.""" + + # Quiet LiteLLM logs in test runs unless user overrode + try: + if os.environ.get("LITELLM_LOG") is None: + os.environ["LITELLM_LOG"] = "ERROR" + _llog = logging.getLogger("LiteLLM") + _llog.setLevel(logging.CRITICAL) + _llog.propagate = False + for _h in list(_llog.handlers): + _llog.removeHandler(_h) + except Exception: + pass + + # Do not modify global LiteLLM cache. Disable caching per-request instead. + + async def process_row(row: EvaluationRow) -> EvaluationRow: + """Process a single row asynchronously.""" + if len(row.messages) == 0: + raise ValueError("Messages is empty. Please provide a non-empty dataset") + + messages_payload = [{"role": m.role, "content": m.content} for m in row.messages] + + request_params = {"messages": messages_payload, **config.completion_params} + # Ensure caching is disabled only for this request (review feedback) + request_params["cache"] = {"no-cache": True} + # Single-level reasoning effort: expect `reasoning_effort` only + effort_val = None + + if "reasoning_effort" in config.completion_params: + effort_val = str(config.completion_params["reasoning_effort"]) # flat shape + elif ( + isinstance(config.completion_params.get("extra_body"), dict) + and "reasoning_effort" in config.completion_params["extra_body"] + ): + # Accept if user passed it directly inside extra_body + effort_val = str(config.completion_params["extra_body"]["reasoning_effort"]) # already in extra_body + + if effort_val: + # Always under extra_body so LiteLLM forwards to provider-specific param set + request_params.setdefault("extra_body", {}) + request_params["extra_body"]["reasoning_effort"] = effort_val + # Ensure unsupported top-level keys are not present + if "reasoning_effort" in request_params: 
+ request_params.pop("reasoning_effort", None) + + if row.tools is not None: + request_params["tools"] = row.tools + + # Dynamic import to avoid static dependency/lint errors if LiteLLM isn't installed yet + import importlib + + _litellm = importlib.import_module("litellm") + acompletion = getattr(_litellm, "acompletion") + response = await acompletion(**request_params) + + assistant_content = response.choices[0].message.content or "" + tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None + + converted_tool_calls = None + if tool_calls: + converted_tool_calls = [ + ChatCompletionMessageToolCall( + id=tool_call.id, + type=tool_call.type, + function={ + "name": tool_call.function.name, + "arguments": tool_call.function.arguments, + }, + ) + for tool_call in tool_calls + ] + + messages = list(row.messages) + [ + Message( + role="assistant", + content=assistant_content, + tool_calls=converted_tool_calls, ) - for tool_call in tool_calls ] - messages = list(row.messages) + [ - Message( - role="assistant", - content=assistant_content, - tool_calls=converted_tool_calls, - ) - ] - - row.messages = messages - default_logger.log(row) - return row - - # Process rows with bounded concurrency and yield as they complete - max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 - semaphore = asyncio.Semaphore(max_concurrent) - - async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: - async with semaphore: - try: - return await process_row(r) - except Exception as e: - return r - - # Create all tasks - tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] - - # Yield results as they complete (note that they're not necessarily in original order) - try: - for task in asyncio.as_completed(tasks): - try: - yield await task - except Exception: - logger.exception("Error processing row") - finally: - for t in tasks: - t.cancel() - await asyncio.gather(*tasks, return_exceptions=True) + row.messages = messages + 
default_logger.log(row) + return row + + # Process rows with bounded concurrency + max_concurrent = getattr(config, "max_concurrent_rollouts", 8) or 8 + semaphore = asyncio.Semaphore(max_concurrent) + + async def _sem_wrapper(r: EvaluationRow) -> EvaluationRow: + async with semaphore: + result = await process_row(r) + return result + + # Create and return tasks for external handling + tasks = [asyncio.create_task(_sem_wrapper(row)) for row in rows] + return tasks diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index dd7ecb04..6127c7b9 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -8,6 +8,7 @@ import re import statistics import time +from dataclasses import replace from typing import Any, Callable, Dict, List, Literal, Optional, Union import pytest @@ -24,7 +25,8 @@ Message, ) from eval_protocol.pytest.default_dataset_adapter import default_dataset_adapter -from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import ( Dataset, DatasetPathParam, @@ -32,7 +34,6 @@ EvaluationTestMode, InputMessagesParam, ModelParam, - RolloutProcessor, RolloutProcessorConfig, RolloutProcessorInputParam, TestFunction, @@ -41,8 +42,14 @@ AggregationMethod, aggregate, create_dynamically_parameterized_wrapper, + deep_update_dict, execute_function, + extract_effort_tag, + generate_parameter_combinations, log_eval_status_and_rows, + parse_ep_max_rows, + rollout_processor_with_retry, + sanitize_filename, ) from eval_protocol.stats.confidence_intervals import compute_fixed_set_mu_ci @@ -55,7 +62,7 @@ def evaluation_test( # noqa: C901 input_messages: Optional[List[InputMessagesParam]] = None, input_dataset: Optional[List[DatasetPathParam]] = None, dataset_adapter: 
Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter, - rollout_processor: RolloutProcessor = default_no_op_rollout_processor, + rollout_processor: RolloutProcessor = NoOpRolloutProcessor(), evaluation_test_kwargs: Optional[List[EvaluationInputParam]] = None, rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, aggregation_method: AggregationMethod = "mean", @@ -200,76 +207,15 @@ async def execute_with_params( return test_func(**kwargs) # Calculate all possible combinations of parameters - def _parse_ep_max_rows(default_value: int | None) -> int | None: - """Read EP_MAX_DATASET_ROWS env override as int or None.""" - raw = os.getenv("EP_MAX_DATASET_ROWS") - if raw is None: - return default_value - s = raw.strip().lower() - if s == "none": - return None - try: - return int(s) - except ValueError: - return default_value - - def _deep_update_dict(base: dict, override: dict) -> dict: - """Recursively update nested dictionaries in-place and return base.""" - for key, value in override.items(): - if isinstance(value, dict) and isinstance(base.get(key), dict): - _deep_update_dict(base[key], value) - else: - base[key] = value - return base - - def generate_combinations(): - combinations = [] - - # Handle optional parameters with defaults - # Optionally combine multiple dataset paths into one logical dataset, - # or parameterize to run one dataset per test invocation. 
- if input_dataset is not None: - if combine_datasets: - datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore - else: - # Fan out: one dataset path per parameterization - if isinstance(input_dataset, list): # type: ignore - datasets = [[p] for p in input_dataset] # type: ignore - else: - datasets = [[input_dataset]] # type: ignore - else: - datasets = [None] - cps: List[Optional[CompletionParams]] = completion_params if completion_params is not None else [None] # type: ignore - # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over - # each row. Instead, pass the entire sliced list through in a single test run - # so summaries aggregate all rows together (AIME-style behavior). - if input_messages is not None and isinstance(input_messages, list): - effective_max_rows = _parse_ep_max_rows(max_dataset_rows) - if effective_max_rows is not None: - sliced_messages = input_messages[:effective_max_rows] # type: ignore - else: - sliced_messages = input_messages # type: ignore - # Wrap as a single parameter payload - messages = [sliced_messages] # type: ignore - else: - messages = [None] # type: ignore - kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore - - # Generate all combinations - for ds in datasets: - for cp in cps: - for im in messages: - for etk in kwargs: - # if no dataset and no messages, raise an error - if ds is None and im is None: - raise ValueError( - "No dataset or messages provided. Please provide at least one of input_dataset or input_messages." - ) - combinations.append((ds, cp, im, etk)) - - return combinations - combinations = generate_combinations() + combinations = generate_parameter_combinations( + input_dataset, + completion_params, + input_messages, + evaluation_test_kwargs, + max_dataset_rows, + combine_datasets, + ) if len(combinations) == 0: raise ValueError( "No combinations of parameters were found. 
Please provide at least a model and one of input_dataset or input_messages." @@ -331,7 +277,7 @@ def _log_eval_error( else: data_jsonl = load_jsonl(ds_arg) # Apply env override for max rows if present - effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + effective_max_rows = parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) @@ -367,7 +313,7 @@ def _log_eval_error( if _env_override: override_obj = _json.loads(_env_override) if isinstance(override_obj, dict): - completion_params = _deep_update_dict(dict(completion_params), override_obj) + completion_params = deep_update_dict(dict(completion_params), override_obj) except Exception: pass @@ -410,6 +356,8 @@ def _log_eval_error( kwargs=rollout_processor_kwargs or {}, ) + max_retry = int(os.getenv("EP_MAX_RETRY", "0")) + for i in range(num_runs): # Regenerate outputs each run by deep-copying the pristine dataset # so model responses are not reused across runs. 
@@ -428,8 +376,6 @@ def _log_eval_error( for row in fresh_dataset: active_logger.log(row) - rollout_result = rollout_processor(fresh_dataset, config) - if mode == "pointwise": # Pointwise mode, rollouts will return as they complete so we can pipeline evaluation_test execution semaphore = asyncio.Semaphore(max_concurrent_rollouts) @@ -437,6 +383,8 @@ def _log_eval_error( async def _execute_with_semaphore(row): async with semaphore: + # NOTE: we will still evaluate errored rows (give users control over this) + # i.e., they can choose to give EvaluateResult.score = 0 for errored rows in their test_func result = await execute_with_params( test_func, processed_row=row, @@ -448,7 +396,10 @@ async def _execute_with_semaphore(row): ) return result - async for row in rollout_processor(fresh_dataset, config): + # Use wrapper that handles retry logic internally + async for row in rollout_processor_with_retry( + rollout_processor, fresh_dataset, config, max_retry + ): tasks.append(asyncio.create_task(_execute_with_semaphore(row))) all_results[i] = await asyncio.gather(*tasks) @@ -456,9 +407,12 @@ async def _execute_with_semaphore(row): else: # Batch mode: collect all results first, then evaluate (no pipelining) input_dataset = [] - async for row in rollout_result: + async for row in rollout_processor_with_retry( + rollout_processor, fresh_dataset, config, max_retry + ): input_dataset.append(row) - + # NOTE: we will still evaluate errored rows (give users control over this) + # i.e., they can choose to give EvaluateResult.score = 0 for errored rows in their test_func results = await execute_with_params( test_func, processed_dataset=input_dataset, @@ -517,11 +471,10 @@ async def _execute_with_semaphore(row): passed = success_passed and std_passed - # Update eval metadata status and passed field for all results + # Update eval metadata passed field for all results for result in all_results: for r in result: if r.eval_metadata is not None: - r.eval_metadata.status = "finished" 
r.eval_metadata.passed = passed active_logger.log(r) @@ -530,7 +483,7 @@ async def _execute_with_semaphore(row): should_print = os.getenv("EP_PRINT_SUMMARY") == "1" summary_path = os.getenv("EP_SUMMARY_JSON") suite_name = test_func.__name__ - model_used = config.completion_params.model + model_used = config.completion_params["model"] total_rows = len([item for sublist in all_results for item in sublist]) summary_obj = { "suite": suite_name, @@ -587,35 +540,9 @@ async def _execute_with_semaphore(row): ) # As per project convention, avoid printing per-metric CI lines to reduce noise if summary_path: - - def _sanitize_filename(text: str) -> str: - safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) - return safe[:120] - - def _extract_effort_tag(params: dict) -> str | None: - try: - if not isinstance(params, dict): - return None - # Common locations - if "extra_body" in params and isinstance(params["extra_body"], dict): - eb = params["extra_body"] - if isinstance(eb.get("reasoning"), dict) and "effort" in eb["reasoning"]: - return str(eb["reasoning"]["effort"]).lower() - if "reasoning_effort" in eb: - return str(eb["reasoning_effort"]).lower() - if ( - "reasoning" in params - and isinstance(params["reasoning"], dict) - and "effort" in params["reasoning"] - ): - return str(params["reasoning"]["effort"]).lower() - except Exception: - return None - return None - - model_slug = _sanitize_filename(model_used) - effort_tag = _extract_effort_tag(completion_params) or "" - effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" + model_slug = sanitize_filename(model_used) + effort_tag = extract_effort_tag(completion_params) or "" + effort_suffix = f"__effort-{sanitize_filename(effort_tag)}" if effort_tag else "" base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" p = pathlib.Path(summary_path) @@ -633,7 +560,7 @@ def _extract_effort_tag(params: dict) -> str | None: parent.mkdir(parents=True, exist_ok=True) # If we 
detected an effort tag, fan out to separate files; otherwise write to the exact file if effort_tag: - out_file = parent / f"{p.stem}__{_sanitize_filename(effort_tag)}{p.suffix}" + out_file = parent / f"{p.stem}__{sanitize_filename(effort_tag)}{p.suffix}" else: out_file = p @@ -822,7 +749,7 @@ def run_evaluation_test_direct( input_dataset: Optional[List[DatasetPathParam]] = None, dataset_adapter: Callable[[List[Dict[str, Any]]], Dataset] = default_dataset_adapter, completion_params: Optional[CompletionParams] = None, - rollout_processor: RolloutProcessor = default_no_op_rollout_processor, + rollout_processor: RolloutProcessor = NoOpRolloutProcessor(), rollout_processor_kwargs: Optional[RolloutProcessorInputParam] = None, aggregation_method: AggregationMethod = "mean", passed_threshold: Optional[Union[EvaluationThreshold, float]] = None, @@ -844,26 +771,6 @@ def run_evaluation_test_direct( if passed_threshold is not None and not isinstance(passed_threshold, EvaluationThreshold): passed_threshold = EvaluationThreshold(success=passed_threshold) - def _parse_ep_max_rows(default_value: int | None) -> int | None: - raw = os.getenv("EP_MAX_DATASET_ROWS") - if raw is None: - return default_value - s = raw.strip().lower() - if s == "none": - return None - try: - return int(s) - except ValueError: - return default_value - - def _deep_update_dict(base: dict, override: dict) -> dict: - for key, value in override.items(): - if isinstance(value, dict) and isinstance(base.get(key), dict): - _deep_update_dict(base[key], value) - else: - base[key] = value - return base - # Build dataset/messages data: List[EvaluationRow] = [] if input_dataset is not None: @@ -871,12 +778,12 @@ def _deep_update_dict(base: dict, override: dict) -> dict: data_jsonl: List[Dict[str, Any]] = [] for p in input_dataset: data_jsonl.extend(load_jsonl(p)) - effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + effective_max_rows = parse_ep_max_rows(max_dataset_rows) if effective_max_rows is not None: 
data_jsonl = data_jsonl[:effective_max_rows] data = dataset_adapter(data_jsonl) elif input_messages is not None: - effective_max_rows = _parse_ep_max_rows(max_dataset_rows) + effective_max_rows = parse_ep_max_rows(max_dataset_rows) msgs = input_messages if effective_max_rows is not None and isinstance(msgs, list): msgs = msgs[:effective_max_rows] # type: ignore @@ -896,7 +803,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: if _env_override: override_obj = _json.loads(_env_override) if isinstance(override_obj, dict): - completion_params = _deep_update_dict(dict(completion_params), override_obj) + completion_params = deep_update_dict(dict(completion_params), override_obj) except Exception: pass @@ -990,7 +897,7 @@ def _deep_update_dict(base: dict, override: dict) -> dict: total_rows = len(all_results) summary_obj = { "suite": suite_name, - "model": config.completion_params.model, + "model": config.completion_params["model"], "agg_score": float(agg_score) if agg_score is not None else None, "num_runs": num_runs, "rows": total_rows, @@ -1001,45 +908,20 @@ def _deep_update_dict(base: dict, override: dict) -> dict: if should_print: if ci_low is not None and ci_high is not None: print( - f"EP Summary | suite={suite_name} model={config.completion_params.model} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" + f"EP Summary | suite={suite_name} model={config.completion_params['model']} agg={summary_obj['agg_score']:.3f} ci95=[{ci_low:.3f},{ci_high:.3f}] runs={num_runs} rows={total_rows}" ) else: print( - f"EP Summary | suite={suite_name} model={config.completion_params.model} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" + f"EP Summary | suite={suite_name} model={config.completion_params['model']} agg={summary_obj['agg_score']:.3f} runs={num_runs} rows={total_rows}" ) if summary_path: import json as _json import pathlib as _pathlib - import re as _re import time as _time - def 
_sanitize_filename(text: str) -> str: - safe = _re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) - return safe[:120] - - def _extract_effort_tag(params: dict) -> str | None: - try: - if not isinstance(params, dict): - return None - if "extra_body" in params and isinstance(params["extra_body"], dict): - eb = params["extra_body"] - if isinstance(eb.get("reasoning"), dict) and "effort" in eb["reasoning"]: - return str(eb["reasoning"]["effort"]).lower() - if "reasoning_effort" in eb: - return str(eb["reasoning_effort"]).lower() - if ( - "reasoning" in params - and isinstance(params["reasoning"], dict) - and "effort" in params["reasoning"] - ): - return str(params["reasoning"]["effort"]).lower() - except Exception: - return None - return None - - model_slug = _sanitize_filename(config.completion_params.model) - effort_tag = _extract_effort_tag(completion_params) or "" - effort_suffix = f"__effort-{_sanitize_filename(effort_tag)}" if effort_tag else "" + model_slug = sanitize_filename(config.completion_params["model"]) + effort_tag = extract_effort_tag(completion_params) or "" + effort_suffix = f"__effort-{sanitize_filename(effort_tag)}" if effort_tag else "" base_name = f"{suite_name}__{model_slug}{effort_suffix}__{mode}__runs{num_runs}.json" p = _pathlib.Path(summary_path) @@ -1052,7 +934,7 @@ def _extract_effort_tag(params: dict) -> str | None: parent = p.parent parent.mkdir(parents=True, exist_ok=True) if effort_tag: - out_file = parent / f"{p.stem}__{_sanitize_filename(effort_tag)}{p.suffix}" + out_file = parent / f"{p.stem}__{sanitize_filename(effort_tag)}{p.suffix}" else: out_file = p with open(out_file, "w", encoding="utf-8") as f: diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 3a5ec0e2..4522caef 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -59,6 +59,23 @@ def pytest_addoption(parser) -> None: "Values: low|medium|high" ), ) + group.addoption( + "--ep-max-retry", + action="store", + type=int, 
+ default=None, + help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."), + ) + group.addoption( + "--ep-fail-on-permanent-failure", + action="store", + default=None, + choices=["true", "false"], + help=( + "Whether to fail the entire rollout when permanent failures occur after max retries. " + "Default: true (fail on permanent failures). Set to 'false' to continue with remaining rollouts." + ), + ) def _normalize_max_rows(val: Optional[str]) -> Optional[str]: @@ -100,6 +117,14 @@ def pytest_configure(config) -> None: if summary_json_path: os.environ["EP_SUMMARY_JSON"] = summary_json_path + max_retry = config.getoption("--ep-max-retry") + if max_retry is not None: + os.environ["EP_MAX_RETRY"] = str(max_retry) + + fail_on_permanent_failure = config.getoption("--ep-fail-on-permanent-failure") + if fail_on_permanent_failure is not None: + os.environ["EP_FAIL_ON_PERMANENT_FAILURE"] = fail_on_permanent_failure + # Allow ad-hoc overrides of input params via CLI flags try: import json as _json diff --git a/eval_protocol/pytest/rollout_processor.py b/eval_protocol/pytest/rollout_processor.py new file mode 100644 index 00000000..824dd015 --- /dev/null +++ b/eval_protocol/pytest/rollout_processor.py @@ -0,0 +1,21 @@ +import asyncio +from abc import ABC, abstractmethod +from typing import List + +from eval_protocol.models import EvaluationRow +from eval_protocol.pytest.types import RolloutProcessorConfig + + +class RolloutProcessor(ABC): + """ + Abstract base class for all rollout processor strategies. + """ + + @abstractmethod + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + """Process evaluation rows and return async tasks. Must be implemented by subclasses.""" + pass + + def cleanup(self) -> None: + """Cleanup resources. 
Override in subclasses if cleanup is needed.""" + pass diff --git a/eval_protocol/pytest/types.py b/eval_protocol/pytest/types.py index 1a80254b..8a3be489 100644 --- a/eval_protocol/pytest/types.py +++ b/eval_protocol/pytest/types.py @@ -2,8 +2,9 @@ Parameter types """ +import asyncio from dataclasses import dataclass, field -from typing import Any, AsyncIterator, Callable, Dict, List, Literal, Optional +from typing import Any, Callable, Dict, List, Literal, Optional from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger @@ -49,6 +50,3 @@ class RolloutProcessorConfig: steps: int = 30 # max number of rollout steps logger: DatasetLogger = default_logger # logger to use during rollout for mid-rollout logs kwargs: Dict[str, Any] = field(default_factory=dict) # any additional kwargs to pass to the rollout processor - - -RolloutProcessor = Callable[[List[EvaluationRow], RolloutProcessorConfig], AsyncIterator[EvaluationRow]] diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 23a5722d..24b60028 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -1,9 +1,20 @@ import asyncio import inspect -from typing import Any, Callable, List, Literal, Optional +import os +import re +from dataclasses import replace +from typing import Any, Callable, Dict, List, Literal, Optional, Union from eval_protocol.dataset_logger.dataset_logger import DatasetLogger from eval_protocol.models import EvalMetadata, EvaluationRow +from eval_protocol.pytest.rollout_processor import RolloutProcessor +from eval_protocol.pytest.types import ( + CompletionParams, + DatasetPathParam, + EvaluationInputParam, + InputMessagesParam, + RolloutProcessorConfig, +) def execute_function(func: Callable, **kwargs) -> Any: @@ -124,3 +135,223 @@ def log_eval_status_and_rows( if r.eval_metadata is not None: r.eval_metadata.status = status logger.log(r) + + +def parse_ep_max_rows(default_value: 
Optional[int]) -> Optional[int]: + """Read EP_MAX_DATASET_ROWS env override as int or None.""" + raw = os.getenv("EP_MAX_DATASET_ROWS") + if raw is None: + return default_value + s = raw.strip().lower() + if s == "none": + return None + try: + return int(s) + except ValueError: + return default_value + + +def deep_update_dict(base: dict, override: dict) -> dict: + """Recursively update nested dictionaries in-place and return base.""" + for key, value in override.items(): + if isinstance(value, dict) and isinstance(base.get(key), dict): + deep_update_dict(base[key], value) + else: + base[key] = value + return base + + +def generate_parameter_combinations( + input_dataset: Optional[List[DatasetPathParam]], + completion_params: List[CompletionParams], + input_messages: Optional[List[InputMessagesParam]], + evaluation_test_kwargs: Optional[List[EvaluationInputParam]], + max_dataset_rows: Optional[int], + combine_datasets: bool, +) -> List[tuple]: + """ + Generate all combinations of parameters for pytest parameterization. + + Args: + input_dataset: Dataset paths to use + completion_params: Completion parameters to test + input_messages: Input messages to use + evaluation_test_kwargs: Additional kwargs for evaluation tests + max_dataset_rows: Maximum number of dataset rows to process + combine_datasets: Whether to combine multiple datasets into one test + + Returns: + List of parameter tuples for pytest.mark.parametrize + """ + combinations = [] + + # Handle optional parameters with defaults + # Optionally combine multiple dataset paths into one logical dataset, + # or parameterize to run one dataset per test invocation. 
+ if input_dataset is not None: + if combine_datasets: + datasets: List[Optional[List[DatasetPathParam]]] = [input_dataset] # type: ignore + else: + # Fan out: one dataset path per parameterization + if isinstance(input_dataset, list): # type: ignore + datasets = [[p] for p in input_dataset] # type: ignore + else: + datasets = [[input_dataset]] # type: ignore + else: + datasets = [None] + + cps: List[Optional[CompletionParams]] = completion_params if completion_params is not None else [None] # type: ignore + + # Apply EP_MAX_DATASET_ROWS to input_messages, but do NOT parameterize over + # each row. Instead, pass the entire sliced list through in a single test run + # so summaries aggregate all rows together (AIME-style behavior). + if input_messages is not None and isinstance(input_messages, list): + effective_max_rows = parse_ep_max_rows(max_dataset_rows) + if effective_max_rows is not None: + sliced_messages = input_messages[:effective_max_rows] # type: ignore + else: + sliced_messages = input_messages # type: ignore + # Wrap as a single parameter payload + messages = [sliced_messages] # type: ignore + else: + messages = [None] # type: ignore + + kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore + + # Generate all combinations + for ds in datasets: + for cp in cps: + for im in messages: + for etk in kwargs: + # if no dataset and no messages, raise an error + if ds is None and im is None: + raise ValueError( + "No dataset or messages provided. Please provide at least one of input_dataset or input_messages." + ) + combinations.append((ds, cp, im, etk)) + + return combinations + + +async def rollout_processor_with_retry( + rollout_processor: RolloutProcessor, + fresh_dataset: List[EvaluationRow], + config: RolloutProcessorConfig, + max_retry: int, +): + """ + Wrapper around rollout_processor that handles retry logic internally. 
+ Uses async queue pattern to yield results immediately as they become available. + Yields both successful and failed results, leaving it up to the user to handle them in test_func. + """ + + try: + queue = asyncio.Queue() + retry_counts = {r.execution_metadata.rollout_id: 0 for r in fresh_dataset} + failed_permanently = [] + + async def retry_handler(failed_row: EvaluationRow): + rollout_id = failed_row.execution_metadata.rollout_id + current_attempts = retry_counts.get(rollout_id, 0) + + if current_attempts >= max_retry: + assert ( + failed_row.rollout_status and failed_row.rollout_status.status == "error" + ), f"Rollout {failed_row.execution_metadata.rollout_id} did not fail with error status" + failed_permanently.append(failed_row) + await queue.put(failed_row) # put failed row on queue + return + + retry_counts[rollout_id] = current_attempts + 1 + + # add kwargs start_server=False to config so we don't start new MCP server + retry_config = replace(config, kwargs={**(config.kwargs or {}), "start_server": False}) + + retry_tasks = rollout_processor([failed_row], retry_config) + + try: + retry_result = await retry_tasks[0] + retry_result.rollout_status.status = "finished" + await queue.put(retry_result) + except Exception as e: + failed_row.rollout_status.status = "error" + failed_row.rollout_status.termination_reason = str(e) + asyncio.create_task(retry_handler(failed_row)) # retry failed, spawn another retry + + async def initial_processor(): + """Process initial batch and spawn retries for failures""" + base_tasks = rollout_processor(fresh_dataset, config) + pending = set(base_tasks) + + while pending: + done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED) + + for task in done: + task_index = base_tasks.index(task) + + try: + result = await task + result.rollout_status.status = "finished" + await queue.put(result) + except Exception as e: + failed_row = fresh_dataset[task_index] + failed_row.rollout_status.status = "error" + 
failed_row.rollout_status.termination_reason = str(e) + asyncio.create_task(retry_handler(failed_row)) # rollout errored, spawn retry task + + processor_task = asyncio.create_task(initial_processor()) + + # yield results as they become available + completed_count = 0 + total_expected = len(fresh_dataset) + + while completed_count < total_expected: + finished_row = await queue.get() + + # only permanent failure rows are put on the queue, so we can check for them here + if finished_row.rollout_status and finished_row.rollout_status.status == "error": + if os.getenv("EP_FAIL_ON_PERMANENT_FAILURE", "true") != "false": + raise RuntimeError( + f"Rollout {finished_row.execution_metadata.rollout_id} failed after {max_retry} retries. Errors: {finished_row.rollout_status.termination_reason}" + ) + + completed_count += 1 + yield finished_row + + await processor_task # explicitly wait for task completion and catch any exceptions + + finally: + rollout_processor.cleanup() + + +def sanitize_filename(text: str) -> str: + """Sanitize text for use in filenames by replacing special characters with dashes.""" + safe = re.sub(r"[^A-Za-z0-9._-]+", "-", text.strip()) + return safe[:120] + + +def extract_effort_tag(params: dict) -> Optional[str]: + """ + Extract effort tag from completion parameters for use in file naming. 
+ + Args: + params: Completion parameters dictionary + + Returns: + Effort tag string if found, None otherwise + """ + try: + if not isinstance(params, dict): + return None + # Common locations + if "extra_body" in params and isinstance(params["extra_body"], dict): + eb = params["extra_body"] + if isinstance(eb.get("reasoning"), dict) and "effort" in eb["reasoning"]: + return str(eb["reasoning"]["effort"]).lower() + if "reasoning_effort" in eb: + return str(eb["reasoning_effort"]).lower() + if "reasoning" in params and isinstance(params["reasoning"], dict) and "effort" in params["reasoning"]: + return str(params["reasoning"]["effort"]).lower() + except Exception: + return None + return None diff --git a/examples/gpqa/tests/test_gpqa.py b/examples/gpqa/tests/test_gpqa.py index dcbf7b53..d67e64a1 100644 --- a/examples/gpqa/tests/test_gpqa.py +++ b/examples/gpqa/tests/test_gpqa.py @@ -7,7 +7,7 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from eval_protocol.pytest.default_single_turn_rollout_process import ( - default_single_turn_rollout_processor, + SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test @@ -66,7 +66,7 @@ def _load_gpqa_messages_from_csv() -> List[List[Message]]: completion_params=[ {"extra_body": {"reasoning_effort": "low"}, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ], # default to low effort; override via CLI plugin - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=8, diff --git a/examples/healthbench/tests/test_evaluation.py b/examples/healthbench/tests/test_evaluation.py index a40c5d96..e0c7917b 100644 --- a/examples/healthbench/tests/test_evaluation.py +++ b/examples/healthbench/tests/test_evaluation.py @@ -3,7 +3,7 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult from 
eval_protocol.pytest.default_single_turn_rollout_process import ( - default_single_turn_rollout_processor, + SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test @@ -51,7 +51,7 @@ completion_params=[ {"temperature": 0.2, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), aggregation_method="mean", passed_threshold=None, num_runs=1, diff --git a/tests/pytest/test_apps_coding.py b/tests/pytest/test_apps_coding.py index 7cb976ac..9350a381 100644 --- a/tests/pytest/test_apps_coding.py +++ b/tests/pytest/test_apps_coding.py @@ -9,7 +9,7 @@ from typing import Any, Dict, List from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.apps_coding_reward import evaluate_apps_solution @@ -30,7 +30,7 @@ def apps_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} ], passed_threshold=0.33, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), num_runs=1, mode="pointwise", ) diff --git a/tests/pytest/test_basic_coding.py b/tests/pytest/test_basic_coding.py index 2b1c2a4a..4945d378 100644 --- a/tests/pytest/test_basic_coding.py +++ b/tests/pytest/test_basic_coding.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.code_execution import 
execute_python_code, extract_code_blocks @@ -32,7 +32,7 @@ def coding_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluat {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} ], passed_threshold=0.8, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), num_runs=1, mode="pointwise", ) diff --git a/tests/pytest/test_frozen_lake.py b/tests/pytest/test_frozen_lake.py index bea42bed..24e32b56 100644 --- a/tests/pytest/test_frozen_lake.py +++ b/tests/pytest/test_frozen_lake.py @@ -9,7 +9,7 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -41,7 +41,7 @@ def frozen_lake_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluation completion_params=[ {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} ], - rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor=MCPGymRolloutProcessor(), passed_threshold=0.66, num_runs=1, max_concurrent_rollouts=3, diff --git a/tests/pytest/test_hallucination.py b/tests/pytest/test_hallucination.py index b29fb53c..fe8f32f0 100644 --- a/tests/pytest/test_hallucination.py +++ b/tests/pytest/test_hallucination.py @@ -12,7 +12,7 @@ import litellm from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test # Configure the judge model for LiteLLM JUDGE_MODEL = 
"fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" @@ -35,7 +35,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation completion_params=[ {"temperature": 0.0, "max_tokens": 512, "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"} ], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=0.33, num_runs=1, mode="pointwise", diff --git a/tests/pytest/test_lunar_lander.py b/tests/pytest/test_lunar_lander.py index 3fddac62..00f966a5 100644 --- a/tests/pytest/test_lunar_lander.py +++ b/tests/pytest/test_lunar_lander.py @@ -9,7 +9,7 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -39,7 +39,7 @@ def lunar_lander_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evaluatio input_dataset=["tests/pytest/data/lunar_lander_dataset.jsonl"], dataset_adapter=lunar_lander_to_evaluation_row, completion_params=[{"temperature": 0.0, "max_tokens": 4096, "model": "gpt-4.1"}], - rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor=MCPGymRolloutProcessor(), passed_threshold=0.0, num_runs=1, mode="pointwise", diff --git a/tests/pytest/test_markdown_highlighting.py b/tests/pytest/test_markdown_highlighting.py index 9c70721f..c393ee60 100644 --- a/tests/pytest/test_markdown_highlighting.py +++ b/tests/pytest/test_markdown_highlighting.py @@ -8,7 +8,7 @@ from typing import Any, Dict, List from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from 
eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -32,7 +32,7 @@ def markdown_dataset_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu {"temperature": 0.0, "max_tokens": 4096, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"} ], passed_threshold=0.5, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), num_runs=1, mode="pointwise", ) diff --git a/tests/pytest/test_pytest_default_agent_rollout_processor.py b/tests/pytest/test_pytest_default_agent_rollout_processor.py index 8320ec8a..bfabe35c 100644 --- a/tests/pytest/test_pytest_default_agent_rollout_processor.py +++ b/tests/pytest/test_pytest_default_agent_rollout_processor.py @@ -2,7 +2,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message -from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test +from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test @evaluation_test( @@ -16,7 +16,7 @@ ) ] ], - rollout_processor=default_agent_rollout_processor, + rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], ) def test_pytest_default_agent_rollout_processor(rows: List[EvaluationRow]) -> List[EvaluationRow]: diff --git a/tests/pytest/test_pytest_ensure_logging.py b/tests/pytest/test_pytest_ensure_logging.py index c9884756..e57b3c8c 100644 --- a/tests/pytest/test_pytest_ensure_logging.py +++ b/tests/pytest/test_pytest_ensure_logging.py @@ -18,7 +18,7 @@ async def test_ensure_logging(monkeypatch): "eval_protocol.dataset_logger.sqlite_dataset_logger_adapter.SqliteEvaluationRowStore", return_value=mock_store ): from eval_protocol.models import EvaluationRow - from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor + from 
eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor from eval_protocol.pytest.evaluation_test import evaluation_test from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row @@ -28,7 +28,7 @@ async def test_ensure_logging(monkeypatch): ], completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], dataset_adapter=markdown_dataset_to_evaluation_row, - rollout_processor=default_no_op_rollout_processor, + rollout_processor=NoOpRolloutProcessor(), mode="pointwise", combine_datasets=False, num_runs=2, diff --git a/tests/pytest/test_pytest_flaky_sometimes.py b/tests/pytest/test_pytest_flaky_sometimes.py index 65e1e63d..bde5e34c 100644 --- a/tests/pytest/test_pytest_flaky_sometimes.py +++ b/tests/pytest/test_pytest_flaky_sometimes.py @@ -5,7 +5,7 @@ import pytest from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import default_no_op_rollout_processor, evaluation_test +from eval_protocol.pytest import NoOpRolloutProcessor, evaluation_test # skip in CI since it will intentionally fail. 
This is useful for local generation of logs @@ -13,7 +13,7 @@ @evaluation_test( input_messages=[[Message(role="user", content="Return HEADS or TAILS at random.")]], completion_params=[{"model": "dummy/local-model"}], - rollout_processor=default_no_op_rollout_processor, + rollout_processor=NoOpRolloutProcessor(), mode="pointwise", num_runs=5, ) diff --git a/tests/pytest/test_pytest_function_calling.py b/tests/pytest/test_pytest_function_calling.py index 63488dbe..60f38b0d 100644 --- a/tests/pytest/test_pytest_function_calling.py +++ b/tests/pytest/test_pytest_function_calling.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List from eval_protocol.models import EvaluationRow -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.function_calling import exact_tool_match_reward @@ -23,7 +23,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", dataset_adapter=function_calling_to_evaluation_row, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), ) async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow: """Run pointwise evaluation on sample dataset using pytest interface.""" diff --git a/tests/pytest/test_pytest_ids.py b/tests/pytest/test_pytest_ids.py index 045d2a19..b6bb4a35 100644 --- a/tests/pytest/test_pytest_ids.py +++ b/tests/pytest/test_pytest_ids.py @@ -3,7 +3,7 @@ import eval_protocol.dataset_logger as dataset_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger from eval_protocol.models import EvaluationRow -from eval_protocol.pytest.default_no_op_rollout_process import default_no_op_rollout_processor +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor 
from tests.pytest.test_markdown_highlighting import markdown_dataset_to_evaluation_row @@ -30,7 +30,7 @@ async def test_evaluation_test_decorator(monkeypatch): ], completion_params=[{"temperature": 0.0, "model": "dummy/local-model"}], dataset_adapter=markdown_dataset_to_evaluation_row, - rollout_processor=default_no_op_rollout_processor, + rollout_processor=NoOpRolloutProcessor(), mode="pointwise", combine_datasets=False, num_runs=2, @@ -71,7 +71,7 @@ async def test_evaluation_test_decorator_ids_single(monkeypatch): {"temperature": 1.0, "model": "dummy/local-model"}, ], dataset_adapter=markdown_dataset_to_evaluation_row, - rollout_processor=default_no_op_rollout_processor, + rollout_processor=NoOpRolloutProcessor(), mode="pointwise", combine_datasets=False, num_runs=5, diff --git a/tests/pytest/test_pytest_input_messages.py b/tests/pytest/test_pytest_input_messages.py index dc460aa5..7b4f8d9e 100644 --- a/tests/pytest/test_pytest_input_messages.py +++ b/tests/pytest/test_pytest_input_messages.py @@ -1,7 +1,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test @evaluation_test( @@ -11,7 +11,7 @@ ] ], completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), ) def test_input_messages_in_decorator(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Run math evaluation on sample dataset using pytest interface.""" diff --git a/tests/pytest/test_pytest_json_schema.py b/tests/pytest/test_pytest_json_schema.py index 158874f1..c5a20c5d 100644 --- a/tests/pytest/test_pytest_json_schema.py +++ b/tests/pytest/test_pytest_json_schema.py @@ -2,7 +2,7 @@ from typing import Any, Dict, List from eval_protocol.models import EvaluationRow -from 
eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.json_schema import json_schema_reward @@ -26,7 +26,7 @@ def json_schema_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evaluation input_dataset=["tests/pytest/data/json_schema.jsonl"], completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], mode="pointwise", - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), dataset_adapter=json_schema_to_evaluation_row, ) async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow: diff --git a/tests/pytest/test_pytest_math_example.py b/tests/pytest/test_pytest_math_example.py index 23010797..55c525be 100644 --- a/tests/pytest/test_pytest_math_example.py +++ b/tests/pytest/test_pytest_math_example.py @@ -1,5 +1,5 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.math import math_reward from examples.math_example.main import check_think_answer_format from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row @@ -11,7 +11,7 @@ completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), mode="pointwise", evaluation_test_kwargs=[ {"math_reward_kwargs": {"tolerance": 0.001, "absolute_tolerance": 1e-8, "require_units": False}} diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py index 5bba5c0e..3da732a0 100644 --- 
a/tests/pytest/test_pytest_math_format_length.py +++ b/tests/pytest/test_pytest_math_format_length.py @@ -1,7 +1,7 @@ import math from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from eval_protocol.rewards.length import count_tokens from eval_protocol.rewards.math import math_reward from examples.math_with_format_and_length.main import check_think_answer_format @@ -14,7 +14,7 @@ completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.0, - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), mode="pointwise", evaluation_test_kwargs=[ { diff --git a/tests/pytest/test_pytest_mcp_config.py b/tests/pytest/test_pytest_mcp_config.py index dde15aa9..c578d07c 100644 --- a/tests/pytest/test_pytest_mcp_config.py +++ b/tests/pytest/test_pytest_mcp_config.py @@ -2,7 +2,7 @@ from typing import List from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from eval_protocol.pytest import default_agent_rollout_processor, evaluation_test +from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test @evaluation_test( @@ -19,7 +19,7 @@ ) ] ], - rollout_processor=default_agent_rollout_processor, + rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/mock_discord_mcp_config.json", diff --git a/tests/pytest/test_pytest_mcp_url.py b/tests/pytest/test_pytest_mcp_url.py index 01c06c45..ce265da5 100644 --- a/tests/pytest/test_pytest_mcp_url.py +++ b/tests/pytest/test_pytest_mcp_url.py @@ -1,5 +1,5 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, Message -from 
eval_protocol.pytest import default_agent_rollout_processor, evaluation_test +from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test @evaluation_test( @@ -18,7 +18,7 @@ ), ] ], - rollout_processor=default_agent_rollout_processor, + rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", mcp_config_path="tests/pytest/mcp_configurations/docs_mcp_config.json", diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py index 339c5152..72c9bc2f 100644 --- a/tests/pytest/test_pytest_word_count_example.py +++ b/tests/pytest/test_pytest_word_count_example.py @@ -1,7 +1,7 @@ from haikus import haikus from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult -from eval_protocol.pytest import default_single_turn_rollout_processor, evaluation_test +from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row @@ -11,7 +11,7 @@ completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_dataset_rows=5, passed_threshold=0.3, # Reasonable threshold for word count evaluation - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), mode="pointwise", # Use pointwise mode for elegant row-by-row evaluation ) def test_word_count_evaluate(row: EvaluationRow) -> EvaluationRow: diff --git a/tests/pytest/test_tau_bench_airline.py b/tests/pytest/test_tau_bench_airline.py index 0eeba626..f3a7c65f 100644 --- a/tests/pytest/test_tau_bench_airline.py +++ b/tests/pytest/test_tau_bench_airline.py @@ -12,7 +12,7 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_mcp_gym_rollout_processor import 
default_mcp_gym_rollout_processor +from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor from vendor.tau2.data_model.message import ( AssistantMessage, SystemMessage, @@ -72,7 +72,7 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], - rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor=MCPGymRolloutProcessor(), passed_threshold={"success": 0.4, "standard_deviation": 0.1}, num_runs=8, mode="pointwise", diff --git a/tests/test_retry_mechanism.py b/tests/test_retry_mechanism.py new file mode 100644 index 00000000..a483f0e1 --- /dev/null +++ b/tests/test_retry_mechanism.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Simple test to verify the retry mechanism works with evaluation_test. +""" + +import asyncio +import os +from collections import Counter +from typing import List +from unittest.mock import Mock + +import pytest + +from eval_protocol.models import EvaluateResult, EvaluationRow, Message, RolloutStatus +from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.rollout_processor import RolloutProcessor +from eval_protocol.pytest.types import RolloutProcessorConfig + +os.environ["EP_MAX_RETRY"] = "2" # Allow up to 2 retries + + +class MockRolloutProcessorWithRetries(RolloutProcessor): + """Mock rollout processor that fails second task alphabetically on first attempt, succeeds on retry""" + + def __init__(self): + self.mock_tracker = Mock() + + def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> List[asyncio.Task[EvaluationRow]]: + # Track this batch call + self.mock_tracker.batch_call(len(rows)) + + row_setup = { + 0: {"delay": 0.01, "should_fail": False}, + 1: {"delay": 0.01, "should_fail": True}, # Will be adjusted based on attempt number + 2: {"delay": 0.01, "should_fail": False}, + 3: {"delay": 0.01, "should_fail": False}, + 4: 
{"delay": 0.01, "should_fail": False}, + } + + async def process_single_row( + row: EvaluationRow, delay: float, base_should_fail: bool = False + ) -> EvaluationRow: + rollout_id = row.execution_metadata.rollout_id + + # Track individual row processing call + self.mock_tracker.process_row_call(rollout_id) + + # Determine attempt number by counting previous calls for this rollout_id + previous_calls = [ + call for call in self.mock_tracker.process_row_call.call_args_list if call[0][0] == rollout_id + ] + attempt_number = len(previous_calls) + + # Determine if this specific attempt should fail + # Row 1 fails on first attempt (attempt_number == 1), succeeds on retry (attempt_number == 2) + should_fail = base_should_fail and attempt_number == 1 + + print(f"🔄 ATTEMPTING rollout_id={rollout_id}, attempt={attempt_number}, will_fail={should_fail}") + + await asyncio.sleep(delay) + print(f"🎉 FINISHED {'error' if should_fail else 'finished'}: {row.execution_metadata.rollout_id}") + + if should_fail: + raise Exception("Simulated failure for testing") + + return row + + # Create and return tasks (let evaluation_test handle them) + tasks = [ + asyncio.create_task(process_single_row(row, row_setup[i]["delay"], row_setup[i]["should_fail"])) + for i, row in enumerate(rows) + ] + + return tasks + + +# Create a shared processor instance for testing +shared_processor = MockRolloutProcessorWithRetries() + + +@evaluation_test( + completion_params=[{"model": "gpt-4o-mini", "temperature": 0}], + input_messages=[ + [Message(role="user", content="Task A")], + [Message(role="user", content="Task B")], + [Message(role="user", content="Task C")], + [Message(role="user", content="Task D")], + [Message(role="user", content="Task E")], + ], + rollout_processor=shared_processor, + num_runs=1, + mode="pointwise", +) +def test_retry_mechanism(row: EvaluationRow) -> EvaluationRow: + """MOCK TEST: Tests that retry mechanism works - one task fails on first attempt, succeeds on retry.""" + print( + 
f"📊 EVALUATED: {row.execution_metadata.rollout_id} ({'SUCCESS' if row.rollout_status.status == 'finished' else 'FAILURE'})" + ) + + # Assign a score based on success/failure + score = 1.0 if row.rollout_status.status == "finished" else 0.0 + row.evaluation_result = EvaluateResult(score=score) + + return row + + +def test_retry_mechanism_mock_verification(): + """Test that verifies the retry mechanism worked by checking the mock calls""" + # Get our mock tracker + mock_tracker = shared_processor.mock_tracker + + print(f"\n🔄 MOCK CALL ANALYSIS:") + print(f" Batch calls made: {mock_tracker.batch_call.call_count}") + print(f" Total row processing calls: {mock_tracker.process_row_call.call_count}") + + if mock_tracker.process_row_call.call_count == 0: + print("⚠️ No calls recorded yet. The evaluation test may not have run or completed.") + return + + # Get all rollout_ids that were processed + call_args = mock_tracker.process_row_call.call_args_list + rollout_ids = [call[0][0] for call in call_args] + + # Count calls per rollout_id + call_counts = Counter(rollout_ids) + + print(f" Call counts per rollout_id: {dict(call_counts)}") + print(f" Individual calls:") + for i, call_arg in enumerate(call_args, 1): + rollout_id = call_arg[0][0] + attempt_num = rollout_ids[:i].count(rollout_id) + print(f" {i}. 
rollout_id={rollout_id}, attempt={attempt_num}") + + # ASSERTIONS USING MOCK DATA + # Should have exactly 6 total row processing calls (5 initial + 1 retry) + assert ( + mock_tracker.process_row_call.call_count == 6 + ), f"Expected 6 total calls, got {mock_tracker.process_row_call.call_count}" + + # Should have exactly 2 batch calls (initial batch + retry batch) + assert mock_tracker.batch_call.call_count == 2, f"Expected 2 batch calls, got {mock_tracker.batch_call.call_count}" + + # First batch should have 5 rows, second batch should have 1 row (the retry) + batch_call_args = mock_tracker.batch_call.call_args_list + assert batch_call_args[0][0][0] == 5, f"Expected first batch to have 5 rows, got {batch_call_args[0][0][0]}" + assert batch_call_args[1][0][0] == 1, f"Expected second batch to have 1 row, got {batch_call_args[1][0][0]}" + + # Exactly one rollout_id should be called twice, others called once + call_count_values = list(call_counts.values()) + assert ( + call_count_values.count(2) == 1 + ), f"Expected exactly 1 rollout_id to be called twice, got counts: {dict(call_counts)}" + assert ( + call_count_values.count(1) == 4 + ), f"Expected exactly 4 rollout_ids to be called once, got counts: {dict(call_counts)}" + + print("✅ All mock-based assertions passed! 
Retry mechanism is working correctly.") diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py index 1b92d5aa..8d176780 100644 --- a/tests/test_rollout_control_plane_integration.py +++ b/tests/test_rollout_control_plane_integration.py @@ -239,8 +239,10 @@ def mock_step_side_effect(env_index, tool_call): policy = MockPolicy(["right", "down", "right"]) # Execute rollout + tasks = self.execution_manager.execute_rollouts(mock_env, policy, steps=10) evaluation_rows = [] - async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=10): + for task in tasks: + row = await task evaluation_rows.append(row) # Validate results @@ -459,8 +461,10 @@ async def test_rollout_handles_control_plane_failure_gracefully(self): # Execute rollout with control plane failure policy = MockPolicy(["right"]) + tasks = self.execution_manager.execute_rollouts(mock_env, policy, steps=1) evaluation_rows = [] - async for row in self.execution_manager.execute_rollouts(mock_env, policy, steps=1): + for task in tasks: + row = await task evaluation_rows.append(row) # Should still work, but without control plane info @@ -497,7 +501,7 @@ async def test_rollout_creates_envs_from_url(self): policy = MockPolicy(["right"]) with ( - patch("eval_protocol.mcp_env.make", new_callable=AsyncMock) as mock_make, + patch("eval_protocol.mcp_env.make") as mock_make, patch("eval_protocol.mcp_env.ExecutionManager") as MockManager, ): mock_env = MagicMock() @@ -505,24 +509,30 @@ async def test_rollout_creates_envs_from_url(self): manager_instance = MockManager.return_value - # Mock execute_rollouts to return an async generator and track calls + # Mock execute_rollouts to return tasks and track calls call_args = [] - async def mock_execute_rollouts(*args, **kwargs): + async def mock_task(): + return "ok" + + def mock_execute_rollouts(*args, **kwargs): call_args.append((args, kwargs)) - for item in ["ok"]: - yield item + import asyncio + + return 
[asyncio.create_task(mock_task())] manager_instance.execute_rollouts = mock_execute_rollouts result = [] - async for row in ep.rollout( + tasks = ep.rollout( "http://localhost:1234/mcp/", policy, dataset=dataset, model_id="test_model", steps=5, - ): + ) + for task in tasks: + row = await task result.append(row) mock_make.assert_called_once_with( diff --git a/tests/test_tau_bench_airline_smoke.py b/tests/test_tau_bench_airline_smoke.py index 200f7ca8..044447b7 100644 --- a/tests/test_tau_bench_airline_smoke.py +++ b/tests/test_tau_bench_airline_smoke.py @@ -13,7 +13,7 @@ from eval_protocol.models import CompletionParams, EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_mcp_gym_rollout_processor import default_mcp_gym_rollout_processor +from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor from vendor.tau2.data_model.message import ( AssistantMessage, SystemMessage, @@ -72,7 +72,7 @@ def tau_bench_airline_smoke_to_evaluation_row(data: List[Dict[str, Any]]) -> Lis "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", } ], - rollout_processor=default_mcp_gym_rollout_processor, + rollout_processor=MCPGymRolloutProcessor(), passed_threshold=0.36, num_runs=1, # Smoke test: single run for quick feedback mode="pointwise", From 030e886f183a6b0ff3a6caba8d500a4c5689c3cd Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Thu, 14 Aug 2025 22:37:03 +0000 Subject: [PATCH 19/26] ignore svg bench test --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a0184b62..d759e812 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -94,6 +94,7 @@ jobs: --ignore=tests/pytest/test_tau_bench_airline.py \ --ignore=tests/pytest/test_apps_coding.py \ --ignore=tests/test_tau_bench_airline_smoke.py \ + --ignore=tests/pytest/test_svgbench.py \ --cov=eval_protocol 
--cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 - name: Store coverage file From 58d840995e6ca925da6fa17dc48b0b0d9ad9d2e8 Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Fri, 15 Aug 2025 00:15:02 -0700 Subject: [PATCH 20/26] livesvgbench + metadata fix (#83) * livesvgbench + metadata fix * bugs in retry processor --- .github/workflows/ci.yml | 1 + eval_protocol/pytest/evaluation_test.py | 1 + eval_protocol/pytest/plugin.py | 14 +- eval_protocol/pytest/utils.py | 10 +- tests/pytest/data/svgbench_dataset.jsonl | 210 +++---- .../pytest/data/svgbench_sample_dataset.jsonl | 6 +- tests/pytest/test_livesvgbench.py | 580 ++++++++++++++++++ tests/pytest/test_svgbench.py | 25 +- tests/test_retry_mechanism.py | 6 +- 9 files changed, 720 insertions(+), 133 deletions(-) create mode 100644 tests/pytest/test_livesvgbench.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d759e812..08aaf406 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -95,6 +95,7 @@ jobs: --ignore=tests/pytest/test_apps_coding.py \ --ignore=tests/test_tau_bench_airline_smoke.py \ --ignore=tests/pytest/test_svgbench.py \ + --ignore=tests/pytest/test_livesvgbench.py \ --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 - name: Store coverage file diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 6127c7b9..38f66d54 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -475,6 +475,7 @@ async def _execute_with_semaphore(row): for result in all_results: for r in result: if r.eval_metadata is not None: + r.eval_metadata.status = "finished" r.eval_metadata.passed = passed active_logger.log(r) diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index 4522caef..81b36420 100644 --- a/eval_protocol/pytest/plugin.py +++ 
b/eval_protocol/pytest/plugin.py @@ -63,13 +63,13 @@ def pytest_addoption(parser) -> None: "--ep-max-retry", action="store", type=int, - default=None, + default=0, help=("Failed rollouts (with rollout_status.status == 'error') will be retried up to this many times."), ) group.addoption( - "--ep-fail-on-permanent-failure", + "--ep-fail-on-max-retry", action="store", - default=None, + default="true", choices=["true", "false"], help=( "Whether to fail the entire rollout when permanent failures occur after max retries. " @@ -118,12 +118,10 @@ def pytest_configure(config) -> None: os.environ["EP_SUMMARY_JSON"] = summary_json_path max_retry = config.getoption("--ep-max-retry") - if max_retry is not None: - os.environ["EP_MAX_RETRY"] = str(max_retry) + os.environ["EP_MAX_RETRY"] = str(max_retry) - fail_on_permanent_failure = config.getoption("--ep-fail-on-permanent-failure") - if fail_on_permanent_failure is not None: - os.environ["EP_FAIL_ON_PERMANENT_FAILURE"] = fail_on_permanent_failure + fail_on_max_retry = config.getoption("--ep-fail-on-max-retry") + os.environ["EP_FAIL_ON_MAX_RETRY"] = fail_on_max_retry # Allow ad-hoc overrides of input params via CLI flags try: diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 24b60028..186f7c7c 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -280,7 +280,13 @@ async def retry_handler(failed_row: EvaluationRow): async def initial_processor(): """Process initial batch and spawn retries for failures""" - base_tasks = rollout_processor(fresh_dataset, config) + # catch any task creation errors and raise them immediately, i.e. 
port already in use + try: + base_tasks = rollout_processor(fresh_dataset, config) + except Exception as e: + print(f"❌ Rollout processor failed to initialize: {e}") + raise e + pending = set(base_tasks) while pending: @@ -310,7 +316,7 @@ async def initial_processor(): # only permanent failure rows are put on the queue, so we can check for them here if finished_row.rollout_status and finished_row.rollout_status.status == "error": - if os.getenv("EP_FAIL_ON_PERMANENT_FAILURE", "true") != "false": + if max_retry > 0 and os.getenv("EP_FAIL_ON_MAX_RETRY", "true") != "false": raise RuntimeError( f"Rollout {finished_row.execution_metadata.rollout_id} failed after {max_retry} retries. Errors: {finished_row.rollout_status.termination_reason}" ) diff --git a/tests/pytest/data/svgbench_dataset.jsonl b/tests/pytest/data/svgbench_dataset.jsonl index 5f2d0fc4..fdb72623 100644 --- a/tests/pytest/data/svgbench_dataset.jsonl +++ b/tests/pytest/data/svgbench_dataset.jsonl @@ -1,105 +1,105 @@ -{"requirements": ["Create a cow with clearly recognizable bovine features, including a body, head, four legs, tail, and udder.", "The cow must have black and white patches for its coloring.", "Add cow ears, eyes, and snout for facial recognition.", "Position the cow in a realistic plowing stance, leaning forward as if pulling.", "The cow's hooves must be colored brown, as if covered in soil from the field.", "Include a traditional wooden plow with a visible metal blade/share.", "Depict a wooden yoke across the cow's shoulders, connected to the plow by visible chains.", "The plow's blade must be partially buried in the soil, actively turning over a chunk of earth.", "Show at least three distinct, dark furrows in the soil trailing directly behind the plow.", "The field must be split into a plowed section and an unplowed section, with the cow and plow positioned at the boundary between them.", "The unplowed section of the field must have short green grass, which is visibly being overturned by the 
plow.", "Add a simple background with a clear horizon line and a blue sky containing a yellow sun."], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"} -{"requirements": ["The overall background of the SVG must be white.", "All primary elements (logo, search bar, buttons) must be horizontally centered on the canvas.", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red).", "Place a prominent search bar directly below the Google logo, with a vertical spacing equal to half the height of the logo.", "The search bar must be a rounded rectangle with a light gray border.", "The search bar must contain a gray magnifying glass icon perfectly aligned to the left side, inside the bar.", "The search bar must contain a gray microphone icon perfectly aligned to the right side, inside the bar.", "Place two distinct buttons below the search bar, horizontally centered with the search bar, and with a small, consistent gap between them.", "The left button must be labeled 'Google Search'.", "The right button must be labeled 'I'm Feeling Lucky'.", "Both buttons must have a light gray background, a thin gray border, and dark gray text.", "Create a header section at the top right of the canvas, with all its items vertically aligned with each other.", "The header must include text links for 'Gmail' and 'Images'.", "The header must include a 3x3 grid icon (Google Apps launcher) positioned between the 'Images' link and the 'Sign in' button.", "The header must include a prominent 'Sign in' button with a blue background and white text, positioned at the far right of the header."], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com).", "id": "write_`svg`_code"} -{"requirements": ["Create an elliptical shape for the top surface of a round dinner table with a dark wood grain texture.", "Include exactly 4 sets of cutlery arranged around the table.", 
"Each cutlery set must consist of a recognizable fork, knife, and spoon.", "Position the 4 cutlery sets at distinct place settings (at 12, 3, 6, and 9 o'clock positions).", "Include a round dinner plate at each of the 4 place settings.", "The fork of each cutlery set must be placed to the left of its corresponding plate, and the knife and spoon to the right.", "Place exactly 3 main food dishes in the center of the table.", "First dish: A recognizable roasted turkey, golden-brown in color, showing a plump body, with one drumstick clearly carved off and missing.", "The turkey must be presented on its own large serving platter.", "Second dish: A round pizza with visible crust and toppings, cut into slices, with one slice missing from the pizza.", "The missing slice of pizza must be placed on the dinner plate at the 3 o'clock position.", "The missing turkey drumstick must be placed on the dinner plate at the 9 o'clock position.", "Third dish: A serving of at least two tacos with visible folded shells and fillings, presented in a red taco holder.", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap.", "The overall perspective must be slightly isometric."], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"} -{"requirements": ["Create a central, cylindrical rocket body colored bright blue.", "Add a pointed, red nose cone attached to the top of the rocket body.", "Include exactly three yellow stabilizer fins, symmetrically attached to the base of the rocket body.", "Incorporate a single circular window on the rocket's body.", "Add two red horizontal stripes on the blue rocket body, one positioned above the window and one below it.", "Apply a clean, cartoonish art style with bold black outlines for all rocket parts.", "Include a visible engine nozzle at the bottom of the rocket, between the fins.", 
"Position the rocket as if it is launching, with its base just above the ground.", "Add a column of stylized orange and yellow flames emerging from the nozzle, which the rocket is standing on."], "prompt": "Write `svg` code for an image of a toy rocket.", "id": "write_`svg`_code"} -{"requirements": ["Create a classic rubber ducky shape with a distinct body and head, colored bright yellow.", "The duck must have an orange beak and a simple black dot for an eye.", "Draw a white, claw-foot bathtub shape, showing the inside view with a visible rim.", "Fill the lower portion of the bathtub with light blue water.", "Ensure the water line is clearly visible across the duck's body, showing the lower third of the duck submerged.", "Position the duck so it is floating on the water's surface, creating small, concentric circular ripples in the water around its base.", "Depict soap bubbles as clusters of overlapping circles with a slight iridescence, using semi-transparent white, light pink, and light blue fills.", "Place a small cluster of bubbles on top of the duck's head.", "Place a large pile of bubbles against one side of the tub and a few floating on the water's surface around the duck."], "prompt": "Write `svg` code for an image of a rubber ducky floating on top of a soapy bathtub.", "id": "write_`svg`_code"} -{"requirements": ["Create a scene set on top of a solid-looking cloudscape that serves as the ground.", "Include a hot air balloon with a large envelope featuring vertical red and white stripes and a brown wicker basket.", "The balloon's basket must be resting firmly on the surface of a large, flat-topped cloud.", "Show ropes connecting the balloon's envelope to the basket.", "Include four human figures styled as a family: two adults and two children.", "Position one adult figure holding a corner of a red and white checkered picnic blanket, while one child figure holds the opposite corner, as if they are spreading it out together on the cloud.", "Place an open 
picnic basket on a corner of the blanket that is already spread out.", "A thermos, a bunch of grapes, and a sandwich must be visible emerging from the open picnic basket.", "Position the second adult and second child near the landed hot air balloon, with the adult pointing up at the balloon's envelope.", "The background must be a clear blue sky containing a bright yellow sun and two small, distant clouds."], "prompt": "Write `svg` code for an image of a picnic on top of the clouds, where 2 parents and 2 children have landed with a hot air ballon, and are setting up a picnic with a tarp and food items.", "id": "write_`svg`_code"} -{"requirements": ["Create a red, sporty car with a body, wheels, and windows.", "Draw a distinct circular hoop that is completely surrounded by jagged, irregular flames colored with reds, oranges, and yellows.", "Include a take-off ramp on the left side of the fiery ring and a landing ramp on the right side.", "Position the car in mid-air, with its front half having passed through the ring and its back half still inside the ring.", "The car's rear wheels must be depicted as just having left the edge of the take-off ramp.", "Add a ground surface below the entire jump setup.", "Incorporate gray speed lines trailing behind the car to convey high speed.", "Add orange sparks where the car's tires last touched the take-off ramp."], "prompt": "Write `svg` code for an image of a stunt car jumping through a circle of fire.", "id": "write_`svg`_code"} -{"requirements": ["Create a recognizable grey dolphin with a streamlined body, dorsal fin, and tail fluke.", "Show a water surface below the dolphin with a large splash effect at the point where the dolphin has exited the water.", "Position the dolphin in a dynamic jumping arc, with its entire body in mid-air.", "Draw a circular, multi-colored hula hoop, and position the dolphin so its mid-section is passing through the center of the hoop.", "Include a human trainer's arm and hand extending into the 
frame.", "The trainer's hand must be holding a small fish by the tail.", "The dolphin's mouth must be depicted as wide open, just about to bite the body of the fish being held by the trainer."], "prompt": "Write `svg` code for an image of a dolphin jumping out of the water and through a hula hoop to bite a fish out of its trainers hand.", "id": "write_`svg`_code"} -{"requirements": ["Create a standard red wine glass shape with a wide bowl, a slender stem, and a circular base.", "The glass must appear transparent, rendered with a light grey tint and low opacity.", "Add bright white highlights on the rim and along the curved side of the bowl to simulate glass reflection.", "Fill the glass with a deep burgundy colored wine.", "The wine must fill the glass to exactly the halfway point of the bowl's height.", "The top surface of the wine must be a flat ellipse, indicating a level liquid surface.", "The body of the wine must perfectly conform to the curved shape of the inside of the glass bowl.", "A single drop of red wine must be shown running down the outside of the glass bowl, starting from the rim and ending just above the stem."], "prompt": "Write `svg` code for an image of half full glass of red wine.", "id": "write_`svg`_code"} -{"requirements": ["Create a full-screen background using a modern, abstract macOS wallpaper.", "Add a horizontal, semi-transparent menu bar at the top edge of the screen.", "Place an Apple logo icon in the top-left corner of the menu bar, followed by the menu text 'Finder', 'File', 'Edit', and 'View'.", "Add Wi-Fi, battery, and date/time icons to the right side of the menu bar.", "Design a glass-like Dock with rounded corners at the bottom of the screen, hovering slightly above the bottom edge.", "Populate the Dock with icons for Finder, Safari, Mail, and System Settings, with a Trash icon at the far right of the Dock.", "Draw a Finder window as the main foreground element, positioned over the desktop wallpaper.", "The Finder window must 
have a title bar containing the three 'traffic light' control buttons (red, yellow, green) in the top-left corner.", "The main content area of the Finder window must display several generic folder icons.", "One of the folder icons in the Finder window must be identical to the Finder icon in the Dock.", "Apply a prominent drop shadow to the entire Finder window to make it appear floating above the desktop wallpaper and the Dock."], "prompt": "Write `svg` code for a screenshot of the macOS desktop, with a finder window on top.", "id": "write_`svg`_code"} -{"requirements": ["Whip must be depicted in a coiled, spiral arrangement on a flat surface.", "Include a distinct, solid brown handle (the stock) with a visible wood grain texture.", "The handle must feature a silver knob or pommel at its base.", "The handle must include a leather wrist loop (keeper) hanging from the pommel.", "The main flexible part of the whip (the thong) must be attached to the handle and made of braided black leather.", "The thong must show a clear taper, starting thicker at the handle and getting progressively thinner towards the tip.", "The coils of the thong must overlap realistically, with the handle and the thickest part of the thong on top of the outer coils.", "Include a 'fall,' which is a thinner, smooth leather piece attached to the end of the main braided thong.", "Show a frayed white 'cracker' or 'popper' at the very tip of the fall.", "Add subtle shading and highlights to the coils and handle to give the whip a three-dimensional appearance and a slight leather sheen."], "prompt": "Write `svg` code for an image of a coiled whip.", "id": "write_`svg`_code"} -{"requirements": ["Create a first-person perspective, as if looking through the player's eyes.", "Include a recognizable CS:GO AK-47 weapon model held by player hands in the bottom-right of the screen.", "The player's hands must be wearing the default Terrorist team gloves.", "Place a green plus-sign crosshair in the exact center 
of the screen.", "Display a Heads-Up Display (HUD) with game information using a font style that mimics the actual CS:GO interface.", "In the bottom-left of the HUD, show player health as '100' next to a plus icon and armor as '100' next to a shield icon.", "In the bottom-center of the HUD, show the ammunition count as '30 / 90'.", "In the top-left, include a square radar/minimap with a player indicator arrow in the middle.", "In the top-center, display the round timer as '1:45'.", "Above the timer, show the team scores with the Terrorist icon and a score of '5' on the left, and the Counter-Terrorist icon and a score of '3' on the right.", "The background must depict the 'A-long' area from the map Dust II, with the crosshair aimed at the double doors."], "prompt": "Write `svg` code for an screen of a first person view in CS:GO.", "id": "write_`svg`_code"} -{"requirements": ["Create a main structure for a wooden fruit stall, including a counter.", "Add a red and white striped canopy over the stall, supported by two vertical wooden posts.", "Display exactly four different, recognizable fruits.", "First fruit: A pile of red apples in a wicker basket on the left side of the counter.", "Second fruit: A bunch of yellow bananas placed next to the apples.", "Third fruit: A pile of oranges in another wicker basket on the right side of the counter.", "Fourth fruit: A single, large slice of watermelon resting directly on the counter in the center.", "Include a character representing the stall vendor, a smiling man with a mustache, positioned behind the counter and between the baskets.", "Add a small, hanging chalkboard sign from the canopy that reads 'Fresh Fruit'.", "Depict a cobblestone ground surface in front of the stall.", "Include the silhouette of another market stall in the background to suggest a larger market setting."], "prompt": "Write `svg` code for an image of a fruit stall in the market.", "id": "write_`svg`_code"} -{"requirements": ["Create a wooden barrel 
with visible vertical planks and two horizontal metal hoops.", "The barrel must be buried in a mound of sand, so only the top half is visible.", "Show the sand mounded up slightly around the visible base of the barrel.", "The barrel must be open at the top and filled with treasure items.", "Show treasure overflowing from the top and spilling down one side of the barrel onto the sand.", "The treasure must include a large pile of shiny gold coins, both inside and outside the barrel.", "Add a variety of colorful gemstones (red rubies, green emeralds, blue sapphires) mixed in with the coins.", "A string of white pearls must be draped over the edge of the barrel, trailing down into the spilled coins.", "A golden goblet must be visible, partially buried in the coins inside the barrel.", "Use bright highlights and glint effects on the coins, gems, and goblet to make them look shiny."], "prompt": "Write `svg` code for an image of half buried barrel of treasure.", "id": "write_`svg`_code"} -{"requirements": ["Create a vintage-style rotary telephone as the main subject, colored classic black.", "The telephone must have a main body, a handset, and a curly cord connecting them.", "The rotary dial must have ten visible finger holes with the numbers 0-9 arranged in a circle beneath them.", "Place the telephone on a small, dark wooden table with a visible wood grain texture.", "The table must have four visible, tapered legs.", "Position the phone realistically on the tabletop, with the handset resting in its cradle.", "The curly cord must hang down from the handset and connect to the main body of the phone.", "Incorporate a distinct shadow cast by the telephone onto the surface of the table.", "Add another shadow on the floor cast by the table itself to create depth."], "prompt": "Write `svg` code for an image of a vintage rotary telephone on a small wooden table.", "id": "write_`svg`_code"} -{"requirements": ["Draw a knight in a full suit of silver armor with a metallic sheen 
created using highlights and gradients.", "The knight must be holding a longsword in one hand and a shield in the other.", "The shield must have a coat of arms, such as a lion, depicted on it.", "Depict the knight in a dynamic pose, with the shield raised to block and the sword ready to strike.", "Draw a large, menacing green dragon with scales, large wings, and a spiky tail.", "The dragon must be shown actively breathing a large plume of fire directly towards the knight.", "The fire effect must be colored with bright reds, oranges, and yellows.", "The knight's shield must be positioned to intercept the fire, with the flames shown splashing against it.", "The scene must be set in a dark, rocky cavern, with the dragon's fire being the primary light source.", "The fire must cast a bright orange light on the front of the knight and the cavern walls, creating long, dark shadows behind them."], "prompt": "Write `svg` code for an image of a knight in shining armor fighting a fire-breathing dragon.", "id": "write_`svg`_code"} -{"requirements": ["Replicate the user interface of the Slack application using the recognizable Slack color scheme (purple sidebar, white main view).", "Include a left sidebar with a list of channels, with the channel '#design-team' highlighted to show it is active.", "The main view must show the message history for the '#design-team' channel.", "Display exactly three distinct messages from different fictional users.", "The first message must be from 'Alice' with a user avatar, a timestamp, and the text 'Here is the latest mockup. What do you think?'.", "The second message, below the first, must be from 'Bob' with a different avatar, a later timestamp, and the text 'Looks great! I love the new color palette.'.", "The third message, below Bob's, must be from 'Charlie' with a third avatar, a later timestamp, and the text 'Agreed! Ship it!'.", "Add a thumbs-up emoji reaction from two users on Alice's message.", "Show the 'user is typing...' 
indicator below the last message, with 'David is typing...' visible.", "Include the message input box at the bottom of the channel view, with placeholder text inside it."], "prompt": "Write `svg` code for a screenshot of a Slack channel with several messages, reactions, and a user typing.", "id": "write_`svg`_code"} -{"requirements": ["Draw a mound of dirt on a green grass surface to represent the top of an ant hill.", "Create a cutaway view showing the underground cross-section of the hill, featuring a network of tunnels and chambers.", "Depict three distinct types of chambers connected by tunnels.", "The top chamber must be a food storage area, filled with small green leaf fragments and seeds.", "The middle chamber must be a nursery, containing white ant eggs and larvae.", "The bottom chamber must be the queen's chamber, containing a single, large queen ant.", "Populate the tunnels and chambers with numerous small, black ants.", "Show some ants carrying leaf fragments from the entrance to the food storage chamber.", "Show other ants in the nursery tending to the eggs.", "The queen ant must be significantly larger than the other ants and shown laying an egg.", "Use a dark brown color for the packed earth of the chamber walls and a lighter brown for the loose soil inside the tunnels."], "prompt": "Write `svg` code for an image of a cross-section of an ant hill, showing tunnels and chambers with ants.", "id": "write_`svg`_code"} -{"requirements": ["Draw a school bus from a flat, side-on perspective, colored 'school bus yellow'.", "Include the long, rectangular body of the bus with a series of five evenly spaced passenger windows.", "Draw two visible wheels with black tires and silver hubcaps.", "Incorporate the characteristic black horizontal stripes running the length of the bus.", "Include the text 'SCHOOL BUS' in black, capital letters on the side panel between the black stripes.", "Show the driver's door and window at the front of the bus, with a silhouette of a 
person visible in the driver's seat.", "Add a red, octagonal stop sign attached to the side, fully extended outwards from the bus.", "Include side mirrors at the front and visible red lights at the front and back of the bus body."], "prompt": "Write `svg` code for an image of a classic yellow school bus from a side-on view.", "id": "write_`svg`_code"} -{"requirements": ["Create a triangular slice of pie on a white plate.", "The pie crust must be a golden-brown color with a texture suggesting it is flaky.", "Design a lattice-style top crust with interwoven strips of pastry, allowing the filling to be seen.", "The pie filling, visible through the lattice and on the cut side, must be red with small, dark red circles to represent cherries.", "Place a scoop of off-white vanilla ice cream directly on top of the pie slice, near the back corner.", "The ice cream scoop must have a slightly irregular, melting shape, with a small puddle forming at its base on the pie.", "The ice cream must have tiny dark specks to indicate vanilla bean.", "A silver fork must be resting on the plate next to the pie slice, with a small piece of cherry filling on its tines."], "prompt": "Write `svg` code for an image of a slice of cherry pie with a lattice crust and a scoop of vanilla ice cream next to it.", "id": "write_`svg`_code"} -{"requirements": ["Design a robot with a 'friendly' appearance, characterized by rounded shapes and large, circular optic sensors.", "The robot should have a polished chrome metallic texture, with highlights and shadows that give it a 3D feel and reflect the bar's lighting.", "Position the robot behind a sleek, minimalist bar counter.", "One of the robot's arms is actively pouring a vibrant, glowing green liquid from a cocktail shaker into a futuristic-looking glass held steady by its other hand.", "The bar setting must look futuristic, with glowing blue neon light strips running along the edges of the counter and the background shelves.", "Include shelves in the 
background holding uniquely shaped, futuristic bottles, one of which is half-empty and contains the same glowing green liquid as the drink being poured.", "The sleek bar counter must have a reflective surface, showing a partial, distorted reflection of the robot and the neon lights.", "Compose the scene from the perspective of a customer at the bar, with a non-human, metallic hand visible in the foreground, resting on the counter and reaching towards the drink being prepared.", "The robot and the drink preparation process should be the central focus of the image."], "prompt": "Write `svg` code for an image of a friendly robot serving drinks at a futuristic bar.", "id": "write_`svg`_code"} -{"requirements": ["Place a large, glowing Sun at the center of the diagram, emitting visible rays of light.", "Include all eight planets of the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune, arranged in the correct order from the Sun.", "The side of each planet facing the Sun must be brightly lit, while the opposite side is in shadow, demonstrating the Sun as the primary light source.", "Draw distinct elliptical paths to represent the orbit of each planet around the Sun.", "Represent the relative size differences between the planets accurately (e.g., Jupiter largest, Earth much smaller than Uranus, etc.).", "Each planet must have its key visual characteristic: Earth's continents, clouds, and its Moon orbiting it; Mars's red color and polar ice cap; Jupiter's Great Red Spot; and Saturn's prominent rings.", "Include a text label for the Sun and for each of the eight planets, connected to its corresponding celestial body with a thin, faint line.", "Draw the asteroid belt as a dense field of small rocks in a shared orbit between Mars and Jupiter.", "Use a dark background to represent outer space, populated with small, distant stars, and include a single comet with a visible tail that points away from the Sun."], "prompt": "Write `svg` code for an 
image of a detailed diagram of the solar system with all the planets orbiting the sun.", "id": "write_`svg`_code"} -{"requirements": ["Draw a beaver with recognizable features: brown fur, large front teeth, and a flat, paddle-shaped tail, positioned halfway on a dam it is building.", "The beaver should be holding a muddy stick in its paws, actively placing it onto a section of the dam.", "Construct a dam across a river, made of realistically interlocked sticks, branches, and mud.", "Illustrate a clear difference in the water level: the water on the upstream side is high, calm, and deep, forming a pond, while the water on the downstream side is low and shallow, revealing rocks on the riverbed.", "The environment must include a riverbank with a tree that has been partially gnawed through at its base, with a pile of wood chips around it. The stick the beaver is holding must match the wood of this tree.", "Include the beaver's lodge, a large mound-like home made of sticks and mud, on the edge of the pond created by the dam.", "The water in the newly formed pond should reflect the sky and the trees on the bank.", "The water flowing over a low point in the dam should be depicted with ripples and lines to indicate movement."], "prompt": "Write `svg` code for an image of a beaver building a dam in a river.", "id": "write_`svg`_code"} -{"requirements": ["Replicate the user interface of the Visual Studio Code editor using the Dark+ color theme.", "Include the Activity Bar on the far left with the 'Files' icon in an active state.", "Show the Side Bar with a file explorer tree, where a file named `bot_controller.py` is highlighted as active.", "The main editor pane must display the contents of this `bot_controller.py` file, containing a block of Python code.", "The Python code must have correct syntax highlighting for keywords (purple), strings (orange), comments (green), and function names (yellow). 
The code must contain a function with a descriptive comment above it.", "Display line numbers in the gutter to the left of the code.", "Include a blinking text cursor positioned on line 15, column 8, within the body of the function.", "Show editor tabs at the top, with the tab for `bot_controller.py` visually active and matching the highlighted file in the explorer.", "Include the Status Bar at the bottom, showing information that corresponds to the editor's state: the language mode ('Python'), the line and column number of the cursor ('Ln 15, Col 8'), and the active Python interpreter."], "prompt": "Write `svg` code for a screenshot of a VS Code editor with a colorful syntax-highlighted block of Python code.", "id": "write_`svg`_code"} -{"requirements": ["Draw a tall, upright grandfather clock in a room setting, placed next to a window.", "The clock case, made of dark mahogany with a visible wood grain texture, must consist of a hood with a decorative finial on top, a long trunk, and a base.", "The clock face within the hood must have Roman numerals, with the hour hand pointing directly at 'III' and the minute hand pointing at 'XII' to show the time is 3:00.", "Light from the window must cast a long shadow from the clock onto the floor, consistent with an afternoon sun.", "Show a swinging brass pendulum inside the trunk's glass panel, depicted at the far right of its arc to imply motion.", "Include three hanging brass weights on chains inside the trunk; the rightmost weight must be positioned slightly higher than the other two, as if it has just chimed the hour.", "Use shading and highlights to give the wooden case and metallic pendulum and weights a three-dimensional appearance."], "prompt": "Write `svg` code for an image of a grandfather clock with a swinging pendulum.", "id": "write_`svg`_code"} -{"requirements": ["Create a scene with a distinct 8-bit, pixelated art style and a limited, bright color palette.", "Design a main character in a side-view, mid-jump, 
with their head positioned directly beneath a floating 'question mark' block, as if about to hit it.", "A single, pixelated coin must be depicted emerging from the top of the 'question mark' block, frozen mid-air, as a result of the character hitting it.", "Include a ground level made of repeating brown square blocks, with a small gap in the ground that the character is currently jumping over.", "All blocks must have a simple 3D effect with shading on one side.", "Design a simple, 8-bit style walking mushroom enemy on the ground level, moving towards the spot where the character will land after their jump.", "Include a simple UI in the top-left corner of the screen, displaying a score ('SCORE: 005000') and a life count (a small pixelated icon of the character's head x 3).", "The background must have simple pixelated clouds and hills, reinforcing the side-scrolling video game perspective."], "prompt": "Write `svg` code for an image of an 8-bit video game level, similar to Super Mario Bros., with a character, blocks, and an enemy.", "id": "write_`svg`_code"} -{"requirements": ["Draw a large, deep ceramic bowl, viewed from a slightly angled perspective to show all ingredients clearly.", "Fill the bowl with a rich-looking, opaque broth, with highlights to give it a wet, glossy appearance.", "Depict a nest of wavy ramen noodles in the center of the bowl, with a pair of wooden chopsticks lifting a single noodle out of the broth.", "Include two slices of chashu pork with visible layers of meat and fat, with one slice partially submerged in the broth.", "Add a soft-boiled egg (ajitama) cut in half, revealing a bright orange, jammy yolk, nestled against the pork.", "Garnish with a pile of finely chopped green onions on one side of the bowl.", "Include a large, crisp sheet of nori (seaweed) standing upright behind the noodles and a small cluster of menma (bamboo shoots) next to the egg.", "The chopsticks should be resting on the rim of the bowl, with their tips pointing 
towards the noodles."], "prompt": "Write `svg` code for an image of a detailed bowl of Japanese ramen, with noodles, broth, a soft-boiled egg, and chashu pork.", "id": "write_`svg`_code"} -{"requirements": ["Draw a cat curled into a tight ball, sleeping soundly on a cushioned windowsill, with its body pressed lightly against the base of a potted plant.", "Include a window frame around the scene, viewed from inside a room with a warm and soft color palette.", "Draw vertical streaks and scattered droplets on the window pane to represent heavy rain.", "Create a small, circular patch of condensation on the glass where the cat's warm breath would be, slightly obscuring the view of the rain outside.", "The potted plant on the windowsill must have a single water droplet clinging to the tip of one of its leaves.", "The scene outside the window must be blurred and rendered in muted blues and grays to suggest a cold, overcast day, contrasting with the warm interior light.", "Add a subtle, distorted reflection of the room's interior, including the silhouette of a lamp, on the surface of the window glass."], "prompt": "Write `svg` code for an image of a cat sleeping on a windowsill next to a potted plant, with rain streaking down the window pane.", "id": "write_`svg`_code"} -{"requirements": ["Draw a Swiss Army knife with its main red casing having a glossy finish.", "The iconic white cross inside a red shield logo must be clearly visible and inlaid into the center of the casing.", "Show several tools extended from the knife's body in a fanned-out arrangement from a visible pivot point.", "The large knife blade must be fully extended.", "A pair of scissors must be included and shown slightly open.", "A corkscrew, a can opener, and a flathead screwdriver must also be included, fanned out at different angles from the blade.", "All tools must have a metallic, silver/gray appearance with sharp, specular highlights to suggest shininess.", "Place the knife on a neutral surface, 
casting a soft shadow beneath it that follows the shape of the knife and its extended tools."], "prompt": "Write `svg` code for an image of a Swiss Army knife with several tools extended.", "id": "write_`svg`_code"} -{"requirements": ["Draw a large, cone-shaped volcano, silhouetted against a dark night sky.", "The sky must be dark black and contain a crescent moon and scattered stars.", "Show a massive plume of smoke and ash billowing from the crater, where its underside is intensely illuminated with fiery reds and oranges from the eruption below.", "Depict bright, glowing red and orange lava erupting from the crater and being ejected high into the air as pyroclastic debris.", "Illustrate multiple rivers of molten lava flowing down the sides of the volcano, carving glowing paths through the dark rock and pooling at the mountain's base.", "The erupting lava must be the primary light source, casting a dramatic glow on the smoke plume and the slopes of the mountain.", "The stars near the bright smoke plume must be obscured or less visible due to the intense glow of the eruption.", "The contrast between the dark, unlit mountain/sky and the brilliant, glowing lava must be sharp and dramatic."], "prompt": "Write `svg` code for an image of a volcano erupting at night, with lava flowing down its side.", "id": "write_`svg`_code"} -{"requirements": ["Create a main rectangular board area with a light grey background color.", "Draw exactly three vertical columns with the headers 'To Do', 'In Progress', and 'Done' respectively.", "Populate the 'To Do' column with three rectangular white cards containing placeholder text.", "One card in the 'To Do' column must have a red 'Urgent' label.", "Populate the 'In Progress' column with one rectangular white card.", "The card in the 'In Progress' column must feature two circular user avatars, indicating it is assigned to two people.", "Populate the 'Done' column with two rectangular white cards.", "One card in the 'Done' column must have 
a green 'Completed' label and a paperclip icon, indicating a finished task with an attachment.", "Illustrate one card being dragged from the 'To Do' column towards the 'In Progress' column, positioned between the two columns with a slight rotation and a drop shadow to indicate it is actively being moved.", "Ensure consistent spacing and alignment between all columns and cards.", "Include the main board title 'Project Alpha' at the top of the image."], "prompt": "Write `svg` code for a screenshot of a Trello board with multiple columns and cards.", "id": "write_`svg`_code"} -{"requirements": ["Draw a cooked sausage (frankfurter) nestled inside a sliced hot dog bun.", "The bun must appear soft and lightly toasted, with its shape conforming to the sausage it holds.", "Add a wavy line of yellow mustard across the top of the sausage.", "Add a wavy line of red ketchup that intertwines and overlaps with the mustard line on top of the sausage.", "Show a small drip of yellow mustard that has fallen from the hot dog onto the paper plate below.", "Place the entire hot dog on a white paper plate with fluted/ridged edges.", "The hot dog must cast a slight shadow onto the surface of the plate to create depth.", "Use shading and highlights on the sausage and bun to give them a three-dimensional, rounded look.", "The condiments must appear as if they are sitting on top of the sausage, following its curved contour."], "prompt": "Write `svg` code for an image of a hot dog with mustard and ketchup in a bun, on a paper plate.", "id": "write_`svg`_code"} -{"requirements": ["Create an underwater scene with a blue water background that is light cyan at the top and gets progressively darker towards the sea floor.", "Draw a variety of colorful coral formations on the sea floor, including pink brain coral, orange staghorn coral, and purple sea fans.", "Include a large sea turtle as the central element, swimming towards a patch of sea grass near the coral.", "Depict a school of at least five 
small, yellow tang fish swimming in unison past the turtle.", "Show two orange-and-white striped clownfish peeking out from within the tentacles of a green sea anemone.", "Place a red starfish attached to a rock at the base of the coral formations.", "Illustrate rays of light filtering down from the water's surface, casting a dappled light pattern on the sea turtle's shell and the sea floor.", "One clownfish must be partially obscured by the anemone's tentacles.", "The overall composition must be vibrant and dense, with the turtle, fish, and coral overlapping to create a sense of depth."], "prompt": "Write `svg` code for an image of a coral reef teeming with colorful fish and a sea turtle.", "id": "write_`svg`_code"} -{"requirements": ["Draw a clear glass mason jar, complete with its characteristic screw-top threads and embossed lettering on its side.", "The jar must be transparent, with the marbles inside fully visible and their shapes slightly distorted by the curved glass.", "Add white, curved highlights and reflections on the jar's surface that follow its cylindrical shape.", "Fill the jar almost to the top with numerous overlapping and stacked spherical marbles.", "The marbles must include at least three distinct, visible patterns: solid blue, green and white swirled, and a classic 'cat's eye' with a colored vane inside.", "Place a single 'cat's eye' marble on the surface next to the jar, casting a small shadow.", "The jar itself must cast a faint, transparent shadow that is tinted by the colors of the marbles within it.", "Each marble, both inside and outside the jar, must have a small, sharp white highlight to indicate its glossy surface.", "Include dark contact shadows between the marbles where they touch each other and where they press against the inside of the jar to create a sense of volume and weight."], "prompt": "Write `svg` code for an image of a glass jar filled with colorful marbles.", "id": "write_`svg`_code"} -{"requirements": ["Depict a large, 
partially constructed pyramid with visible stone layers and an unfinished, flat top.", "Include a large earthen ramp spiraling up the side of the pyramid, leading to the current construction level.", "Show a group of at least five workers in ancient Egyptian loincloths pulling a large stone block up the ramp using thick ropes.", "The stone block must be resting on a wooden sledge.", "Depict another worker walking in front of the sledge, pouring water from a clay jug onto the sand of the ramp to reduce friction.", "On the top level of the pyramid, show two other workers using long wooden levers to pry another stone block into its final position next to an existing one.", "Several unused, rectangular stone blocks must be visible at the base of the ramp in the sand.", "The setting must be a vast desert landscape under a bright, clear blue sky with a harsh sun.", "The workers, the sledge, and the pyramid must cast long, dark shadows on the sand, consistent with the bright sun's position.", "The overall color palette must consist of sandy yellows, stone greys, and sky blues."], "prompt": "Write `svg` code for an image of an ancient Egyptian pyramid under construction, with workers moving large stone blocks.", "id": "write_`svg`_code"} -{"requirements": ["Draw a single rose flower head in full bloom, with vibrant red petals that overlap in a natural, spiral-like formation.", "Use shading and gradients on the petals to create depth and a velvety texture.", "Place a single, clear droplet of water on the edge of one of the outer petals, showing refraction of the red petal color within it and a sharp highlight.", "Include a long, slender green stem connected to the base of the flower head, with several small, sharp thorns protruding from it.", "Attach exactly two green leaves to the stem, each with serrated edges and visible veins.", "One of the leaves must have a small, irregular hole in it, as if from an insect bite.", "Show a single red petal that has fallen from the 
flower and is lying on the surface near the base of the stem.", "The rose and the fallen petal must cast a soft shadow on a simple, light-grey background."], "prompt": "Write `svg` code for an image of a single red rose with a long stem and thorns.", "id": "write_`svg`_code"} -{"requirements": ["Replicate the general layout of the YouTube homepage within a rectangular frame with a dark mode theme.", "Include a header section at the top with a dark grey background.", "The header must contain the white YouTube logo (play icon and text), a central dark search bar with a search icon, and user-related icons on the right (create, notifications, profile avatar).", "Include a collapsible sidebar on the left with navigation links and icons (Home, Shorts, Subscriptions, Library), with the 'Home' icon and text highlighted to indicate the current page.", "The main content area must be a grid of at least six video thumbnails.", "Each thumbnail must be a rectangle containing a placeholder image, with a small box in the corner indicating video length (e.g., '10:32').", "Below each thumbnail, include a circular channel avatar, a placeholder for the video title on one line, and the channel name and view count on a second line.", "One of the video thumbnails must show a progress bar at the bottom, indicating it has been partially watched.", "Use the official YouTube color scheme: red (#FF0000) for highlights like the logo and progress bar, and shades of dark grey and white for the UI."], "prompt": "Write `svg` code for a screenshot of the YouTube homepage, showing video thumbnails and a sidebar.", "id": "write_`svg`_code"} -{"requirements": ["Illustrate a stack of exactly four books, arranged vertically but slightly askew so they don't line up perfectly.", "Depict the books as old and leather-bound, using colors like dark brown, burgundy, and forest green for the covers.", "Show the spines of the books, with the top book's spine featuring raised bands and faded gold-leaf lettering 
for a title.", "The visible page edges must be a yellowish, aged color, with thin horizontal lines to represent individual pages.", "The book at the bottom of the stack must be larger and thicker than the others, forming a stable base.", "Use subtle textures and scuff marks on the leather covers and corners to indicate wear and tear.", "Render the stack in a 3D perspective, showing the top cover of the highest book and the side spines and page edges of all four.", "The entire stack must cast a soft shadow on the surface it is resting on."], "prompt": "Write `svg` code for an image of a stack of old, leather-bound books.", "id": "write_`svg`_code"} -{"requirements": ["Draw a chameleon with its characteristic features: a curled tail, a head crest, and a prominent, independently rotating eye.", "Position the chameleon on a tree branch that extends diagonally across the image.", "The branch must have a rough brown bark texture and several green leaves attached to it.", "Illustrate the camouflage effect by having the chameleon's skin pattern and color actively blending into the branch and leaves it is touching.", "The rear half of the chameleon's body and its back legs, which are on the bark, must mimic the brown, rough texture of the branch.", "The front half of the chameleon's body and its head, which are near the leaves, must mimic the green color and vein patterns of the leaves.", "Show a visible, soft gradient transition on the chameleon's torso where the brown bark pattern blends into the green leaf pattern.", "The chameleon must be in a realistic clinging pose, with its zygodactyl feet gripping the branch firmly."], "prompt": "Write `svg` code for an image of a chameleon on a branch, changing its color to match the leaves.", "id": "write_`svg`_code"} -{"requirements": ["The background must be a solid, dark 'blueprint blue' color (#000080).", "All lines and text must be white.", "Depict a top-down floor plan of a small house with at least two bedrooms and one 
bathroom.", "Use thick lines for exterior walls and thinner lines for interior walls.", "Show openings for doors and windows within the walls; doors must be indicated with a line and a quarter-circle arc showing the swing direction into the corresponding room.", "Include labels in all caps for each room: 'KITCHEN', 'BEDROOM 1', 'BEDROOM 2', 'LIVING ROOM', and 'BATH'.", "Add exterior dimension lines with measurement annotations (e.g., '30ft') along the outside of the walls.", "Add interior dimension lines to show the size of 'BEDROOM 1'.", "Include schematic outlines of key furniture: a bed and closet in each bedroom, a sofa in the living room, a toilet and shower in the bathroom, and kitchen counters with a sink.", "The kitchen counters must connect to the living room in an open-plan layout.", "Incorporate a title block in the bottom-right corner with text for 'Project Name: 'Small House'', 'Drawing: 'Floor Plan'', and 'Scale: '1/4\" = 1ft''."], "prompt": "Write `svg` code for an image of a detailed architectural blueprint of a small house.", "id": "write_`svg`_code"} -{"requirements": ["Draw a white bowl.", "Fill the bowl with a mound of spaghetti noodles, depicted with many overlapping, curved yellow lines.", "Cover the spaghetti with a generous amount of red tomato sauce that drips down the sides of the noodle mound.", "Add highlights to the sauce to give it a glossy appearance.", "Place exactly three round, brown meatballs on top of the spaghetti, nestled in the sauce.", "Add a sprinkle of green specks (parsley) over the dish, with some specks also landing on the meatballs and the rim of the bowl.", "Show a metallic fork (grey with highlights) actively twirling a small portion of spaghetti noodles, lifting them slightly from the bowl.", "The twirled noodles on the fork must be coated in sauce and have one of the meatballs caught in the twirl.", "The main mound of spaghetti must show an indentation where the fork has lifted the noodles from."], "prompt": "Write 
`svg` code for an image of a plate of spaghetti and meatballs, with a fork twirling some noodles.", "id": "write_`svg`_code"} -{"requirements": ["Design a spaceship with a sleek, futuristic aesthetic, featuring smooth curves and panel lines on its silver, metallic-looking surfaces.", "Show the spaceship with its landing gear deployed and resting on the ground, with dust and small rocks kicked up around the landing struts.", "Include a cockpit window through which a faint silhouette of a pilot is visible, engine exhausts that are still glowing faintly red from the landing, and several active glowing blue lights on the hull.", "The setting is a barren alien planet with a surface composed of red soil and scattered purple rocks.", "Depict two moons of different sizes visible in the sky.", "The sky must be a dark purple, with its color reflecting off the silver hull of the spaceship.", "The light from the larger of the two moons must cast a long, dramatic shadow of the spaceship across the red soil, with the smaller purple rocks also casting their own distinct shadows."], "prompt": "Write `svg` code for an image of a sleek, futuristic spaceship landing on a barren alien planet with two moons.", "id": "write_`svg`_code"} -{"requirements": ["The image must be a close-up (macro) view of a spiderweb.", "Draw the web with a classic orb-weaver structure: radial support lines originating from a center point, and a spiral of thinner capture threads.", "The web's threads should be thin and delicate.", "Scatter multiple small, circular dewdrops along the threads of the web.", "Each dewdrop must be rendered as translucent, showing the web lines behind them as slightly distorted or refracted.", "Add a small, white highlight to each drop, with all highlights consistently placed to indicate a single, low light source from the early morning sun.", "Include a small gnat trapped in one of the web's spiral threads, with the thread pulling taut from the insect's weight.", "Use a soft, 
blurred, out-of-focus background of green foliage to make the web and dewdrops stand out."], "prompt": "Write `svg` code for an image of a spiderweb with dewdrops on it, seen up close.", "id": "write_`svg`_code"} -{"requirements": ["Create a rectangular frame representing a smartphone screen.", "Design a clean, modern User Interface (UI) for a weather application.", "At the top, display the current location as 'San Francisco', the current temperature as '68°' in a large font, and the weather description as 'Sunny'.", "Include a large, clear sun icon next to the current conditions, matching the 'Sunny' description.", "The UI background must be a light blue gradient, reflecting the current 'Sunny' weather condition.", "Below the current conditions, display a horizontal 5-day forecast section.", "The first day of the forecast (MON) must show a sun icon with temps '72° / 55°'.", "The second day (TUE) must show a sun-and-cloud icon with temps '69° / 54°'.", "The third day (WED) must show a cloud icon with temps '65° / 52°'.", "The fourth and fifth days (THU, FRI) must show rain drop icons with temps '62° / 50°' and '60° / 49°' respectively, showing a clear progression of weather.", "Use a legible, sans-serif font throughout the UI."], "prompt": "Write `svg` code for a screenshot of a weather app UI, showing a 5-day forecast with icons for sun, clouds, and rain.", "id": "write_`svg`_code"} -{"requirements": ["Draw a three-story brick building with multiple windows on its facade.", "Show visible orange, red, and yellow flames and dark smoke billowing from a second-story window.", "Depict exactly two firefighters in full protective gear (helmet, coat, pants, boots).", "A ladder must extend from a partially visible red fire truck to a third-story window, where one firefighter is positioned, preparing to enter.", "The second firefighter must be on the ground, aiming a fire hose towards the flaming second-story window.", "A thick fire hose must connect the firefighter on the 
ground to the fire truck.", "Show a powerful stream of water spraying from the hose's nozzle, arcing up and entering the flaming window.", "The scene must be set against a dark night sky, where the orange glow from the fire illuminates the side of the building, both firefighters, the ladder, and the stream of water."], "prompt": "Write `svg` code for an image of a group of firefighters putting out a fire on a multi-story building.", "id": "write_`svg`_code"} -{"requirements": ["Draw a ceramic-style coffee cup with a handle.", "Place the cup on a matching saucer.", "Fill the cup with a two-toned liquid representing cappuccino: a dark brown coffee base and a lighter, creamy foam top.", "Create a distinct heart shape in the center of the foam using the darker coffee color, recognizable as latte art.", "Position the view from a slight angle to clearly display the heart design and the side of the cup.", "Add a small silver spoon resting on the saucer, with its reflection slightly visible on the side of the cup.", "Include subtle wisps of steam rising from the cup, with the heart art slightly distorting the path of the steam rising directly above it."], "prompt": "Write `svg` code for an image of a cup of cappuccino with latte art in the shape of a heart.", "id": "write_`svg`_code"} -{"requirements": ["Draw a thick, ancient-looking book with a decorative leather cover.", "The book must be open, showing two pages with diagrams and symbols.", "Place the open book on top of an ornate stone pedestal.", "Illustrate several runes floating in the air directly above the open pages.", "The runes must have a visible glow effect, casting a colored light down onto the pages of the book.", "One of the symbols on the book's page must match one of the glowing runes floating above it.", "The book itself must emit a faint glow that illuminates the top surface of the pedestal it rests on.", "Use a dark, atmospheric background of a stone chamber to make the glowing elements stand out."], 
"prompt": "Write `svg` code for an image of a wizard's spellbook open on a pedestal, with glowing runes floating above it.", "id": "write_`svg`_code"} -{"requirements": ["Frame the entire image with the opening of a camping tent, creating a first-person perspective from inside, with the tent's fabric, seams, and zipper visible as the frame.", "In the foreground, show the edge of a red sleeping bag and a green backpack, establishing the interior of the tent.", "Outside the tent, depict a lit campfire with visible logs and bright orange and yellow flames.", "The campfire must cast a warm, flickering glow on the ground in front of the tent and on the visible parts of the tent's opening.", "The background must be a dark night sky populated with numerous small dots representing stars.", "Include silhouettes of pine trees against the starry sky.", "The perspective must be low, as if lying down inside the tent on the sleeping bag, looking out past the campfire to the sky."], "prompt": "Write `svg` code for an image of the view from inside a tent, looking out at a campfire and a starry night sky.", "id": "write_`svg`_code"} -{"requirements": ["Draw the main base of the record player with a wood grain finish.", "Include a circular platter on the base, on which a black vinyl record is placed.", "The record must have visible concentric grooves and a red center label.", "Illustrate a tonearm with a headshell and cartridge, positioned so the stylus (needle) is resting within one of the grooves on the record's surface.", "Include control elements: a power knob that is in the 'on' position and a speed selector set to '33' rpm.", "Show an open, transparent dust cover hinged at the back of the base, with a slight reflection of the tonearm visible on its surface.", "Depict small, stylized musical notes floating up from the record to indicate that music is playing."], "prompt": "Write `svg` code for an image of a classic vinyl record player with a record on the turntable.", "id": 
"write_`svg`_code"} -{"requirements": ["Illustrate an athletic figure in a dynamic tennis serving pose, wearing a white shirt and blue shorts.", "The player's body must be arched backwards, conveying coiled power.", "One arm must be extended upwards, having just tossed a yellow tennis ball into the air.", "The other arm must be holding a tennis racquet, swung high and captured at the moment just before it strikes the ball.", "The tennis ball must be positioned in the air slightly in front of and above the player, at the peak of the toss, perfectly aligned with the center of the raised racquet.", "Use motion lines trailing the racquet to suggest the high speed of its upward swing.", "Depict a portion of a blue tennis court, including the white baseline the player is standing behind, and the net in the background.", "The bright sun must cast a sharp, dynamic shadow of the player and their raised racquet onto the court surface."], "prompt": "Write `svg` code for an image of a tennis player in the middle of a powerful serve.", "id": "write_`svg`_code"} -{"requirements": ["Create a diagram illustrating the four distinct stages of the butterfly life cycle on a single host plant.", "Stage 1: Show a cluster of small eggs on a green leaf.", "Stage 2: Show a caterpillar actively eating the edge of the same leaf where remnants of the hatched eggs are visible.", "Stage 3: Show a chrysalis (pupa) hanging from a twig directly above the leaf from Stage 2.", "Stage 4: Show a fully formed adult butterfly with patterned wings, positioned next to the now-empty chrysalis casing from which it has emerged.", "Arrange the four stages in a logical circular sequence on the plant.", "Use arrows to connect the stages in the correct order: from the eggs to the caterpillar, from the caterpillar to the chrysalis, and from the chrysalis to the butterfly.", "An arrow must go from the butterfly back towards a fresh leaf on the plant, as if to lay new eggs, visually completing the cycle.", "Each 
stage must have a clear text label pointing to the relevant part of the plant: 'Eggs', 'Caterpillar', 'Chrysalis', 'Butterfly'."], "prompt": "Write `svg` code for a diagram showing the life cycle of a butterfly, from egg to caterpillar to chrysalis to adult.", "id": "write_`svg`_code"} -{"requirements": ["Draw a research-style submersible, not a military one, exploring a deep-sea trench.", "The submersible must have a main viewport, external lights, and at least one robotic arm extended towards the seafloor.", "Show bright beams of light emanating from the submersible's lights, directly illuminating an anglerfish in the foreground.", "Create a dark, deep-sea trench environment with rocky walls and a seafloor populated with a cluster of glowing tube worms.", "The robotic arm must be positioned as if it is about to collect a rock sample from next to the tube worms.", "Include at least one anglerfish, with its characteristic glowing lure, caught within the submersible's main beam of light.", "The overall scene must be very dark, with light originating only from the submersible, the anglerfish's lure, and other bioluminescent life.", "Depict floating particles in the water to create a sense of depth and murkiness.", "Add several bioluminescent jellyfish floating in the mid-ground between the submersible and the trench wall."], "prompt": "Write `svg` code for an image of a submarine exploring a deep-sea trench with glowing anglerfish nearby.", "id": "write_`svg`_code"} -{"requirements": ["Create a UI window that resembles a desktop application, with a title bar and window controls (minimize, maximize, close).", "Display a monthly calendar view for 'October 2023' inside the window.", "Include a header with the month and year and navigation arrows. 
The 'next month' arrow must be depicted in a hovered or pressed state.", "Lay out a grid for the days of the month, with headers for the days of the week (e.g., S, M, T, W, T, F, S).", "Populate the grid cells with date numbers for October 2023.", "Fill one date cell with a colored block and the text 'Project Deadline'.", "Fill a separate three-day span of consecutive dates with a single colored block labeled 'Team Offsite'.", "Highlight the 'Project Deadline' date cell with a circular outline to represent it as the 'current day'.", "Add one other event on a different day labeled 'Team Sync'.", "The design should be clean and modern, typical of a calendar app."], "prompt": "Write `svg` code for a screenshot of a calendar application, showing a monthly view with several events scheduled.", "id": "write_`svg`_code"} -{"requirements": ["Draw a large, medieval-style catapult made of wood with visible wood grain texture and metal fittings.", "The catapult's structure must include a sturdy base frame and a long throwing arm, powered by a large counterweight.", "Depict the catapult in the middle of the launch action: the counterweight must be shown near the bottom of its downward swing, while the throwing arm has just reached the apex of its upward swing.", "A large, round boulder must be shown in mid-air, having just left the catapult's sling, following a clear trajectory.", "Use motion lines to convey the rapid movement of the throwing arm and the flight path of the boulder.", "The scene must be set on a muddy field, with disturbed ground around the catapult's base to suggest the force of the launch.", "Include a stone castle wall in the distant background, positioned as the clear target for the boulder's trajectory."], "prompt": "Write `svg` code for an image of a medieval catapult launching a boulder.", "id": "write_`svg`_code"} -{"requirements": ["Create a primary circular plate to serve as the base for the food.", "Include several pieces of nigiri sushi, 
differentiating toppings by color (red for tuna, orange for salmon). One piece of salmon nigiri must have a small dab of green wasabi on top.", "Add at least one type of maki roll (sushi roll), showing the outer layer of nori (seaweed) and the cross-section of rice and fillings.", "Include a few slices of sashimi (raw fish without rice), arranged artfully on the plate, with a visible empty space where one piece of nigiri was removed.", "Place a small, shallow bowl on the side, filled with dark brown soy sauce that has ripples on its surface.", "Include a pair of chopsticks, positioned to be actively lifting a piece of tuna nigiri from the plate. The fish on the lifted nigiri must be slightly darkened at the tip, as if it has just been dipped in the soy sauce.", "Include a small mound of green wasabi and a pile of pink pickled ginger (gari) as garnishes on the plate.", "Arrange all elements in a visually appealing composition, focusing on the action of eating.", "The overall image should have a clean and fresh aesthetic."], "prompt": "Write `svg` code for an image of a plate of sushi and sashimi, with chopsticks and a small bowl of soy sauce.", "id": "write_`svg`_code"} -{"requirements": ["Draw a tall, cylindrical lighthouse tower with red and white horizontal stripes, positioned on top of a dark, jagged, rocky cliff.", "Show the lantern room at the top of the lighthouse with a visible, glowing light source inside.", "Create a powerful beam of light, depicted as a solid, yellow, trapezoidal shape, emanating from the lantern.", "The light beam must cut across the scene and directly illuminate a specific, treacherous-looking rock jutting out of the water.", "Establish a night scene with a dark blue sky, where a faint crescent moon and a few stars are partially visible through thin fog.", "Incorporate a fog effect using semi-transparent white shapes, which is thickest around the base of the cliff and thins out towards the sky.", "Depict dark, churning water with large, 
white-capped waves shown actively crashing against both the base of the cliff and the illuminated rock."], "prompt": "Write `svg` code for an image of a lighthouse on a rocky cliff, with its light beam cutting through a foggy night.", "id": "write_`svg`_code"} -{"requirements": ["Illustrate two strands twisting around a central axis to form a right-handed double helix with clear 3D perspective.", "The two outer strands must represent the sugar-phosphate backbones, depicted as smooth, continuous helical lines.", "Connect the two backbones with horizontal rungs representing the base pairs.", "Use four distinct colors for the nucleobases: Adenine (e.g., blue), Guanine (e.g., red), Cytosine (e.g., yellow), and Thymine (e.g., green).", "The base pairing must be consistently shown, so that the color for Adenine always pairs with the color for Thymine, and the color for Cytosine always pairs with the color for Guanine.", "Clearly show the major and minor grooves created by the helical twist, and add labels with lines pointing to 'Major Groove' and 'Minor Groove'.", "Add labels with lines pointing to the 'Sugar-phosphate backbone' and a 'Base pair'.", "In one section of the helix, magnify a single C-G pair to explicitly label the 'C' and 'G' on their respective colored shapes to reinforce the pairing rule.", "Maintain a clean, scientifically recognizable, diagrammatic style."], "prompt": "Write `svg` code for an image of a DNA double helix strand.", "id": "write_`svg`_code"} -{"requirements": ["The scene's focal point must be a multi-layered birthday cake on a table, with several lit candles on top.", "Include a group of at least three stylized people around the table, all wearing colorful, conical party hats.", "One person must be shown leaning forward over the cake, with puffed cheeks, in the act of blowing out the candles. 
Faint motion lines should emanate from their mouth towards the candle flames.", "The other people must be looking at the person blowing out the candles, with expressions of excitement or cheering.", "Add a bunch of colorful balloons with strings floating in the background, with one string shown leading to the hand of one of the people.", "Decorate the scene with festive streamers hanging from the ceiling and confetti scattered on the table around the cake.", "Place a few wrapped gift boxes on the table, one of which is partially unwrapped with ribbon trailing onto the table.", "Use a bright and cheerful color palette to convey a celebratory mood in an indoor party room."], "prompt": "Write `svg` code for an image of a birthday party scene with a cake, balloons, and people wearing party hats.", "id": "write_`svg`_code"} -{"requirements": ["Draw the open case of a desktop PC tower, showing the internal components from a perspective view.", "Include a large motherboard as the main circuit board, serving as the base for other components.", "Show the CPU socket on the motherboard, covered by a large heatsink and a spinning fan assembly indicated by motion lines.", "Illustrate at least two RAM sticks slotted into the motherboard.", "Include a dedicated GPU card plugged into a PCI-e slot on the motherboard, with its own two cooling fans.", "Depict the PSU in its housing, with a bundle of colored wires extending from it and connecting to the motherboard and GPU.", "Show one rectangular HDD and one flatter SSD, both connected to the motherboard with visible SATA data cables.", "Draw power cables from the PSU connecting to the motherboard's main power connector, the CPU power connector, and the GPU.", "Add clear labels with lines pointing to each major component: 'Motherboard', 'CPU Cooler', 'GPU', 'RAM', 'PSU', 'HDD', and 'SSD'.", "Use a clear, technical diagram style with clean lines to show how the components are interconnected."], "prompt": "Write `svg` code for an image 
of a detailed diagram of the internal components of a desktop computer.", "id": "write_`svg`_code"} -{"requirements": ["Draw a large, mature oak tree with a thick, textured trunk and wide, spreading branches.", "Construct a rustic wooden treehouse, made of planks, nestled among and structurally supported by the tree's branches.", "The treehouse must have a simple roof, a window with a small flower box on its sill, and a door.", "A rope ladder with wooden rungs must hang from the treehouse entrance down to the ground.", "A tire swing must be shown hanging by a rope from a sturdy, lower branch of the same oak tree.", "A small, red flag must be attached to the peak of the treehouse roof.", "The scene must be set in a green, grassy backyard, with the base of the tree's trunk clearly visible in the grass.", "Use a bright color palette that suggests a sunny day, with a clear blue sky in the background."], "prompt": "Write `svg` code for an image of a treehouse with a rope ladder, nestled in a large oak tree.", "id": "write_`svg`_code"} -{"requirements": ["Depict a blacksmith character, shown with a strong build and wearing a work apron.", "The blacksmith must be holding a hammer in a raised position, positioned directly above a glowing sword blade on an anvil, as if about to strike.", "Place a classic-shaped, heavy anvil on a wooden stump in front of the blacksmith.", "On the anvil, place a sword blade that is glowing bright orange and yellow to indicate it is heated.", "Show sparks flying upwards from the specific point on the glowing blade where the hammer is about to make contact.", "In the background, include a forge with visible glowing coals and flames, which serves as a primary light source.", "The setting must be a dark, rustic workshop; a pair of tongs must be resting against the anvil's wooden stump.", "The forge and the glowing sword blade must be the only light sources, casting an orange glow on the side of the blacksmith and a bright yellow-orange light on 
his front and the top of the anvil, creating distinct shadows.", "The anvil must be dark and metallic, with its top surface reflecting the bright orange glow from the sword blade."], "prompt": "Write `svg` code for an image of a blacksmith at an anvil, hammering a glowing piece of metal.", "id": "write_`svg`_code"} -{"requirements": ["Create a multitude of nodes, represented as circles of varying sizes, organized into at least three distinct color-coded clusters (e.g., blue, green, red).", "Connect the nodes with a large number of lines (edges) to show interconnection.", "The graph must be arranged in a force-directed layout, creating a complex, organic, web-like structure.", "Within each cluster, a larger central hub node must be connected via thick lines to its smaller, peripheral nodes.", "Thinner lines must be used to connect the peripheral nodes to each other within the same cluster.", "A few thin, curved lines must bridge the different colored clusters, connecting peripheral nodes from one cluster to another to show cross-cluster interaction.", "The central hub nodes must be the largest in size, with many connections, while peripheral nodes are smaller with fewer connections.", "The curved lines must navigate around other nodes gracefully to avoid a messy appearance."], "prompt": "Write `svg` code for an image of a complex network graph with nodes and interconnected lines.", "id": "write_`svg`_code"} -{"requirements": ["Draw an old-fashioned steam locomotive as the main subject, viewed from a three-quarter perspective.", "The locomotive must have key features: a smokestack, a cowcatcher, large driving wheels with connecting rods, and a cab for the engineer.", "Show white steam puffing from the smokestack, trailing backward over the top of the first attached train car to indicate motion.", "Attach exactly two passenger cars behind the locomotive.", "The train must be positioned one-third of the way across a detailed wooden trestle bridge.", "The bridge's 
structure must show the crisscrossing wooden beams and supports of the trestles, and its reflection must be visible in the river below.", "The bridge must span a river flowing through a valley.", "The background must feature pine forests on rolling hills leading up to distant, snow-capped mountains.", "The three-quarter perspective must effectively show the length of the train and the scale of the bridge over the river."], "prompt": "Write `svg` code for an image of an old-fashioned steam train crossing a wooden trestle bridge.", "id": "write_`svg`_code"} -{"requirements": ["Create a rectangular frame representing a phone screen in dark mode, with a dark background and light text/icons.", "Include a large square area for the album art, which must be a graphic of a stylized sun setting over an ocean.", "Below the album art, display the song title 'Ocean Sunset' in a larger, bold font.", "Below the song title, display the artist name 'The Vectors' in a smaller font.", "Create a playback control bar at the bottom containing icon buttons for 'Previous', 'Pause', and 'Next'. The 'Pause' icon must be the most prominent, indicating the song is playing.", "Include a horizontal progress bar (scrubber) above the control buttons.", "The progress bar's handle must be positioned at the one-third mark to indicate the current playback position.", "Display the elapsed time timestamp '1:23' at the left end of the progress bar and the total duration '3:45' at the right end, corresponding to the handle's position.", "On the same line as the progress bar, include a 'Shuffle' icon on the left and a 'Repeat' icon on the right. 
The 'Shuffle' icon must be illuminated to indicate it is active."], "prompt": "Write `svg` code for a screenshot of a music player interface, like Spotify, showing album art and playback controls.", "id": "write_`svg`_code"} -{"requirements": ["Include a central figure of a chef wearing a traditional uniform (toque, jacket) with an expression of intense concentration.", "The chef must be in a dynamic, mid-action pose, bringing a chef's knife down in a chopping motion.", "The chef's knife must be positioned just above a carrot on a cutting board, at the peak of its downward chop.", "On the cutting board, show a whole carrot with several circular slices already cut and lying to its left.", "Depict at least three small, irregular pieces of the carrot in mid-air to the right of the knife, flying away from the point of impact.", "The setting must be a busy kitchen, with a metal bowl of other chopped vegetables (onions, celery) sitting on the counter next to the cutting board.", "Show a pot on a stove in the background with wavy, semi-transparent lines of steam rising from it.", "The steam must drift upwards and partially obscure a shelf of spices located behind the pot.", "The overall composition must convey a sense of action, with the chef's eyes focused directly on the point where the knife will meet the carrot."], "prompt": "Write `svg` code for an image of a chef in a busy kitchen, mid-chop, sending pieces of a carrot flying off a cutting board, with steam rising from pots in the background.", "id": "write_`svg`_code"} -{"requirements": ["Draw a scuba diver figure complete with gear: mask, regulator, air tank, and fins.", "The diver must be holding an underwater camera and aiming its lens directly at the eye of an octopus.", "The octopus must be positioned behind a piece of brain coral, with two tentacles wrapped around it and its head and one eye peeking out.", "The octopus's skin texture and color must mimic the bumpy, tan texture of the brain coral it is hiding 
behind, demonstrating camouflage.", "Create a detailed coral reef environment with varied shapes and colors of coral, rock, and a small, brightly colored clownfish swimming near the camouflaged octopus.", "The entire image must have a blue tint to simulate being underwater.", "Include light rays filtering down from the water's surface, illuminating the diver's back and casting a slight shadow over the area where the octopus is hiding.", "Show a continuous stream of bubbles rising from the diver's regulator, moving upwards and passing in front of a section of the background coral."], "prompt": "Write `svg` code for an image of a scuba diver using an underwater camera to take a picture of a shy octopus that is partially camouflaged against a coral reef.", "id": "write_`svg`_code"} -{"requirements": ["Design a visually complex Rube Goldberg machine with a cobbled-together, DIY aesthetic, using parts like ramps, levers, and scissors.", "A red marble must be shown at the end of a wooden ramp, making contact with one end of a see-saw-like lever.", "The lever must be tilted down on the side the marble has hit, and consequently tilted up on the opposite end.", "The rising end of the lever must be shown pushing a pin out from under a weight.", "The now-unsupported weight must be depicted falling downwards, pulling a cord taut.", "The cord must be attached to the handle of a pair of scissors, pulling the blades closed.", "The scissor blades must be shown halfway closed, with a taut red string positioned between them, moments from being severed.", "The chain of events—marble hitting lever, lever releasing weight, weight pulling cord, cord closing scissors—must be clearly and sequentially illustrated."], "prompt": "Write `svg` code for an image of a complex Rube Goldberg machine in action, where a falling marble has just triggered a lever, which is in the process of releasing a pair of scissors to cut a string.", "id": "write_`svg`_code"} -{"requirements": ["Depict two people, 
one young and one old, sitting opposite each other at a table.", "The young person's hand must be hovering directly over their white queen piece on an 8x8 chessboard.", "The chess pieces must be arranged in a late-game configuration where the white queen's next move results in checkmate.", "The old person's black king must be shown trapped on the board, with its potential escape squares blocked by other white pieces, such as a rook and a bishop.", "The young person must have a facial expression of confident triumph, with their eyes fixed on the opponent's trapped king.", "The old person must have a facial expression of sudden, defeated realization, with their wide eyes looking at their own trapped king.", "The perspective must be from slightly over the young person's shoulder, focusing attention on their hand, the white queen, and the checkmated black king.", "The queen piece must be clearly identifiable and the target of the player's action."], "prompt": "Write `svg` code for an image of two people, one young and one old, intensely focused on a chess game, where one player's hand is hovering over the queen to make a checkmate move.", "id": "write_`svg`_code"} -{"requirements": ["Show an alchemist's hands and forearms, wearing dark, rustic sleeves, holding a glass beaker.", "The alchemist must be tilting the beaker, pouring a stream of glowing blue liquid from it.", "The stream of glowing liquid must flow into a large, dark, metallic cauldron that is positioned over a crackling wood fire.", "The cauldron must contain a green potion, and at the point where the blue stream meets the green liquid, there must be a bright flash of white light indicating a reaction.", "The green potion must be bubbling violently, with the bubbling most intense at the point of contact with the blue liquid.", "Plumes of purple smoke must be rising from the cauldron, curling upwards to partially obscure a background shelf filled with glass jars.", "The alchemist's hands and the beaker must 
be illuminated by the blue glow of the liquid, while the front of the cauldron is illuminated by the orange fire beneath it, casting complex shadows on the stone wall behind."], "prompt": "Write `svg` code for an image of an alchemist pouring a glowing blue liquid from a beaker into a cauldron, causing the green potion inside to bubble violently and emit purple smoke.", "id": "write_`svg`_code"} -{"requirements": ["Draw a recognizable Formula 1 race car, stationary and lifted off the ground on front and rear jacks inside a pit box.", "Show multiple pit crew members in team uniforms in dynamic poses of urgent, precise action around the car.", "At the front-left wheel, depict one crew member removing the old wheel while another stands ready, holding the new wheel.", "At the rear-right wheel, depict a crew member using a pneumatic wheel gun to tighten the nut on a newly fitted wheel, with sparks flying from the gun's impact.", "A crew member on the right side of the car must have a large refueling hose firmly connected to the car's fuel port.", "The driver, wearing a helmet, must be visible in the cockpit with hands on the steering wheel, looking intently towards the pit lane exit.", "A 'lollipop man' crew member must be standing directly in front of the car, holding a sign that indicates 'Brakes On'.", "The scene must be set in a pit lane with appropriate ground markings and a pit wall gantry in the background."], "prompt": "Write `svg` code for an image of a pit crew in a Formula 1 race, simultaneously changing all four tires and refueling the car while the driver waits.", "id": "write_`svg`_code"} -{"requirements": ["Focus on a close-up view of a barista's hands, with one hand holding a ceramic coffee cup by its handle.", "The other hand must be gripping a stainless steel milk pitcher from the side, with the thumb on the handle for stability.", "The pitcher must be tilted, with its spout positioned directly over the center of the cup.", "Show a thin, controlled 
stream of white, steamed milk pouring from the pitcher's spout into the cup.", "The cup must contain dark brown liquid representing espresso, with a creamy layer of crema on top.", "On the surface of the crema, there must be a detailed latte art pattern in the shape of a fern (rosetta), which is nearly complete.", "The stream of milk must be shown connecting to the top of the rosetta, forming the final, delicate leaf of the fern pattern.", "The composition should be tightly cropped, showing parts of the barista's forearms and apron, to emphasize the action of pouring and the creation of the art."], "prompt": "Write `svg` code for an image of a barista pouring steamed milk from a metal pitcher into a cup of espresso, creating detailed latte art in the shape of a fern.", "id": "write_`svg`_code"} -{"requirements": ["Use a cutaway style to show the internal workings of a single engine cylinder.", "Include the main components: cylinder wall, piston, connecting rod, a portion of the crankshaft, and the cylinder head.", "The crankshaft must be shown rotating, causing the connecting rod to pull the piston downwards within the cylinder, representing the intake stroke.", "A downward-pointing arrow must be attached to the top of the piston to indicate its direction of motion.", "In the cylinder head, the intake valve must be shown fully open, while the exhaust valve is fully closed.", "Show a carburetor connected to the cylinder's intake port.", "Represent the fuel-air mixture as a blue-colored gas being drawn from the carburetor, flowing past the open intake valve, and filling the expanding space above the descending piston.", "A spark plug must be screwed into the top of the cylinder head, with its electrode visible inside the combustion chamber.", "The image must have the clean, technical look of a diagram with clear outlines and labels for the piston, crankshaft, and intake valve."], "prompt": "Write `svg` code for a cutaway diagram of a car engine where the piston is 
moving down during the intake stroke, drawing a fuel-air mixture in from a carburetor.", "id": "write_`svg`_code"} -{"requirements": ["Depict a glassblower's hands and forearms as the primary subject, with one hand wearing a protective glove.", "The gloved hand must be holding and rotating a long metal blowpipe.", "At the far end of the blowpipe, show a glowing, red-orange, molten glass bubble.", "The other hand must be holding a shaping tool made of a thick, wet, folded wad of newspaper.", "The newspaper tool must be pressed firmly against the side of the molten glass bubble, creating a visible indentation.", "A thick cloud of steam must be shown billowing up from the exact point of contact where the wet newspaper touches the hot glass.", "The indentation on the glass bubble must directly correspond to the shape of the newspaper tool pressing into it.", "The background must clearly show the glowing orange opening of a furnace (the glory hole), which is the source of light in the scene."], "prompt": "Write `svg` code for an image of a glassblower at the end of a blowpipe, shaping a molten glass bubble with a wet wad of newspaper, causing steam to rise.", "id": "write_`svg`_code"} -{"requirements": ["The image must be a close-up focusing on a pair of hands and forearms, covered in a texture representing wet, brown clay.", "The hands must be positioned around a lump of clay, with one hand inside the opening and the other shaping the exterior wall.", "The clay must be perfectly centered on the circular head of a potter's wheel, which is surrounded by a splash pan.", "The clay must be formed into the recognizable, in-progress shape of a vase, with a defined base, a swelling body, and a narrowing neck.", "Include concentric circular lines on the clay and the wheel head to indicate a rapid spinning motion.", "Show drips of watery clay slip running down the exterior of the vase and the potter's hands, pooling at the base of the clay on the wheel head.", "The background 
must be a simple, dark, out-of-focus wall to keep the focus on the hands and the creative process."], "prompt": "Write `svg` code for an image of a potter's hands, covered in clay, shaping a vase on a spinning potter's wheel.", "id": "write_`svg`_code"} -{"requirements": ["Include a mother bird, recognizable by adult plumage, perched on the edge of a nest.", "The nest must be depicted with a woven texture of twigs and grass.", "The nest must be securely situated in the fork of a tree branch, with green leaves framing the scene.", "There must be exactly three baby chicks inside the nest.", "The chicks should appear young and fluffy, with underdeveloped feathers.", "All three chicks must have their beaks wide open, necks stretched, and pointing upwards towards the mother bird in a hungry posture.", "The mother bird must be leaning over, with her beak positioned directly above one of the chick's open beaks.", "A pink worm must be clearly visible, held at its midpoint in the mother's beak, with one end of the worm just entering the chick's beak.", "The scene should be brightly lit to convey a sense of a sunny springtime day."], "prompt": "Write `svg` code for an image of a mother bird at her nest, placing a worm into the wide-open beak of one of her three hungry chicks.", "id": "write_`svg`_code"} -{"requirements": ["Depict an astronaut in a standard white Extravehicular Mobility Unit (spacesuit) with a golden-tinted helmet visor.", "The astronaut must be attached to the end of a multi-jointed robotic arm via a foot restraint, floating in a zero-gravity pose.", "The robotic arm must be positioned to hold the astronaut steady next to a large solar panel array.", "The astronaut must be holding a specialized repair tool in their gloved hands and actively applying it to a visible tear in the solar panel.", "The solar panel must have a distinct tear, with jagged edges.", "A portion of the truss structure of the International Space Station (ISS) must be visible, serving as 
the base for the robotic arm.", "The blue and white curve of the Earth must be prominent in the background.", "The background must be the blackness of space with a scattering of stars.", "A coiled safety tether must be clearly visible, with one end clipped to the astronaut's suit and the other end attached to the ISS structure."], "prompt": "Write `svg` code for an image of an astronaut on a spacewalk, using a robotic arm to repair a damaged solar panel on the International Space Station, with the Earth visible below.", "id": "write_`svg`_code"} -{"requirements": ["Focus on the hands of a gardener, one of which is wearing a gardening glove.", "Show a mature branch of a tree, representing the rootstock, which has a clean 'V'-shaped notch cut into it.", "Include a smaller, separate twig (the scion) with several visible buds on it.", "The base of the scion must be cut into a wedge shape that fits perfectly into the rootstock's 'V'-shaped notch.", "Depict the gardener's hands holding the scion firmly in place within the rootstock's notch, ensuring the cambium layers align.", "Show a strip of green grafting tape being wrapped tightly by the gardener's fingers around the union point, holding the two pieces together.", "Show that some grafting wax has already been applied from a small tin to seal the top cut-end of the scion.", "A sharp, clean grafting knife must be visible resting on the rootstock branch next to the graft site.", "The background must show out-of-focus rows of other trees, suggesting an orchard setting."], "prompt": "Write `svg` code for an image of a gardener carefully grafting a branch from an apple tree onto a different rootstock tree, with grafting tape and wax visible.", "id": "write_`svg`_code"} -{"requirements": ["Depict a monk in a traditional brown, hooded medieval robe, seated at a slanted wooden writing desk.", "The monk's hand must be holding a white feather quill, poised over an open manuscript.", "The tip of the quill must be positioned just 
above the manuscript page, with a single, dark drop of ink visible on the nib, about to touch the parchment.", "On the desk, there must be an open manuscript page which features a large, ornate, 'illuminated' letter 'I' decorated with gold leaf and intricate vines.", "The quill must be positioned directly after the illuminated letter, ready to write the next character on a pre-drawn ruled line.", "The setting must be a stone room, and a distinct beam of dusty light from an arched window must be shown falling across the desk, illuminating the manuscript and the monk's hands.", "Next to the open manuscript, there must be an open inkwell and a small pile of stacked, leather-bound books."], "prompt": "Write `svg` code for an image of a medieval monk in a scriptorium, dipping a quill into an inkwell, about to write on an illuminated manuscript.", "id": "write_`svg`_code"} -{"requirements": ["Include exactly three children on top of a grassy hill.", "The children must be depicted in active, cooperative poses: one child is holding the kite string reel, a second child is guiding the taut string with their hands, and the third is pointing up excitedly.", "Show a large, elaborate kite in the shape of a green dragon high in the sky.", "The dragon kite must have a long, segmented tail that is flowing and rippling in the wind.", "A single, taut kite string must be visible, connecting from the kite down to the reel held by the first child.", "The setting is the crest of a green, rolling hill.", "The wind must be visually represented by having the children's hair, their loose clothing, the kite's tail, and the blades of grass all blowing in the same direction.", "The blue sky must have several puffy white clouds that appear to be moving quickly.", "All three children's expressions must be joyful and their gaze directed upwards at the kite."], "prompt": "Write `svg` code for an image of a group of children working together to fly a large, elaborate dragon kite on a windy day.", 
"id": "write_`svg`_code"} -{"requirements": ["Create a recognizable web browser window with a frame, three tabs (with the Google Drive tab being active), and an address bar showing a 'drive.google.com' URL.", "The content of the browser window must be the Google Drive user interface, showing a grid of folders with names like 'Photos', 'Work', and 'Vacation 2023'.", "A portion of a blurred nature photograph desktop background must be visible behind the browser window.", "Include a standard image file icon labeled 'Mountain.jpg' on the desktop area.", "Show a mouse cursor (arrow pointer) positioned over the browser window.", "The cursor must be depicted as 'dragging' the 'Mountain.jpg' file icon; the icon should appear semi-transparent and be located directly beneath the cursor's tip.", "The cursor and the dragged file icon must be positioned directly over the 'Photos' folder within the Google Drive interface.", "The 'Photos' folder must be highlighted with a blue border and a slightly changed background color to indicate it is the active drop zone, a direct result of the cursor's position.", "The overall image must clearly represent the user action of dragging a file from the desktop to a specific cloud storage folder."], "prompt": "Write `svg` code for a screenshot of a user dragging and dropping a file from their desktop into a Google Drive folder in a web browser.", "id": "write_`svg`_code"} -{"requirements": ["Include a person dressed in a veterinarian's lab coat over blue scrubs.", "The veterinarian should be holding a stethoscope, with the earpieces in their ears and the chest-piece placed on a dog's chest.", "The veterinarian's free hand must be resting reassuringly on the dog's back.", "The dog must be a golden retriever, positioned on a stainless steel veterinary examination table.", "The dog must appear calm, with its head turned towards its owner.", "Include a second person, the dog's owner, standing beside the table.", "The owner's right hand must be 
extended, holding a visible dog treat, which the dog is sniffing.", "The owner's left hand must be gently stroking the dog's head.", "The setting must be identifiable as a vet's office, with a clean, tiled background and a cabinet with medical supplies visible.", "The overall mood must be calm and caring, emphasized by the physical contact between all three subjects."], "prompt": "Write `svg` code for an image of a veterinarian listening to a dog's heartbeat with a stethoscope, while the dog's owner offers it a treat.", "id": "write_`svg`_code"} -{"requirements": ["Depict a red and black bowling ball at the end of a polished wooden bowling lane.", "The image must capture the exact moment of impact between the ball and the front-most (#1) pin.", "Show the full set of ten bowling pins, with the #1 pin shattering into pieces from the powerful impact.", "A dynamic starburst effect must emanate from the point of impact between the ball and the #1 pin.", "The impact must be shown causing the adjacent #2 and #3 pins to begin tilting backwards, starting a chain reaction.", "The other seven pins must be standing but showing slight vibrations.", "Incorporate sharp speed lines trailing the bowling ball to indicate it was thrown at high speed.", "The perspective must be low and close to the pins, looking down the lane, to heighten the drama.", "Include the pin deck and dark gutters of the bowling lane, with reflections from the polished wood."], "prompt": "Write `svg` code for an image of a bowling ball just as it makes impact with the front pin, sending it flying back into the others.", "id": "write_`svg`_code"} -{"requirements": ["The image must be a close-up on a bomb-like device composed of a bundle of dynamite sticks wrapped in tape.", "The device must feature a bundle of multi-colored wires (red, blue, green, yellow) leading to a timer.", "A prominent red digital timer must be part of the device, clearly displaying the numbers '0:07'.", "Show a pair of hands wearing 
thick, black, protective bomb-disposal gloves.", "One hand must be holding a pair of wire cutters, with its blades actively cutting the red wire.", "The red wire must be shown partially severed, with a small white and yellow spark at the point of the cut.", "The other gloved hand must be steadying the bundle, with its fingers separating the red wire from the adjacent blue and green wires.", "The scene must be tense and focused, with a dark, out-of-focus background to isolate the action.", "The composition must create a tight focal point on the interaction between the wire cutters, the sparking red wire, and the timer."], "prompt": "Write `svg` code for an image of a bomb disposal expert cutting the red wire on a complex-looking bomb with a timer that reads \"0:07\".", "id": "write_`svg`_code"} -{"requirements": ["The image must be a diagrammatic cross-section of a plant, bisected by a horizontal line representing the ground.", "The below-ground section must show a root system spreading into dark brown soil containing blue water particles and brown nutrient particles.", "Blue arrows must originate at the water particles, enter the root tips, and travel up a channel (xylem) in the stem.", "The above-ground section must show the plant's stem, green leaves, and a yellow flower.", "Include a bright yellow sun in the top-left corner.", "Yellow arrows must represent sunlight traveling from the sun and pointing to the surfaces of the leaves.", "Gray arrows labeled 'CO2' must point from the air into the leaves.", "Light blue arrows labeled 'O2' must point from the leaves out into the air.", "The leaves must contain small green dots representing chloroplasts, where the sunlight arrows terminate.", "A second set of orange arrows labeled 'Sugars' must originate in the leaves and travel down a channel (phloem) in the stem towards the roots, showing the distribution of energy from photosynthesis."], "prompt": "Write `svg` code for a cross-section of a plant, showing the roots 
absorbing water from the soil and the leaves using sunlight for photosynthesis.", "id": "write_`svg`_code"} -{"requirements": ["Include the head and upper torso of a watchmaker, with a wrinkled brow to show intense concentration.", "A brass watchmaker's loupe must be fitted over the watchmaker's right eye.", "The reflection of the watch mechanism and tweezers must be visible on the lens of the loupe.", "The watchmaker's left hand must be steadying the casing of an open mechanical watch.", "The right hand must be holding a pair of fine-tipped tweezers, which are gripping a single, tiny brass watch gear.", "The open mechanical watch must reveal an intricate interior of interlocking silver and brass gears, springs, and red jewel bearings.", "The gear held by the tweezers must be positioned directly above an empty axle in the mechanism, fractions of a millimeter from being seated.", "A focused cone of light from an overhead desk lamp must illuminate the watch, the hands, and the tools.", "The background must be a dark, out-of-focus workshop to draw all attention to the detailed foreground action."], "prompt": "Write `svg` code for an image of a watchmaker using a loupe and fine tweezers to place a tiny gear into the intricate mechanism of a mechanical watch.", "id": "write_`svg`_code"} -{"requirements": ["Depict a child's face in three-quarter view, with puffed cheeks and puckered lips.", "The child must be holding a plastic bubble wand, with the ring held to their lips.", "A single, large bubble must be emerging from the wand, still connected to the soapy film in the wand's loop.", "The surface of the half-formed bubble must be transparent and show swirling, iridescent, rainbow-like colors.", "The distorted reflection of the child's face must be visible on the surface of the bubble.", "Use varying levels of transparency and opacity to make the bubble look delicate.", "The background must be a bright, sunny day in a grassy field with a clear blue sky.", "Include two 
fully-formed, iridescent bubbles floating away in the background, with distorted reflections of the sky and grass on their surfaces."], "prompt": "Write `svg` code for an image of a child blowing a bubble with a wand, with the bubble half-formed and showing iridescent reflections.", "id": "write_`svg`_code"} -{"requirements": ["Illustrate a blacksmith figure with muscular arms, wearing a heavy leather apron over a simple shirt.", "The blacksmith must be holding a long pair of tongs, gripping a sword by its tang (the part that goes into the hilt).", "The entire blade of the sword must be glowing a bright orange-yellow, indicating it is white-hot.", "The tip of the glowing sword is just clearing the opening of a brick forge, which is filled with glowing red and orange embers.", "An anvil must be positioned in front of the blacksmith, who is turned towards it, ready for the next action.", "Visual effects must include sparks flying from the sword, and a heat-haze shimmer effect around the blade.", "The surrounding workshop must be dimly lit, with the intense light from the forge and hot sword casting strong orange highlights on the blacksmith's face, arms, apron, and the face of the anvil.", "The blacksmith's posture must convey the effort of holding the hot metal, with tense muscles and a focused expression."], "prompt": "Write `svg` code for an image of a blacksmith pulling a glowing orange sword from a forge with a pair of tongs, ready to place it on an anvil.", "id": "write_`svg`_code"} -{"requirements": ["Draw a detailed, fuzzy bumblebee with black and yellow stripes and transparent, veined wings.", "The bumblebee must be positioned on a large pink flower petal, its weight causing the petal to bend downwards slightly.", "Draw the detailed pink flower, showing all its petals, a yellow pistil, and multiple stamens.", "The bee's proboscis (tongue) must be extended and physically touching the pistil in the center of the flower to collect nectar.", "Visible yellow 
pollen grains must be stuck to the bee's fuzzy legs and abdomen.", "The flower's stamens must also be covered in yellow pollen, and the bee's legs must be brushing against them, dislodging a few grains that are falling onto the petal below.", "Use a macro perspective to highlight the interaction between the bee and the flower's reproductive parts.", "Include the top of the flower's green stem and a single green leaf for context.", "Use vibrant, saturated colors for both the flower and the bee to create a lively scene."], "prompt": "Write `svg` code for an image of a bee pollinating a flower, with pollen grains visibly stuck to its legs as it collects nectar.", "id": "write_`svg`_code"} -{"requirements": ["Depict a person in a white lab coat, positioned at a lab bench.", "The scientist's right eye must be pressed to the eyepiece of a microscope.", "The scientist's left hand must be turning a focus knob on the side of the microscope.", "A clear petri dish must be on the microscope's stage, held by stage clips, directly under the objective lens.", "The petri dish must contain a yellow culture medium with several visible white colonies of bacteria.", "The microscope must be clearly drawn with an eyepiece, body, a turret with three objective lenses, a stage, and a light source at the base that is switched on.", "Include a circular inset view in the top-right corner, representing the microscope's view, which shows a magnified, stylized image of the bacteria from one of the colonies in the petri dish.", "A line must connect the inset view to the microscope's eyepiece to clarify the relationship.", "The background must show other laboratory equipment, such as beakers and test tube racks, to establish the setting."], "prompt": "Write `svg` code for an image of a scientist adjusting the focus on a microscope to look at bacteria in a petri dish.", "id": "write_`svg`_code"} -{"requirements": ["The central focus must be a large, fully decorated Christmas tree, topped with a 
glowing yellow star.", "Show a family of three people of different ages actively decorating the tree.", "An adult must be lifting a small child up, so the child can reach out and hang a red bauble ornament on a high branch.", "The third person, a teenager, must be kneeling down to arrange a string of colored lights around the lower part of the tree.", "The tree must be heavily decorated with a variety of ornaments, garlands, and the aforementioned lights, which are visibly plugged into a wall socket.", "The scene is set in a cozy living room with a fireplace in the background, in which a fire is burning.", "Include a pile of colorfully wrapped presents under the tree, some of which are being nudged by the teenager's feet.", "All characters must have joyful, smiling expressions, focused on their shared activity.", "The lighting must be warm and inviting, with the primary light source being the glow from the tree's lights and the fireplace, casting soft shadows in the room."], "prompt": "Write `svg` code for an image of a family decorating a Christmas tree together, with one person placing the star on top while others hang ornaments.", "id": "write_`svg`_code"} -{"requirements": ["Illustrate a large, vertical rock cliff face with texture and deep cracks.", "Show a climber figure halfway up the cliff, with their body twisted towards the rock.", "The climber must be wearing a red harness, a white helmet, and dark climbing shoes.", "The climber is in a dynamic pose: their left foot is on a small foothold, their right hand is gripping a side-pull handhold, and their left arm is fully extended, reaching for a chalk-dusted handhold just out of reach.", "A bright green climbing rope must be attached to the climber's harness via a visible figure-eight knot.", "The rope must run from the climber's harness, up through a quickdraw attached to the rock just above them, and then down in a straight, taut line to the belayer at the bottom of the cliff.", "The belayer must be 
anchored to the base of the cliff, looking up at the climber, with both hands on the rope as it feeds through a belay device attached to their harness.", "The handhold the climber is reaching for should be visibly smaller than the ones they are currently using.", "The background should be a simple blue sky with a few clouds below the climber's position to emphasize the height and exposure."], "prompt": "Write `svg` code for an image of a rock climber halfway up a cliff face, reaching for a handhold while their belayer manages the rope below.", "id": "write_`svg`_code"} -{"requirements": ["Use a cutaway, cross-sectional view to show the internal mechanism of a pin-tumbler lock, including the outer housing and the inner rotating plug.", "A brass key must be fully inserted into the lock's keyway.", "The key's bitting must be clearly visible, with five distinct cuts of varying depths.", "Depict exactly five vertical pin stacks inside the lock cylinder, passing through both the plug and the housing.", "Each stack must consist of a lower 'key pin' and an upper 'driver pin', with the key pins having different lengths corresponding to the key's cuts.", "Show compressed springs above each driver pin, pushing the entire stack downwards.", "The five cuts on the key's bitting must be shown lifting each corresponding key pin, causing the five gaps between the key pins and driver pins to align perfectly with the 'shear line'.", "The shear line (the gap between the inner rotating plug and the outer housing) must be a clearly defined, continuous horizontal line across all five pin channels.", "Include a rotational arrow to show that the key and the now-unobstructed plug are turning clockwise.", "Show a cam on the back of the rotating plug making contact with the lock's bolt mechanism, causing the bolt to retract into the lock housing."], "prompt": "Write `svg` code for a diagram showing a lock-and-key mechanism, with the key inserted and turning the tumblers to align them and 
unlock the bolt.", "id": "write_`svg`_code"} -{"requirements": ["Draw a large, heavy, spherical wrecking ball, slightly flattened on the side making contact with the wall.", "The ball must be attached to a thick, taut steel cable, which leads up and off-screen along a clear swing arc.", "Use motion blur lines that follow the arc of the swing to show the ball is in powerful motion from left to right.", "Depict a red brick wall that the ball is striking.", "The image must capture the exact moment of impact, with the ball embedded slightly into the wall.", "Show a concave crater forming on the wall that matches the curvature of the wrecking ball.", "Illustrate an explosion of debris flying outward from the impact point, with the trajectory of the fragments moving away from the ball's point of contact. The debris must include dust clouds, small brick fragments, and at least three whole bricks.", "The flying brick fragments must be shown frozen in mid-air, with some rotating.", "The rest of the wall must have cracks radiating out from the perimeter of the crater.", "The composition must convey a strong sense of force, with the motion lines of the ball and the trajectory of the debris creating a focused point of action."], "prompt": "Write `svg` code for an image of a wrecking ball in mid-swing, just making contact with a brick wall and sending debris flying.", "id": "write_`svg`_code"} -{"requirements": ["Show a person in a dynamic fishing stance on the grassy bank of a river.", "The fisherman should be holding a fishing rod, with their body twisted as if having just completed a cast.", "The fishing rod must be bent in a slight arc, as if it is un-flexing after being whipped forward.", "A thin fishing line must be shown unspooling from the reel, going through the guides on the rod, and extending from the very tip of the rod.", "The fishing line should be drawn in a long, graceful arc that starts at the rod tip and extends over the water.", "A red and white fishing lure 
must be clearly visible at the end of the line, positioned at the apex of the arc in mid-flight.", "The river should have gentle ripples, with a small disturbance in the water at the bank where the fisherman is standing.", "The background should be a natural outdoor scene with trees on the distant shore and a clear sky, establishing the direction of the cast.", "The fisherman's posture, the recovering bend of the rod, and the arc of the line must all work together to illustrate the single, fluid action of casting."], "prompt": "Write `svg` code for an image of a fisherman casting a line into a river, with the lure flying through the air at the end of the line.", "id": "write_`svg`_code"} -{"requirements": ["Depict a child, identifiable by youthful features, kneeling on wet sand.", "The child should be looking towards an incoming wave with a surprised expression, with one hand raised from their work.", "In front of the child, show a sandcastle with at least two towers and a connecting wall. 
A small plastic shovel should be stuck in the top of one tower.", "Include a large ocean wave, distinct from the calmer water, that is in the process of breaking.", "The white foam from the breaking wave must be shown actively washing over the base of the sandcastle, dissolving the lower part of the walls.", "Show the leading edge of the water and foam beginning to surround the child's knees.", "The setting must be a beach, with the sand around the castle being visibly darker and wet from the approaching water.", "Include a horizon line separating the sea and sky, with the wave rising above it to show its size.", "The motion of the wave, the dissolving sandcastle, and the child's reaction must be clearly linked to convey the moment of destruction."], "prompt": "Write `svg` code for an image of a child building a sandcastle at the beach, just as a wave is beginning to crash and wash it away.", "id": "write_`svg`_code"} -{"requirements": ["Include a person representing a DJ, positioned behind a table with equipment.", "Show two turntables, with a vinyl record on the one closer to the viewer.", "The record on the primary turntable must have radial motion lines to indicate it is spinning.", "One of the DJ's hands must be on the spinning record, with fingers curled, in a pose that suggests they are actively scratching the record back and forth.", "Include a DJ mixer positioned between the two turntables, with visible knobs and faders.", "The DJ's other hand must be shown gripping and moving the main horizontal crossfader on the mixer.", "The DJ should be wearing headphones, but with one earcup pushed back off their ear, allowing them to listen to the room.", "The overall scene should be dimly lit, with a single spotlight illuminating the DJ and their equipment, clearly distinguishing the turntables and the central mixer."], "prompt": "Write `svg` code for an image of a DJ at a turntable, with one hand on a spinning record and the other adjusting a slider on the mixer.", 
"id": "write_`svg`_code"} -{"requirements": ["Depict two figures: a tailor and a customer, central to the image.", "The tailor should be shown standing slightly to the side of the customer, focused intently on their work.", "The customer should be standing straight with arms held slightly out to their sides, wearing a form-fitting white dress shirt.", "A yellow, flexible measuring tape must be wrapped snugly around the customer's chest, under their arms.", "The tailor must be holding the measuring tape where the end overlaps the numbered scale, pinching it with their thumb and forefinger to mark the measurement. The tailor's eyes should be looking down at this exact point on the tape.", "The numbers on the measuring tape should be visible where the tailor is holding it.", "The background must be a tailor's shop, with a large three-way mirror behind the customer, reflecting the back of the customer and the tailor's action.", "The focused gaze of the tailor and the precise hold on the tape must clearly communicate the act of taking a measurement."], "prompt": "Write `svg` code for an image of a tailor taking measurements for a suit, wrapping a measuring tape around a customer's chest.", "id": "write_`svg`_code"} -{"requirements": ["The image must be a network diagram, not a realistic scene.", "Include a single, central circular node clearly labeled 'Server'. 
The server icon itself should appear cracked or fractured.", "Include at least five source nodes positioned in an arc around the server node.", "The source nodes must be styled to look malicious, each containing a skull and crossbones icon and colored dark grey.", "Draw a dense flood of lines representing traffic packets, so numerous that they almost merge into solid beams of light.", "All traffic packet lines must originate from the malicious nodes and converge on the central server, creating a visual bottleneck at the server's edge.", "Use arrows on the lines to indicate the unidirectional flow of data towards the server.", "The central server node must have a prominent circular status indicator on it.", "The server's status indicator must be glowing bright red to signify a critical overload or 'down' state, in stark contrast to the dark attacker nodes.", "The visual effect must be one of the central server being completely overwhelmed, with the incoming lines obscuring parts of the server node itself."], "prompt": "Write `svg` code for a diagram of a computer network under a DDoS attack, showing multiple malicious source nodes flooding a central server node with traffic packets, causing its status indicator to turn red.", "id": "write_`svg`_code"} -{"requirements": ["Depict a person in a stable archer's stance, positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their hand is anchored firmly under their chin, and the string touches their lips.", "Show a longbow that is visibly and deeply bent under the tension of the draw.", "The archer's left arm should be fully extended towards the target, holding the bow steady, with visible tension in the shoulder and arm muscles.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the archer's extended hand.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The 
composition must create a strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance.", "id": "write_`svg`_code"} -{"requirements": ["Depict a person in a stable archer's stance, with their body positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their drawing hand is anchored firmly at the corner of their mouth.", "Show a longbow that is visibly and deeply bent under the tension from the drawn string.", "The archer's other arm must be fully extended towards the target, holding the bow steady.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the bow, with its tip pointing directly at the target.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The composition must create a strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance.", "id": "write_`svg`_code"} -{"requirements": ["Show a person leaning into their work while holding and operating a leaf blower, bracing against its force.", "The leaf blower must be a recognizable shape with a main body, a handle the person is gripping, and a long nozzle aimed directly at a pile of leaves.", "Depict a large, dense pile of leaves on a suburban lawn. 
The side of the pile facing the leaf blower must be visibly caved in from the force of the air.", "The leaves must have a mix of autumn colors (red, orange, yellow, and brown).", "Show a powerful, visible stream of air, represented by transparent white motion lines, emanating from the blower's nozzle and directly hitting the caved-in side of the leaf pile.", "A cloud of individual leaves must be shown being lifted from the pile and propelled through the air, following the path of the air stream away from the nozzle.", "The setting must be a suburban lawn. A patch of grass where the leaves have been blown from must be clear, contrasting with the area still covered by the main pile.", "The distinction between the static, dense pile of leaves and the individual, airborne leaves must be clear and show a direct cause-and-effect relationship with the air stream from the blower."], "prompt": "Write `svg` code for an image of a person using a leaf blower to clear a large pile of autumn leaves from a suburban lawn.", "id": "write_`svg`_code"} -{"requirements": ["Depict a person in a camping environment, kneeling on the ground and leaning forward over a fire pit.", "The person must be holding a dark grey flint in one hand and a steel striker in the other.", "Show the flint and steel positioned directly over a tinder bundle, in the act of being struck together.", "A shower of bright yellow sparks must be visibly emanating from the point of contact, directed downwards towards the tinder.", "Include a tinder bundle made of fine wood shavings and dry grass, placed at the center of a stone fire pit on the ground.", "Show exactly two sparks landing on the top of the tinder bundle.", "Depict a small, bright orange glow and a thin wisp of white smoke rising from the exact spot where the two sparks have landed on the tinder.", "The person's head should be tilted down, with their gaze fixed on the glowing spot on the tinder.", "The setting must include the stone fire pit on dirt 
ground, with the dark silhouettes of several pine trees visible in the background."], "prompt": "Write `svg` code for an image of a camper starting a fire with a flint and steel, with the first sparks just catching on a tinder bundle.", "id": "write_`svg`_code"} -{"requirements": ["The image must be a diagram illustrating the process of nuclear fission with clear labels and arrows.", "Show a small blue circle labeled 'Neutron' with a solid black arrow indicating its trajectory towards a large, purple nucleus.", "The large nucleus must be labeled 'Uranium-235 Nucleus' and be depicted in the process of splitting into two smaller, unequal-sized nuclei.", "The two new nuclei, labeled 'Fission Fragment', must be shown moving in opposite directions away from the point of fission, each with a directional arrow.", "Show exactly three new blue circles, identical to the first and labeled 'Neutron', being ejected from the splitting nucleus.", "Each of the three released neutrons must have its own arrow indicating its outward trajectory, with one pointing towards the edge of the frame to suggest a continuing chain reaction.", "A bright yellow flash must emanate from the center of the splitting Uranium-235 Nucleus.", "Wavy red lines, representing energy, must radiate outwards from the yellow flash, passing between the departing fission fragments and neutrons."], "prompt": "Write `svg` code for an image of a nuclear fission reaction, showing a neutron striking a uranium nucleus, causing it to split into smaller elements and release more neutrons.", "id": "write_`svg`_code"} -{"requirements": ["Depict a person in professional sommelier attire, including a tastevin necklace.", "The sommelier must be holding a dark green wine bottle, tilted so that a thin stream of red wine flows from its mouth directly into the opening of a glass decanter positioned on a table below it.", "With their other hand, the sommelier must hold a single, lit white candle, positioning its yellow flame 
directly behind the neck of the wine bottle.", "The flame from the candle must cast a bright, glowing light through the bottle's neck.", "Inside the bottle, illuminated by the candle's flame, a small collection of dark specks representing sediment must be visible, gathered at the bottle's shoulder, prevented from being poured.", "The stream of wine flowing into the decanter must be clear and free of any sediment.", "The background must depict a wine cellar, with the curved tops of wooden wine barrels and a stone archway visible behind the sommelier."], "prompt": "Write `svg` code for an image of a sommelier pouring a small amount of wine from a bottle into a decanter, holding a candle behind the bottle's neck to check for sediment.", "id": "write_`svg`_code"} -{"requirements": ["Depict a ginger tabby cat lying on its back on a wooden floor in a playful pose.", "The cat must have its front paws actively batting a ball of blue yarn that is positioned directly above its chest.", "The ball of yarn must be partially unraveled, with a long, continuous strand of yarn trailing away from it.", "This loose strand of yarn must be tangled around one of the cat's rear legs before continuing to spread in a chaotic, looping mess across the floor.", "The cat must have wide, focused eyes and a slightly open mouth, indicating playful excitement directed at the yarn.", "The scene must take place on a light-colored wooden floor, with the parallel lines of the floorboards clearly visible beneath the cat and the yarn.", "At least one of the cat's claws must be visible, slightly snagged in the ball of yarn."], "prompt": "Write `svg` code for an image of a cat playfully batting at a ball of yarn, causing it to unravel across a wooden floor.", "id": "write_`svg`_code"} +{"requirements": ["Create a cow with clearly recognizable bovine features, including a body, head, four legs, tail, and udder.", "The cow must have black and white patches for its coloring.", "Add cow ears, eyes, and snout 
for facial recognition.", "Position the cow in a realistic plowing stance, leaning forward as if pulling.", "The cow's hooves must be colored brown, as if covered in soil from the field.", "Include a traditional wooden plow with a visible metal blade/share.", "Depict a wooden yoke across the cow's shoulders, connected to the plow by visible chains.", "The plow's blade must be partially buried in the soil, actively turning over a chunk of earth.", "Show at least three distinct, dark furrows in the soil trailing directly behind the plow.", "The field must be split into a plowed section and an unplowed section, with the cow and plow positioned at the boundary between them.", "The unplowed section of the field must have short green grass, which is visibly being overturned by the plow.", "Add a simple background with a clear horizon line and a blue sky containing a yellow sun."], "prompt": "Write `svg` code to draw an image of a cow plowing a field."} +{"requirements": ["The overall background of the SVG must be white.", "All primary elements (logo, search bar, buttons) must be horizontally centered on the canvas.", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red).", "Place a prominent search bar directly below the Google logo, with a vertical spacing equal to half the height of the logo.", "The search bar must be a rounded rectangle with a light gray border.", "The search bar must contain a gray magnifying glass icon perfectly aligned to the left side, inside the bar.", "The search bar must contain a gray microphone icon perfectly aligned to the right side, inside the bar.", "Place two distinct buttons below the search bar, horizontally centered with the search bar, and with a small, consistent gap between them.", "The left button must be labeled 'Google Search'.", "The right button must be labeled 'I'm Feeling Lucky'.", "Both buttons must have a light gray background, a thin gray border, and dark gray 
text.", "Create a header section at the top right of the canvas, with all its items vertically aligned with each other.", "The header must include text links for 'Gmail' and 'Images'.", "The header must include a 3x3 grid icon (Google Apps launcher) positioned between the 'Images' link and the 'Sign in' button.", "The header must include a prominent 'Sign in' button with a blue background and white text, positioned at the far right of the header."], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com)."} +{"requirements": ["Create an elliptical shape for the top surface of a round dinner table with a dark wood grain texture.", "Include exactly 4 sets of cutlery arranged around the table.", "Each cutlery set must consist of a recognizable fork, knife, and spoon.", "Position the 4 cutlery sets at distinct place settings (at 12, 3, 6, and 9 o'clock positions).", "Include a round dinner plate at each of the 4 place settings.", "The fork of each cutlery set must be placed to the left of its corresponding plate, and the knife and spoon to the right.", "Place exactly 3 main food dishes in the center of the table.", "First dish: A recognizable roasted turkey, golden-brown in color, showing a plump body, with one drumstick clearly carved off and missing.", "The turkey must be presented on its own large serving platter.", "Second dish: A round pizza with visible crust and toppings, cut into slices, with one slice missing from the pizza.", "The missing slice of pizza must be placed on the dinner plate at the 3 o'clock position.", "The missing turkey drumstick must be placed on the dinner plate at the 9 o'clock position.", "Third dish: A serving of at least two tacos with visible folded shells and fillings, presented in a red taco holder.", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap.", "The overall perspective must be slightly isometric."], "prompt": "Write `svg` code for an image of a 
round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos."} +{"requirements": ["Create a central, cylindrical rocket body colored bright blue.", "Add a pointed, red nose cone attached to the top of the rocket body.", "Include exactly three yellow stabilizer fins, symmetrically attached to the base of the rocket body.", "Incorporate a single circular window on the rocket's body.", "Add two red horizontal stripes on the blue rocket body, one positioned above the window and one below it.", "Apply a clean, cartoonish art style with bold black outlines for all rocket parts.", "Include a visible engine nozzle at the bottom of the rocket, between the fins.", "Position the rocket as if it is launching, with its base just above the ground.", "Add a column of stylized orange and yellow flames emerging from the nozzle, which the rocket is standing on."], "prompt": "Write `svg` code for an image of a toy rocket."} +{"requirements": ["Create a classic rubber ducky shape with a distinct body and head, colored bright yellow.", "The duck must have an orange beak and a simple black dot for an eye.", "Draw a white, claw-foot bathtub shape, showing the inside view with a visible rim.", "Fill the lower portion of the bathtub with light blue water.", "Ensure the water line is clearly visible across the duck's body, showing the lower third of the duck submerged.", "Position the duck so it is floating on the water's surface, creating small, concentric circular ripples in the water around its base.", "Depict soap bubbles as clusters of overlapping circles with a slight iridescence, using semi-transparent white, light pink, and light blue fills.", "Place a small cluster of bubbles on top of the duck's head.", "Place a large pile of bubbles against one side of the tub and a few floating on the water's surface around the duck."], "prompt": "Write `svg` code for an image of a rubber ducky floating on top of a soapy bathtub."} +{"requirements": 
["Create a scene set on top of a solid-looking cloudscape that serves as the ground.", "Include a hot air balloon with a large envelope featuring vertical red and white stripes and a brown wicker basket.", "The balloon's basket must be resting firmly on the surface of a large, flat-topped cloud.", "Show ropes connecting the balloon's envelope to the basket.", "Include four human figures styled as a family: two adults and two children.", "Position one adult figure holding a corner of a red and white checkered picnic blanket, while one child figure holds the opposite corner, as if they are spreading it out together on the cloud.", "Place an open picnic basket on a corner of the blanket that is already spread out.", "A thermos, a bunch of grapes, and a sandwich must be visible emerging from the open picnic basket.", "Position the second adult and second child near the landed hot air balloon, with the adult pointing up at the balloon's envelope.", "The background must be a clear blue sky containing a bright yellow sun and two small, distant clouds."], "prompt": "Write `svg` code for an image of a picnic on top of the clouds, where 2 parents and 2 children have landed with a hot air ballon, and are setting up a picnic with a tarp and food items."} +{"requirements": ["Create a red, sporty car with a body, wheels, and windows.", "Draw a distinct circular hoop that is completely surrounded by jagged, irregular flames colored with reds, oranges, and yellows.", "Include a take-off ramp on the left side of the fiery ring and a landing ramp on the right side.", "Position the car in mid-air, with its front half having passed through the ring and its back half still inside the ring.", "The car's rear wheels must be depicted as just having left the edge of the take-off ramp.", "Add a ground surface below the entire jump setup.", "Incorporate gray speed lines trailing behind the car to convey high speed.", "Add orange sparks where the car's tires last touched the take-off ramp."], 
"prompt": "Write `svg` code for an image of a stunt car jumping through a circle of fire."} +{"requirements": ["Create a recognizable grey dolphin with a streamlined body, dorsal fin, and tail fluke.", "Show a water surface below the dolphin with a large splash effect at the point where the dolphin has exited the water.", "Position the dolphin in a dynamic jumping arc, with its entire body in mid-air.", "Draw a circular, multi-colored hula hoop, and position the dolphin so its mid-section is passing through the center of the hoop.", "Include a human trainer's arm and hand extending into the frame.", "The trainer's hand must be holding a small fish by the tail.", "The dolphin's mouth must be depicted as wide open, just about to bite the body of the fish being held by the trainer."], "prompt": "Write `svg` code for an image of a dolphin jumping out of the water and through a hula hoop to bite a fish out of its trainers hand."} +{"requirements": ["Create a standard red wine glass shape with a wide bowl, a slender stem, and a circular base.", "The glass must appear transparent, rendered with a light grey tint and low opacity.", "Add bright white highlights on the rim and along the curved side of the bowl to simulate glass reflection.", "Fill the glass with a deep burgundy colored wine.", "The wine must fill the glass to exactly the halfway point of the bowl's height.", "The top surface of the wine must be a flat ellipse, indicating a level liquid surface.", "The body of the wine must perfectly conform to the curved shape of the inside of the glass bowl.", "A single drop of red wine must be shown running down the outside of the glass bowl, starting from the rim and ending just above the stem."], "prompt": "Write `svg` code for an image of half full glass of red wine."} +{"requirements": ["Create a full-screen background using a modern, abstract macOS wallpaper.", "Add a horizontal, semi-transparent menu bar at the top edge of the screen.", "Place an Apple logo icon in 
the top-left corner of the menu bar, followed by the menu text 'Finder', 'File', 'Edit', and 'View'.", "Add Wi-Fi, battery, and date/time icons to the right side of the menu bar.", "Design a glass-like Dock with rounded corners at the bottom of the screen, hovering slightly above the bottom edge.", "Populate the Dock with icons for Finder, Safari, Mail, and System Settings, with a Trash icon at the far right of the Dock.", "Draw a Finder window as the main foreground element, positioned over the desktop wallpaper.", "The Finder window must have a title bar containing the three 'traffic light' control buttons (red, yellow, green) in the top-left corner.", "The main content area of the Finder window must display several generic folder icons.", "One of the folder icons in the Finder window must be identical to the Finder icon in the Dock.", "Apply a prominent drop shadow to the entire Finder window to make it appear floating above the desktop wallpaper and the Dock."], "prompt": "Write `svg` code for a screenshot of the macOS desktop, with a finder window on top."} +{"requirements": ["Whip must be depicted in a coiled, spiral arrangement on a flat surface.", "Include a distinct, solid brown handle (the stock) with a visible wood grain texture.", "The handle must feature a silver knob or pommel at its base.", "The handle must include a leather wrist loop (keeper) hanging from the pommel.", "The main flexible part of the whip (the thong) must be attached to the handle and made of braided black leather.", "The thong must show a clear taper, starting thicker at the handle and getting progressively thinner towards the tip.", "The coils of the thong must overlap realistically, with the handle and the thickest part of the thong on top of the outer coils.", "Include a 'fall,' which is a thinner, smooth leather piece attached to the end of the main braided thong.", "Show a frayed white 'cracker' or 'popper' at the very tip of the fall.", "Add subtle shading and highlights to 
the coils and handle to give the whip a three-dimensional appearance and a slight leather sheen."], "prompt": "Write `svg` code for an image of a coiled whip."} +{"requirements": ["Create a first-person perspective, as if looking through the player's eyes.", "Include a recognizable CS:GO AK-47 weapon model held by player hands in the bottom-right of the screen.", "The player's hands must be wearing the default Terrorist team gloves.", "Place a green plus-sign crosshair in the exact center of the screen.", "Display a Heads-Up Display (HUD) with game information using a font style that mimics the actual CS:GO interface.", "In the bottom-left of the HUD, show player health as '100' next to a plus icon and armor as '100' next to a shield icon.", "In the bottom-center of the HUD, show the ammunition count as '30 / 90'.", "In the top-left, include a square radar/minimap with a player indicator arrow in the middle.", "In the top-center, display the round timer as '1:45'.", "Above the timer, show the team scores with the Terrorist icon and a score of '5' on the left, and the Counter-Terrorist icon and a score of '3' on the right.", "The background must depict the 'A-long' area from the map Dust II, with the crosshair aimed at the double doors."], "prompt": "Write `svg` code for an screen of a first person view in CS:GO."} +{"requirements": ["Create a main structure for a wooden fruit stall, including a counter.", "Add a red and white striped canopy over the stall, supported by two vertical wooden posts.", "Display exactly four different, recognizable fruits.", "First fruit: A pile of red apples in a wicker basket on the left side of the counter.", "Second fruit: A bunch of yellow bananas placed next to the apples.", "Third fruit: A pile of oranges in another wicker basket on the right side of the counter.", "Fourth fruit: A single, large slice of watermelon resting directly on the counter in the center.", "Include a character representing the stall vendor, a smiling man 
with a mustache, positioned behind the counter and between the baskets.", "Add a small, hanging chalkboard sign from the canopy that reads 'Fresh Fruit'.", "Depict a cobblestone ground surface in front of the stall.", "Include the silhouette of another market stall in the background to suggest a larger market setting."], "prompt": "Write `svg` code for an image of a fruit stall in the market."} +{"requirements": ["Create a wooden barrel with visible vertical planks and two horizontal metal hoops.", "The barrel must be buried in a mound of sand, so only the top half is visible.", "Show the sand mounded up slightly around the visible base of the barrel.", "The barrel must be open at the top and filled with treasure items.", "Show treasure overflowing from the top and spilling down one side of the barrel onto the sand.", "The treasure must include a large pile of shiny gold coins, both inside and outside the barrel.", "Add a variety of colorful gemstones (red rubies, green emeralds, blue sapphires) mixed in with the coins.", "A string of white pearls must be draped over the edge of the barrel, trailing down into the spilled coins.", "A golden goblet must be visible, partially buried in the coins inside the barrel.", "Use bright highlights and glint effects on the coins, gems, and goblet to make them look shiny."], "prompt": "Write `svg` code for an image of half buried barrel of treasure."} +{"requirements": ["Create a vintage-style rotary telephone as the main subject, colored classic black.", "The telephone must have a main body, a handset, and a curly cord connecting them.", "The rotary dial must have ten visible finger holes with the numbers 0-9 arranged in a circle beneath them.", "Place the telephone on a small, dark wooden table with a visible wood grain texture.", "The table must have four visible, tapered legs.", "Position the phone realistically on the tabletop, with the handset resting in its cradle.", "The curly cord must hang down from the handset and 
connect to the main body of the phone.", "Incorporate a distinct shadow cast by the telephone onto the surface of the table.", "Add another shadow on the floor cast by the table itself to create depth."], "prompt": "Write `svg` code for an image of a vintage rotary telephone on a small wooden table."} +{"requirements": ["Draw a knight in a full suit of silver armor with a metallic sheen created using highlights and gradients.", "The knight must be holding a longsword in one hand and a shield in the other.", "The shield must have a coat of arms, such as a lion, depicted on it.", "Depict the knight in a dynamic pose, with the shield raised to block and the sword ready to strike.", "Draw a large, menacing green dragon with scales, large wings, and a spiky tail.", "The dragon must be shown actively breathing a large plume of fire directly towards the knight.", "The fire effect must be colored with bright reds, oranges, and yellows.", "The knight's shield must be positioned to intercept the fire, with the flames shown splashing against it.", "The scene must be set in a dark, rocky cavern, with the dragon's fire being the primary light source.", "The fire must cast a bright orange light on the front of the knight and the cavern walls, creating long, dark shadows behind them."], "prompt": "Write `svg` code for an image of a knight in shining armor fighting a fire-breathing dragon."} +{"requirements": ["Replicate the user interface of the Slack application using the recognizable Slack color scheme (purple sidebar, white main view).", "Include a left sidebar with a list of channels, with the channel '#design-team' highlighted to show it is active.", "The main view must show the message history for the '#design-team' channel.", "Display exactly three distinct messages from different fictional users.", "The first message must be from 'Alice' with a user avatar, a timestamp, and the text 'Here is the latest mockup. 
What do you think?'.", "The second message, below the first, must be from 'Bob' with a different avatar, a later timestamp, and the text 'Looks great! I love the new color palette.'.", "The third message, below Bob's, must be from 'Charlie' with a third avatar, a later timestamp, and the text 'Agreed! Ship it!'.", "Add a thumbs-up emoji reaction from two users on Alice's message.", "Show the 'user is typing...' indicator below the last message, with 'David is typing...' visible.", "Include the message input box at the bottom of the channel view, with placeholder text inside it."], "prompt": "Write `svg` code for a screenshot of a Slack channel with several messages, reactions, and a user typing."} +{"requirements": ["Draw a mound of dirt on a green grass surface to represent the top of an ant hill.", "Create a cutaway view showing the underground cross-section of the hill, featuring a network of tunnels and chambers.", "Depict three distinct types of chambers connected by tunnels.", "The top chamber must be a food storage area, filled with small green leaf fragments and seeds.", "The middle chamber must be a nursery, containing white ant eggs and larvae.", "The bottom chamber must be the queen's chamber, containing a single, large queen ant.", "Populate the tunnels and chambers with numerous small, black ants.", "Show some ants carrying leaf fragments from the entrance to the food storage chamber.", "Show other ants in the nursery tending to the eggs.", "The queen ant must be significantly larger than the other ants and shown laying an egg.", "Use a dark brown color for the packed earth of the chamber walls and a lighter brown for the loose soil inside the tunnels."], "prompt": "Write `svg` code for an image of a cross-section of an ant hill, showing tunnels and chambers with ants."} +{"requirements": ["Draw a school bus from a flat, side-on perspective, colored 'school bus yellow'.", "Include the long, rectangular body of the bus with a series of five evenly 
spaced passenger windows.", "Draw two visible wheels with black tires and silver hubcaps.", "Incorporate the characteristic black horizontal stripes running the length of the bus.", "Include the text 'SCHOOL BUS' in black, capital letters on the side panel between the black stripes.", "Show the driver's door and window at the front of the bus, with a silhouette of a person visible in the driver's seat.", "Add a red, octagonal stop sign attached to the side, fully extended outwards from the bus.", "Include side mirrors at the front and visible red lights at the front and back of the bus body."], "prompt": "Write `svg` code for an image of a classic yellow school bus from a side-on view."} +{"requirements": ["Create a triangular slice of pie on a white plate.", "The pie crust must be a golden-brown color with a texture suggesting it is flaky.", "Design a lattice-style top crust with interwoven strips of pastry, allowing the filling to be seen.", "The pie filling, visible through the lattice and on the cut side, must be red with small, dark red circles to represent cherries.", "Place a scoop of off-white vanilla ice cream directly on top of the pie slice, near the back corner.", "The ice cream scoop must have a slightly irregular, melting shape, with a small puddle forming at its base on the pie.", "The ice cream must have tiny dark specks to indicate vanilla bean.", "A silver fork must be resting on the plate next to the pie slice, with a small piece of cherry filling on its tines."], "prompt": "Write `svg` code for an image of a slice of cherry pie with a lattice crust and a scoop of vanilla ice cream next to it."} +{"requirements": ["Design a robot with a 'friendly' appearance, characterized by rounded shapes and large, circular optic sensors.", "The robot should have a polished chrome metallic texture, with highlights and shadows that give it a 3D feel and reflect the bar's lighting.", "Position the robot behind a sleek, minimalist bar counter.", "One of the 
robot's arms is actively pouring a vibrant, glowing green liquid from a cocktail shaker into a futuristic-looking glass held steady by its other hand.", "The bar setting must look futuristic, with glowing blue neon light strips running along the edges of the counter and the background shelves.", "Include shelves in the background holding uniquely shaped, futuristic bottles, one of which is half-empty and contains the same glowing green liquid as the drink being poured.", "The sleek bar counter must have a reflective surface, showing a partial, distorted reflection of the robot and the neon lights.", "Compose the scene from the perspective of a customer at the bar, with a non-human, metallic hand visible in the foreground, resting on the counter and reaching towards the drink being prepared.", "The robot and the drink preparation process should be the central focus of the image."], "prompt": "Write `svg` code for an image of a friendly robot serving drinks at a futuristic bar."} +{"requirements": ["Place a large, glowing Sun at the center of the diagram, emitting visible rays of light.", "Include all eight planets of the solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune, arranged in the correct order from the Sun.", "The side of each planet facing the Sun must be brightly lit, while the opposite side is in shadow, demonstrating the Sun as the primary light source.", "Draw distinct elliptical paths to represent the orbit of each planet around the Sun.", "Represent the relative size differences between the planets accurately (e.g., Jupiter largest, Earth much smaller than Uranus, etc.).", "Each planet must have its key visual characteristic: Earth's continents, clouds, and its Moon orbiting it; Mars's red color and polar ice cap; Jupiter's Great Red Spot; and Saturn's prominent rings.", "Include a text label for the Sun and for each of the eight planets, connected to its corresponding celestial body with a thin, faint line.", "Draw the 
asteroid belt as a dense field of small rocks in a shared orbit between Mars and Jupiter.", "Use a dark background to represent outer space, populated with small, distant stars, and include a single comet with a visible tail that points away from the Sun."], "prompt": "Write `svg` code for an image of a detailed diagram of the solar system with all the planets orbiting the sun."} +{"requirements": ["Draw a beaver with recognizable features: brown fur, large front teeth, and a flat, paddle-shaped tail, positioned halfway on a dam it is building.", "The beaver should be holding a muddy stick in its paws, actively placing it onto a section of the dam.", "Construct a dam across a river, made of realistically interlocked sticks, branches, and mud.", "Illustrate a clear difference in the water level: the water on the upstream side is high, calm, and deep, forming a pond, while the water on the downstream side is low and shallow, revealing rocks on the riverbed.", "The environment must include a riverbank with a tree that has been partially gnawed through at its base, with a pile of wood chips around it. 
The stick the beaver is holding must match the wood of this tree.", "Include the beaver's lodge, a large mound-like home made of sticks and mud, on the edge of the pond created by the dam.", "The water in the newly formed pond should reflect the sky and the trees on the bank.", "The water flowing over a low point in the dam should be depicted with ripples and lines to indicate movement."], "prompt": "Write `svg` code for an image of a beaver building a dam in a river."} +{"requirements": ["Replicate the user interface of the Visual Studio Code editor using the Dark+ color theme.", "Include the Activity Bar on the far left with the 'Files' icon in an active state.", "Show the Side Bar with a file explorer tree, where a file named `bot_controller.py` is highlighted as active.", "The main editor pane must display the contents of this `bot_controller.py` file, containing a block of Python code.", "The Python code must have correct syntax highlighting for keywords (purple), strings (orange), comments (green), and function names (yellow). 
The code must contain a function with a descriptive comment above it.", "Display line numbers in the gutter to the left of the code.", "Include a blinking text cursor positioned on line 15, column 8, within the body of the function.", "Show editor tabs at the top, with the tab for `bot_controller.py` visually active and matching the highlighted file in the explorer.", "Include the Status Bar at the bottom, showing information that corresponds to the editor's state: the language mode ('Python'), the line and column number of the cursor ('Ln 15, Col 8'), and the active Python interpreter."], "prompt": "Write `svg` code for a screenshot of a VS Code editor with a colorful syntax-highlighted block of Python code."} +{"requirements": ["Draw a tall, upright grandfather clock in a room setting, placed next to a window.", "The clock case, made of dark mahogany with a visible wood grain texture, must consist of a hood with a decorative finial on top, a long trunk, and a base.", "The clock face within the hood must have Roman numerals, with the hour hand pointing directly at 'III' and the minute hand pointing at 'XII' to show the time is 3:00.", "Light from the window must cast a long shadow from the clock onto the floor, consistent with an afternoon sun.", "Show a swinging brass pendulum inside the trunk's glass panel, depicted at the far right of its arc to imply motion.", "Include three hanging brass weights on chains inside the trunk; the rightmost weight must be positioned slightly higher than the other two, as if it has just chimed the hour.", "Use shading and highlights to give the wooden case and metallic pendulum and weights a three-dimensional appearance."], "prompt": "Write `svg` code for an image of a grandfather clock with a swinging pendulum."} +{"requirements": ["Create a scene with a distinct 8-bit, pixelated art style and a limited, bright color palette.", "Design a main character in a side-view, mid-jump, with their head positioned directly beneath a 
floating 'question mark' block, as if about to hit it.", "A single, pixelated coin must be depicted emerging from the top of the 'question mark' block, frozen mid-air, as a result of the character hitting it.", "Include a ground level made of repeating brown square blocks, with a small gap in the ground that the character is currently jumping over.", "All blocks must have a simple 3D effect with shading on one side.", "Design a simple, 8-bit style walking mushroom enemy on the ground level, moving towards the spot where the character will land after their jump.", "Include a simple UI in the top-left corner of the screen, displaying a score ('SCORE: 005000') and a life count (a small pixelated icon of the character's head x 3).", "The background must have simple pixelated clouds and hills, reinforcing the side-scrolling video game perspective."], "prompt": "Write `svg` code for an image of an 8-bit video game level, similar to Super Mario Bros., with a character, blocks, and an enemy."} +{"requirements": ["Draw a large, deep ceramic bowl, viewed from a slightly angled perspective to show all ingredients clearly.", "Fill the bowl with a rich-looking, opaque broth, with highlights to give it a wet, glossy appearance.", "Depict a nest of wavy ramen noodles in the center of the bowl, with a pair of wooden chopsticks lifting a single noodle out of the broth.", "Include two slices of chashu pork with visible layers of meat and fat, with one slice partially submerged in the broth.", "Add a soft-boiled egg (ajitama) cut in half, revealing a bright orange, jammy yolk, nestled against the pork.", "Garnish with a pile of finely chopped green onions on one side of the bowl.", "Include a large, crisp sheet of nori (seaweed) standing upright behind the noodles and a small cluster of menma (bamboo shoots) next to the egg.", "The chopsticks should be resting on the rim of the bowl, with their tips pointing towards the noodles."], "prompt": "Write `svg` code for an image of a 
detailed bowl of Japanese ramen, with noodles, broth, a soft-boiled egg, and chashu pork."} +{"requirements": ["Draw a cat curled into a tight ball, sleeping soundly on a cushioned windowsill, with its body pressed lightly against the base of a potted plant.", "Include a window frame around the scene, viewed from inside a room with a warm and soft color palette.", "Draw vertical streaks and scattered droplets on the window pane to represent heavy rain.", "Create a small, circular patch of condensation on the glass where the cat's warm breath would be, slightly obscuring the view of the rain outside.", "The potted plant on the windowsill must have a single water droplet clinging to the tip of one of its leaves.", "The scene outside the window must be blurred and rendered in muted blues and grays to suggest a cold, overcast day, contrasting with the warm interior light.", "Add a subtle, distorted reflection of the room's interior, including the silhouette of a lamp, on the surface of the window glass."], "prompt": "Write `svg` code for an image of a cat sleeping on a windowsill next to a potted plant, with rain streaking down the window pane."} +{"requirements": ["Draw a Swiss Army knife with its main red casing having a glossy finish.", "The iconic white cross inside a red shield logo must be clearly visible and inlaid into the center of the casing.", "Show several tools extended from the knife's body in a fanned-out arrangement from a visible pivot point.", "The large knife blade must be fully extended.", "A pair of scissors must be included and shown slightly open.", "A corkscrew, a can opener, and a flathead screwdriver must also be included, fanned out at different angles from the blade.", "All tools must have a metallic, silver/gray appearance with sharp, specular highlights to suggest shininess.", "Place the knife on a neutral surface, casting a soft shadow beneath it that follows the shape of the knife and its extended tools."], "prompt": "Write `svg` code 
for an image of a Swiss Army knife with several tools extended."} +{"requirements": ["Draw a large, cone-shaped volcano, silhouetted against a dark night sky.", "The sky must be dark black and contain a crescent moon and scattered stars.", "Show a massive plume of smoke and ash billowing from the crater, where its underside is intensely illuminated with fiery reds and oranges from the eruption below.", "Depict bright, glowing red and orange lava erupting from the crater and being ejected high into the air as pyroclastic debris.", "Illustrate multiple rivers of molten lava flowing down the sides of the volcano, carving glowing paths through the dark rock and pooling at the mountain's base.", "The erupting lava must be the primary light source, casting a dramatic glow on the smoke plume and the slopes of the mountain.", "The stars near the bright smoke plume must be obscured or less visible due to the intense glow of the eruption.", "The contrast between the dark, unlit mountain/sky and the brilliant, glowing lava must be sharp and dramatic."], "prompt": "Write `svg` code for an image of a volcano erupting at night, with lava flowing down its side."} +{"requirements": ["Create a main rectangular board area with a light grey background color.", "Draw exactly three vertical columns with the headers 'To Do', 'In Progress', and 'Done' respectively.", "Populate the 'To Do' column with three rectangular white cards containing placeholder text.", "One card in the 'To Do' column must have a red 'Urgent' label.", "Populate the 'In Progress' column with one rectangular white card.", "The card in the 'In Progress' column must feature two circular user avatars, indicating it is assigned to two people.", "Populate the 'Done' column with two rectangular white cards.", "One card in the 'Done' column must have a green 'Completed' label and a paperclip icon, indicating a finished task with an attachment.", "Illustrate one card being dragged from the 'To Do' column towards the 'In 
Progress' column, positioned between the two columns with a slight rotation and a drop shadow to indicate it is actively being moved.", "Ensure consistent spacing and alignment between all columns and cards.", "Include the main board title 'Project Alpha' at the top of the image."], "prompt": "Write `svg` code for a screenshot of a Trello board with multiple columns and cards."} +{"requirements": ["Draw a cooked sausage (frankfurter) nestled inside a sliced hot dog bun.", "The bun must appear soft and lightly toasted, with its shape conforming to the sausage it holds.", "Add a wavy line of yellow mustard across the top of the sausage.", "Add a wavy line of red ketchup that intertwines and overlaps with the mustard line on top of the sausage.", "Show a small drip of yellow mustard that has fallen from the hot dog onto the paper plate below.", "Place the entire hot dog on a white paper plate with fluted/ridged edges.", "The hot dog must cast a slight shadow onto the surface of the plate to create depth.", "Use shading and highlights on the sausage and bun to give them a three-dimensional, rounded look.", "The condiments must appear as if they are sitting on top of the sausage, following its curved contour."], "prompt": "Write `svg` code for an image of a hot dog with mustard and ketchup in a bun, on a paper plate."} +{"requirements": ["Create an underwater scene with a blue water background that is light cyan at the top and gets progressively darker towards the sea floor.", "Draw a variety of colorful coral formations on the sea floor, including pink brain coral, orange staghorn coral, and purple sea fans.", "Include a large sea turtle as the central element, swimming towards a patch of sea grass near the coral.", "Depict a school of at least five small, yellow tang fish swimming in unison past the turtle.", "Show two orange-and-white striped clownfish peeking out from within the tentacles of a green sea anemone.", "Place a red starfish attached to a rock at the base 
of the coral formations.", "Illustrate rays of light filtering down from the water's surface, casting a dappled light pattern on the sea turtle's shell and the sea floor.", "One clownfish must be partially obscured by the anemone's tentacles.", "The overall composition must be vibrant and dense, with the turtle, fish, and coral overlapping to create a sense of depth."], "prompt": "Write `svg` code for an image of a coral reef teeming with colorful fish and a sea turtle."} +{"requirements": ["Draw a clear glass mason jar, complete with its characteristic screw-top threads and embossed lettering on its side.", "The jar must be transparent, with the marbles inside fully visible and their shapes slightly distorted by the curved glass.", "Add white, curved highlights and reflections on the jar's surface that follow its cylindrical shape.", "Fill the jar almost to the top with numerous overlapping and stacked spherical marbles.", "The marbles must include at least three distinct, visible patterns: solid blue, green and white swirled, and a classic 'cat's eye' with a colored vane inside.", "Place a single 'cat's eye' marble on the surface next to the jar, casting a small shadow.", "The jar itself must cast a faint, transparent shadow that is tinted by the colors of the marbles within it.", "Each marble, both inside and outside the jar, must have a small, sharp white highlight to indicate its glossy surface.", "Include dark contact shadows between the marbles where they touch each other and where they press against the inside of the jar to create a sense of volume and weight."], "prompt": "Write `svg` code for an image of a glass jar filled with colorful marbles."} +{"requirements": ["Depict a large, partially constructed pyramid with visible stone layers and an unfinished, flat top.", "Include a large earthen ramp spiraling up the side of the pyramid, leading to the current construction level.", "Show a group of at least five workers in ancient Egyptian loincloths pulling 
a large stone block up the ramp using thick ropes.", "The stone block must be resting on a wooden sledge.", "Depict another worker walking in front of the sledge, pouring water from a clay jug onto the sand of the ramp to reduce friction.", "On the top level of the pyramid, show two other workers using long wooden levers to pry another stone block into its final position next to an existing one.", "Several unused, rectangular stone blocks must be visible at the base of the ramp in the sand.", "The setting must be a vast desert landscape under a bright, clear blue sky with a harsh sun.", "The workers, the sledge, and the pyramid must cast long, dark shadows on the sand, consistent with the bright sun's position.", "The overall color palette must consist of sandy yellows, stone greys, and sky blues."], "prompt": "Write `svg` code for an image of an ancient Egyptian pyramid under construction, with workers moving large stone blocks."} +{"requirements": ["Draw a single rose flower head in full bloom, with vibrant red petals that overlap in a natural, spiral-like formation.", "Use shading and gradients on the petals to create depth and a velvety texture.", "Place a single, clear droplet of water on the edge of one of the outer petals, showing refraction of the red petal color within it and a sharp highlight.", "Include a long, slender green stem connected to the base of the flower head, with several small, sharp thorns protruding from it.", "Attach exactly two green leaves to the stem, each with serrated edges and visible veins.", "One of the leaves must have a small, irregular hole in it, as if from an insect bite.", "Show a single red petal that has fallen from the flower and is lying on the surface near the base of the stem.", "The rose and the fallen petal must cast a soft shadow on a simple, light-grey background."], "prompt": "Write `svg` code for an image of a single red rose with a long stem and thorns."} +{"requirements": ["Replicate the general layout of the 
YouTube homepage within a rectangular frame with a dark mode theme.", "Include a header section at the top with a dark grey background.", "The header must contain the white YouTube logo (play icon and text), a central dark search bar with a search icon, and user-related icons on the right (create, notifications, profile avatar).", "Include a collapsible sidebar on the left with navigation links and icons (Home, Shorts, Subscriptions, Library), with the 'Home' icon and text highlighted to indicate the current page.", "The main content area must be a grid of at least six video thumbnails.", "Each thumbnail must be a rectangle containing a placeholder image, with a small box in the corner indicating video length (e.g., '10:32').", "Below each thumbnail, include a circular channel avatar, a placeholder for the video title on one line, and the channel name and view count on a second line.", "One of the video thumbnails must show a progress bar at the bottom, indicating it has been partially watched.", "Use the official YouTube color scheme: red (#FF0000) for highlights like the logo and progress bar, and shades of dark grey and white for the UI."], "prompt": "Write `svg` code for a screenshot of the YouTube homepage, showing video thumbnails and a sidebar."} +{"requirements": ["Illustrate a stack of exactly four books, arranged vertically but slightly askew so they don't line up perfectly.", "Depict the books as old and leather-bound, using colors like dark brown, burgundy, and forest green for the covers.", "Show the spines of the books, with the top book's spine featuring raised bands and faded gold-leaf lettering for a title.", "The visible page edges must be a yellowish, aged color, with thin horizontal lines to represent individual pages.", "The book at the bottom of the stack must be larger and thicker than the others, forming a stable base.", "Use subtle textures and scuff marks on the leather covers and corners to indicate wear and tear.", "Render the stack in a 
3D perspective, showing the top cover of the highest book and the side spines and page edges of all four.", "The entire stack must cast a soft shadow on the surface it is resting on."], "prompt": "Write `svg` code for an image of a stack of old, leather-bound books."} +{"requirements": ["Draw a chameleon with its characteristic features: a curled tail, a head crest, and a prominent, independently rotating eye.", "Position the chameleon on a tree branch that extends diagonally across the image.", "The branch must have a rough brown bark texture and several green leaves attached to it.", "Illustrate the camouflage effect by having the chameleon's skin pattern and color actively blending into the branch and leaves it is touching.", "The rear half of the chameleon's body and its back legs, which are on the bark, must mimic the brown, rough texture of the branch.", "The front half of the chameleon's body and its head, which are near the leaves, must mimic the green color and vein patterns of the leaves.", "Show a visible, soft gradient transition on the chameleon's torso where the brown bark pattern blends into the green leaf pattern.", "The chameleon must be in a realistic clinging pose, with its zygodactyl feet gripping the branch firmly."], "prompt": "Write `svg` code for an image of a chameleon on a branch, changing its color to match the leaves."} +{"requirements": ["The background must be a solid, dark 'blueprint blue' color (#000080).", "All lines and text must be white.", "Depict a top-down floor plan of a small house with at least two bedrooms and one bathroom.", "Use thick lines for exterior walls and thinner lines for interior walls.", "Show openings for doors and windows within the walls; doors must be indicated with a line and a quarter-circle arc showing the swing direction into the corresponding room.", "Include labels in all caps for each room: 'KITCHEN', 'BEDROOM 1', 'BEDROOM 2', 'LIVING ROOM', and 'BATH'.", "Add exterior dimension lines with 
measurement annotations (e.g., '30ft') along the outside of the walls.", "Add interior dimension lines to show the size of 'BEDROOM 1'.", "Include schematic outlines of key furniture: a bed and closet in each bedroom, a sofa in the living room, a toilet and shower in the bathroom, and kitchen counters with a sink.", "The kitchen counters must connect to the living room in an open-plan layout.", "Incorporate a title block in the bottom-right corner with text for 'Project Name: 'Small House'', 'Drawing: 'Floor Plan'', and 'Scale: '1/4\" = 1ft''."], "prompt": "Write `svg` code for an image of a detailed architectural blueprint of a small house."} +{"requirements": ["Draw a white bowl.", "Fill the bowl with a mound of spaghetti noodles, depicted with many overlapping, curved yellow lines.", "Cover the spaghetti with a generous amount of red tomato sauce that drips down the sides of the noodle mound.", "Add highlights to the sauce to give it a glossy appearance.", "Place exactly three round, brown meatballs on top of the spaghetti, nestled in the sauce.", "Add a sprinkle of green specks (parsley) over the dish, with some specks also landing on the meatballs and the rim of the bowl.", "Show a metallic fork (grey with highlights) actively twirling a small portion of spaghetti noodles, lifting them slightly from the bowl.", "The twirled noodles on the fork must be coated in sauce and have one of the meatballs caught in the twirl.", "The main mound of spaghetti must show an indentation where the fork has lifted the noodles from."], "prompt": "Write `svg` code for an image of a plate of spaghetti and meatballs, with a fork twirling some noodles."} +{"requirements": ["Design a spaceship with a sleek, futuristic aesthetic, featuring smooth curves and panel lines on its silver, metallic-looking surfaces.", "Show the spaceship with its landing gear deployed and resting on the ground, with dust and small rocks kicked up around the landing struts.", "Include a cockpit window 
through which a faint silhouette of a pilot is visible, engine exhausts that are still glowing faintly red from the landing, and several active glowing blue lights on the hull.", "The setting is a barren alien planet with a surface composed of red soil and scattered purple rocks.", "Depict two moons of different sizes visible in the sky.", "The sky must be a dark purple, with its color reflecting off the silver hull of the spaceship.", "The light from the larger of the two moons must cast a long, dramatic shadow of the spaceship across the red soil, with the smaller purple rocks also casting their own distinct shadows."], "prompt": "Write `svg` code for an image of a sleek, futuristic spaceship landing on a barren alien planet with two moons."} +{"requirements": ["The image must be a close-up (macro) view of a spiderweb.", "Draw the web with a classic orb-weaver structure: radial support lines originating from a center point, and a spiral of thinner capture threads.", "The web's threads should be thin and delicate.", "Scatter multiple small, circular dewdrops along the threads of the web.", "Each dewdrop must be rendered as translucent, showing the web lines behind them as slightly distorted or refracted.", "Add a small, white highlight to each drop, with all highlights consistently placed to indicate a single, low light source from the early morning sun.", "Include a small gnat trapped in one of the web's spiral threads, with the thread pulling taut from the insect's weight.", "Use a soft, blurred, out-of-focus background of green foliage to make the web and dewdrops stand out."], "prompt": "Write `svg` code for an image of a spiderweb with dewdrops on it, seen up close."} +{"requirements": ["Create a rectangular frame representing a smartphone screen.", "Design a clean, modern User Interface (UI) for a weather application.", "At the top, display the current location as 'San Francisco', the current temperature as '68°' in a large font, and the weather description 
as 'Sunny'.", "Include a large, clear sun icon next to the current conditions, matching the 'Sunny' description.", "The UI background must be a light blue gradient, reflecting the current 'Sunny' weather condition.", "Below the current conditions, display a horizontal 5-day forecast section.", "The first day of the forecast (MON) must show a sun icon with temps '72° / 55°'.", "The second day (TUE) must show a sun-and-cloud icon with temps '69° / 54°'.", "The third day (WED) must show a cloud icon with temps '65° / 52°'.", "The fourth and fifth days (THU, FRI) must show rain drop icons with temps '62° / 50°' and '60° / 49°' respectively, showing a clear progression of weather.", "Use a legible, sans-serif font throughout the UI."], "prompt": "Write `svg` code for a screenshot of a weather app UI, showing a 5-day forecast with icons for sun, clouds, and rain."} +{"requirements": ["Draw a three-story brick building with multiple windows on its facade.", "Show visible orange, red, and yellow flames and dark smoke billowing from a second-story window.", "Depict exactly two firefighters in full protective gear (helmet, coat, pants, boots).", "A ladder must extend from a partially visible red fire truck to a third-story window, where one firefighter is positioned, preparing to enter.", "The second firefighter must be on the ground, aiming a fire hose towards the flaming second-story window.", "A thick fire hose must connect the firefighter on the ground to the fire truck.", "Show a powerful stream of water spraying from the hose's nozzle, arcing up and entering the flaming window.", "The scene must be set against a dark night sky, where the orange glow from the fire illuminates the side of the building, both firefighters, the ladder, and the stream of water."], "prompt": "Write `svg` code for an image of a group of firefighters putting out a fire on a multi-story building."} +{"requirements": ["Draw a ceramic-style coffee cup with a handle.", "Place the cup on a matching 
saucer.", "Fill the cup with a two-toned liquid representing cappuccino: a dark brown coffee base and a lighter, creamy foam top.", "Create a distinct heart shape in the center of the foam using the darker coffee color, recognizable as latte art.", "Position the view from a slight angle to clearly display the heart design and the side of the cup.", "Add a small silver spoon resting on the saucer, with its reflection slightly visible on the side of the cup.", "Include subtle wisps of steam rising from the cup, with the heart art slightly distorting the path of the steam rising directly above it."], "prompt": "Write `svg` code for an image of a cup of cappuccino with latte art in the shape of a heart."} +{"requirements": ["Draw a thick, ancient-looking book with a decorative leather cover.", "The book must be open, showing two pages with diagrams and symbols.", "Place the open book on top of an ornate stone pedestal.", "Illustrate several runes floating in the air directly above the open pages.", "The runes must have a visible glow effect, casting a colored light down onto the pages of the book.", "One of the symbols on the book's page must match one of the glowing runes floating above it.", "The book itself must emit a faint glow that illuminates the top surface of the pedestal it rests on.", "Use a dark, atmospheric background of a stone chamber to make the glowing elements stand out."], "prompt": "Write `svg` code for an image of a wizard's spellbook open on a pedestal, with glowing runes floating above it."} +{"requirements": ["Frame the entire image with the opening of a camping tent, creating a first-person perspective from inside, with the tent's fabric, seams, and zipper visible as the frame.", "In the foreground, show the edge of a red sleeping bag and a green backpack, establishing the interior of the tent.", "Outside the tent, depict a lit campfire with visible logs and bright orange and yellow flames.", "The campfire must cast a warm, flickering glow on 
the ground in front of the tent and on the visible parts of the tent's opening.", "The background must be a dark night sky populated with numerous small dots representing stars.", "Include silhouettes of pine trees against the starry sky.", "The perspective must be low, as if lying down inside the tent on the sleeping bag, looking out past the campfire to the sky."], "prompt": "Write `svg` code for an image of the view from inside a tent, looking out at a campfire and a starry night sky."} +{"requirements": ["Draw the main base of the record player with a wood grain finish.", "Include a circular platter on the base, on which a black vinyl record is placed.", "The record must have visible concentric grooves and a red center label.", "Illustrate a tonearm with a headshell and cartridge, positioned so the stylus (needle) is resting within one of the grooves on the record's surface.", "Include control elements: a power knob that is in the 'on' position and a speed selector set to '33' rpm.", "Show an open, transparent dust cover hinged at the back of the base, with a slight reflection of the tonearm visible on its surface.", "Depict small, stylized musical notes floating up from the record to indicate that music is playing."], "prompt": "Write `svg` code for an image of a classic vinyl record player with a record on the turntable."} +{"requirements": ["Illustrate an athletic figure in a dynamic tennis serving pose, wearing a white shirt and blue shorts.", "The player's body must be arched backwards, conveying coiled power.", "One arm must be extended upwards, having just tossed a yellow tennis ball into the air.", "The other arm must be holding a tennis racquet, swung high and captured at the moment just before it strikes the ball.", "The tennis ball must be positioned in the air slightly in front of and above the player, at the peak of the toss, perfectly aligned with the center of the raised racquet.", "Use motion lines trailing the racquet to suggest the high speed 
of its upward swing.", "Depict a portion of a blue tennis court, including the white baseline the player is standing behind, and the net in the background.", "The bright sun must cast a sharp, dynamic shadow of the player and their raised racquet onto the court surface."], "prompt": "Write `svg` code for an image of a tennis player in the middle of a powerful serve."} +{"requirements": ["Create a diagram illustrating the four distinct stages of the butterfly life cycle on a single host plant.", "Stage 1: Show a cluster of small eggs on a green leaf.", "Stage 2: Show a caterpillar actively eating the edge of the same leaf where remnants of the hatched eggs are visible.", "Stage 3: Show a chrysalis (pupa) hanging from a twig directly above the leaf from Stage 2.", "Stage 4: Show a fully formed adult butterfly with patterned wings, positioned next to the now-empty chrysalis casing from which it has emerged.", "Arrange the four stages in a logical circular sequence on the plant.", "Use arrows to connect the stages in the correct order: from the eggs to the caterpillar, from the caterpillar to the chrysalis, and from the chrysalis to the butterfly.", "An arrow must go from the butterfly back towards a fresh leaf on the plant, as if to lay new eggs, visually completing the cycle.", "Each stage must have a clear text label pointing to the relevant part of the plant: 'Eggs', 'Caterpillar', 'Chrysalis', 'Butterfly'."], "prompt": "Write `svg` code for a diagram showing the life cycle of a butterfly, from egg to caterpillar to chrysalis to adult."} +{"requirements": ["Draw a research-style submersible, not a military one, exploring a deep-sea trench.", "The submersible must have a main viewport, external lights, and at least one robotic arm extended towards the seafloor.", "Show bright beams of light emanating from the submersible's lights, directly illuminating an anglerfish in the foreground.", "Create a dark, deep-sea trench environment with rocky walls and a seafloor 
populated with a cluster of glowing tube worms.", "The robotic arm must be positioned as if it is about to collect a rock sample from next to the tube worms.", "Include at least one anglerfish, with its characteristic glowing lure, caught within the submersible's main beam of light.", "The overall scene must be very dark, with light originating only from the submersible, the anglerfish's lure, and other bioluminescent life.", "Depict floating particles in the water to create a sense of depth and murkiness.", "Add several bioluminescent jellyfish floating in the mid-ground between the submersible and the trench wall."], "prompt": "Write `svg` code for an image of a submarine exploring a deep-sea trench with glowing anglerfish nearby."} +{"requirements": ["Create a UI window that resembles a desktop application, with a title bar and window controls (minimize, maximize, close).", "Display a monthly calendar view for 'October 2023' inside the window.", "Include a header with the month and year and navigation arrows. 
The 'next month' arrow must be depicted in a hovered or pressed state.", "Lay out a grid for the days of the month, with headers for the days of the week (e.g., S, M, T, W, T, F, S).", "Populate the grid cells with date numbers for October 2023.", "Fill one date cell with a colored block and the text 'Project Deadline'.", "Fill a separate three-day span of consecutive dates with a single colored block labeled 'Team Offsite'.", "Highlight the 'Project Deadline' date cell with a circular outline to represent it as the 'current day'.", "Add one other event on a different day labeled 'Team Sync'.", "The design should be clean and modern, typical of a calendar app."], "prompt": "Write `svg` code for a screenshot of a calendar application, showing a monthly view with several events scheduled."} +{"requirements": ["Draw a large, medieval-style catapult made of wood with visible wood grain texture and metal fittings.", "The catapult's structure must include a sturdy base frame and a long throwing arm, powered by a large counterweight.", "Depict the catapult in the middle of the launch action: the counterweight must be shown near the bottom of its downward swing, while the throwing arm has just reached the apex of its upward swing.", "A large, round boulder must be shown in mid-air, having just left the catapult's sling, following a clear trajectory.", "Use motion lines to convey the rapid movement of the throwing arm and the flight path of the boulder.", "The scene must be set on a muddy field, with disturbed ground around the catapult's base to suggest the force of the launch.", "Include a stone castle wall in the distant background, positioned as the clear target for the boulder's trajectory."], "prompt": "Write `svg` code for an image of a medieval catapult launching a boulder."} +{"requirements": ["Create a primary circular plate to serve as the base for the food.", "Include several pieces of nigiri sushi, differentiating toppings by color (red for tuna, orange for 
salmon). One piece of salmon nigiri must have a small dab of green wasabi on top.", "Add at least one type of maki roll (sushi roll), showing the outer layer of nori (seaweed) and the cross-section of rice and fillings.", "Include a few slices of sashimi (raw fish without rice), arranged artfully on the plate, with a visible empty space where one piece of nigiri was removed.", "Place a small, shallow bowl on the side, filled with dark brown soy sauce that has ripples on its surface.", "Include a pair of chopsticks, positioned to be actively lifting a piece of tuna nigiri from the plate. The fish on the lifted nigiri must be slightly darkened at the tip, as if it has just been dipped in the soy sauce.", "Include a small mound of green wasabi and a pile of pink pickled ginger (gari) as garnishes on the plate.", "Arrange all elements in a visually appealing composition, focusing on the action of eating.", "The overall image should have a clean and fresh aesthetic."], "prompt": "Write `svg` code for an image of a plate of sushi and sashimi, with chopsticks and a small bowl of soy sauce."} +{"requirements": ["Draw a tall, cylindrical lighthouse tower with red and white horizontal stripes, positioned on top of a dark, jagged, rocky cliff.", "Show the lantern room at the top of the lighthouse with a visible, glowing light source inside.", "Create a powerful beam of light, depicted as a solid, yellow, trapezoidal shape, emanating from the lantern.", "The light beam must cut across the scene and directly illuminate a specific, treacherous-looking rock jutting out of the water.", "Establish a night scene with a dark blue sky, where a faint crescent moon and a few stars are partially visible through thin fog.", "Incorporate a fog effect using semi-transparent white shapes, which is thickest around the base of the cliff and thins out towards the sky.", "Depict dark, churning water with large, white-capped waves shown actively crashing against both the base of the cliff and the 
illuminated rock."], "prompt": "Write `svg` code for an image of a lighthouse on a rocky cliff, with its light beam cutting through a foggy night."} +{"requirements": ["Illustrate two strands twisting around a central axis to form a right-handed double helix with clear 3D perspective.", "The two outer strands must represent the sugar-phosphate backbones, depicted as smooth, continuous helical lines.", "Connect the two backbones with horizontal rungs representing the base pairs.", "Use four distinct colors for the nucleobases: Adenine (e.g., blue), Guanine (e.g., red), Cytosine (e.g., yellow), and Thymine (e.g., green).", "The base pairing must be consistently shown, so that the color for Adenine always pairs with the color for Thymine, and the color for Cytosine always pairs with the color for Guanine.", "Clearly show the major and minor grooves created by the helical twist, and add labels with lines pointing to 'Major Groove' and 'Minor Groove'.", "Add labels with lines pointing to the 'Sugar-phosphate backbone' and a 'Base pair'.", "In one section of the helix, magnify a single C-G pair to explicitly label the 'C' and 'G' on their respective colored shapes to reinforce the pairing rule.", "Maintain a clean, scientifically recognizable, diagrammatic style."], "prompt": "Write `svg` code for an image of a DNA double helix strand."} +{"requirements": ["The scene's focal point must be a multi-layered birthday cake on a table, with several lit candles on top.", "Include a group of at least three stylized people around the table, all wearing colorful, conical party hats.", "One person must be shown leaning forward over the cake, with puffed cheeks, in the act of blowing out the candles. 
Faint motion lines should emanate from their mouth towards the candle flames.", "The other people must be looking at the person blowing out the candles, with expressions of excitement or cheering.", "Add a bunch of colorful balloons with strings floating in the background, with one string shown leading to the hand of one of the people.", "Decorate the scene with festive streamers hanging from the ceiling and confetti scattered on the table around the cake.", "Place a few wrapped gift boxes on the table, one of which is partially unwrapped with ribbon trailing onto the table.", "Use a bright and cheerful color palette to convey a celebratory mood in an indoor party room."], "prompt": "Write `svg` code for an image of a birthday party scene with a cake, balloons, and people wearing party hats."} +{"requirements": ["Draw the open case of a desktop PC tower, showing the internal components from a perspective view.", "Include a large motherboard as the main circuit board, serving as the base for other components.", "Show the CPU socket on the motherboard, covered by a large heatsink and a spinning fan assembly indicated by motion lines.", "Illustrate at least two RAM sticks slotted into the motherboard.", "Include a dedicated GPU card plugged into a PCI-e slot on the motherboard, with its own two cooling fans.", "Depict the PSU in its housing, with a bundle of colored wires extending from it and connecting to the motherboard and GPU.", "Show one rectangular HDD and one flatter SSD, both connected to the motherboard with visible SATA data cables.", "Draw power cables from the PSU connecting to the motherboard's main power connector, the CPU power connector, and the GPU.", "Add clear labels with lines pointing to each major component: 'Motherboard', 'CPU Cooler', 'GPU', 'RAM', 'PSU', 'HDD', and 'SSD'.", "Use a clear, technical diagram style with clean lines to show how the components are interconnected."], "prompt": "Write `svg` code for an image of a detailed diagram of 
the internal components of a desktop computer."} +{"requirements": ["Draw a large, mature oak tree with a thick, textured trunk and wide, spreading branches.", "Construct a rustic wooden treehouse, made of planks, nestled among and structurally supported by the tree's branches.", "The treehouse must have a simple roof, a window with a small flower box on its sill, and a door.", "A rope ladder with wooden rungs must hang from the treehouse entrance down to the ground.", "A tire swing must be shown hanging by a rope from a sturdy, lower branch of the same oak tree.", "A small, red flag must be attached to the peak of the treehouse roof.", "The scene must be set in a green, grassy backyard, with the base of the tree's trunk clearly visible in the grass.", "Use a bright color palette that suggests a sunny day, with a clear blue sky in the background."], "prompt": "Write `svg` code for an image of a treehouse with a rope ladder, nestled in a large oak tree."} +{"requirements": ["Depict a blacksmith character, shown with a strong build and wearing a work apron.", "The blacksmith must be holding a hammer in a raised position, positioned directly above a glowing sword blade on an anvil, as if about to strike.", "Place a classic-shaped, heavy anvil on a wooden stump in front of the blacksmith.", "On the anvil, place a sword blade that is glowing bright orange and yellow to indicate it is heated.", "Show sparks flying upwards from the specific point on the glowing blade where the hammer is about to make contact.", "In the background, include a forge with visible glowing coals and flames, which serves as a primary light source.", "The setting must be a dark, rustic workshop; a pair of tongs must be resting against the anvil's wooden stump.", "The forge and the glowing sword blade must be the only light sources, casting an orange glow on the side of the blacksmith and a bright yellow-orange light on his front and the top of the anvil, creating distinct shadows.", "The anvil 
must be dark and metallic, with its top surface reflecting the bright orange glow from the sword blade."], "prompt": "Write `svg` code for an image of a blacksmith at an anvil, hammering a glowing piece of metal."} +{"requirements": ["Create a multitude of nodes, represented as circles of varying sizes, organized into at least three distinct color-coded clusters (e.g., blue, green, red).", "Connect the nodes with a large number of lines (edges) to show interconnection.", "The graph must be arranged in a force-directed layout, creating a complex, organic, web-like structure.", "Within each cluster, a larger central hub node must be connected via thick lines to its smaller, peripheral nodes.", "Thinner lines must be used to connect the peripheral nodes to each other within the same cluster.", "A few thin, curved lines must bridge the different colored clusters, connecting peripheral nodes from one cluster to another to show cross-cluster interaction.", "The central hub nodes must be the largest in size, with many connections, while peripheral nodes are smaller with fewer connections.", "The curved lines must navigate around other nodes gracefully to avoid a messy appearance."], "prompt": "Write `svg` code for an image of a complex network graph with nodes and interconnected lines."} +{"requirements": ["Draw an old-fashioned steam locomotive as the main subject, viewed from a three-quarter perspective.", "The locomotive must have key features: a smokestack, a cowcatcher, large driving wheels with connecting rods, and a cab for the engineer.", "Show white steam puffing from the smokestack, trailing backward over the top of the first attached train car to indicate motion.", "Attach exactly two passenger cars behind the locomotive.", "The train must be positioned one-third of the way across a detailed wooden trestle bridge.", "The bridge's structure must show the crisscrossing wooden beams and supports of the trestles, and its reflection must be visible in the river 
below.", "The bridge must span a river flowing through a valley.", "The background must feature pine forests on rolling hills leading up to distant, snow-capped mountains.", "The three-quarter perspective must effectively show the length of the train and the scale of the bridge over the river."], "prompt": "Write `svg` code for an image of an old-fashioned steam train crossing a wooden trestle bridge."} +{"requirements": ["Create a rectangular frame representing a phone screen in dark mode, with a dark background and light text/icons.", "Include a large square area for the album art, which must be a graphic of a stylized sun setting over an ocean.", "Below the album art, display the song title 'Ocean Sunset' in a larger, bold font.", "Below the song title, display the artist name 'The Vectors' in a smaller font.", "Create a playback control bar at the bottom containing icon buttons for 'Previous', 'Pause', and 'Next'. The 'Pause' icon must be the most prominent, indicating the song is playing.", "Include a horizontal progress bar (scrubber) above the control buttons.", "The progress bar's handle must be positioned at the one-third mark to indicate the current playback position.", "Display the elapsed time timestamp '1:23' at the left end of the progress bar and the total duration '3:45' at the right end, corresponding to the handle's position.", "On the same line as the progress bar, include a 'Shuffle' icon on the left and a 'Repeat' icon on the right. 
The 'Shuffle' icon must be illuminated to indicate it is active."], "prompt": "Write `svg` code for a screenshot of a music player interface, like Spotify, showing album art and playback controls."} +{"requirements": ["Include a central figure of a chef wearing a traditional uniform (toque, jacket) with an expression of intense concentration.", "The chef must be in a dynamic, mid-action pose, bringing a chef's knife down in a chopping motion.", "The chef's knife must be positioned just above a carrot on a cutting board, at the peak of its downward chop.", "On the cutting board, show a whole carrot with several circular slices already cut and lying to its left.", "Depict at least three small, irregular pieces of the carrot in mid-air to the right of the knife, flying away from the point of impact.", "The setting must be a busy kitchen, with a metal bowl of other chopped vegetables (onions, celery) sitting on the counter next to the cutting board.", "Show a pot on a stove in the background with wavy, semi-transparent lines of steam rising from it.", "The steam must drift upwards and partially obscure a shelf of spices located behind the pot.", "The overall composition must convey a sense of action, with the chef's eyes focused directly on the point where the knife will meet the carrot."], "prompt": "Write `svg` code for an image of a chef in a busy kitchen, mid-chop, sending pieces of a carrot flying off a cutting board, with steam rising from pots in the background."} +{"requirements": ["Draw a scuba diver figure complete with gear: mask, regulator, air tank, and fins.", "The diver must be holding an underwater camera and aiming its lens directly at the eye of an octopus.", "The octopus must be positioned behind a piece of brain coral, with two tentacles wrapped around it and its head and one eye peeking out.", "The octopus's skin texture and color must mimic the bumpy, tan texture of the brain coral it is hiding behind, demonstrating camouflage.", "Create a 
detailed coral reef environment with varied shapes and colors of coral, rock, and a small, brightly colored clownfish swimming near the camouflaged octopus.", "The entire image must have a blue tint to simulate being underwater.", "Include light rays filtering down from the water's surface, illuminating the diver's back and casting a slight shadow over the area where the octopus is hiding.", "Show a continuous stream of bubbles rising from the diver's regulator, moving upwards and passing in front of a section of the background coral."], "prompt": "Write `svg` code for an image of a scuba diver using an underwater camera to take a picture of a shy octopus that is partially camouflaged against a coral reef."} +{"requirements": ["Design a visually complex Rube Goldberg machine with a cobbled-together, DIY aesthetic, using parts like ramps, levers, and scissors.", "A red marble must be shown at the end of a wooden ramp, making contact with one end of a see-saw-like lever.", "The lever must be tilted down on the side the marble has hit, and consequently tilted up on the opposite end.", "The rising end of the lever must be shown pushing a pin out from under a weight.", "The now-unsupported weight must be depicted falling downwards, pulling a cord taut.", "The cord must be attached to the handle of a pair of scissors, pulling the blades closed.", "The scissor blades must be shown halfway closed, with a taut red string positioned between them, moments from being severed.", "The chain of events—marble hitting lever, lever releasing weight, weight pulling cord, cord closing scissors—must be clearly and sequentially illustrated."], "prompt": "Write `svg` code for an image of a complex Rube Goldberg machine in action, where a falling marble has just triggered a lever, which is in the process of releasing a pair of scissors to cut a string."} +{"requirements": ["Depict two people, one young and one old, sitting opposite each other at a table.", "The young person's hand must be 
hovering directly over their white queen piece on an 8x8 chessboard.", "The chess pieces must be arranged in a late-game configuration where the white queen's next move results in checkmate.", "The old person's black king must be shown trapped on the board, with its potential escape squares blocked by other white pieces, such as a rook and a bishop.", "The young person must have a facial expression of confident triumph, with their eyes fixed on the opponent's trapped king.", "The old person must have a facial expression of sudden, defeated realization, with their wide eyes looking at their own trapped king.", "The perspective must be from slightly over the young person's shoulder, focusing attention on their hand, the white queen, and the checkmated black king.", "The queen piece must be clearly identifiable and the target of the player's action."], "prompt": "Write `svg` code for an image of two people, one young and one old, intensely focused on a chess game, where one player's hand is hovering over the queen to make a checkmate move."} +{"requirements": ["Show an alchemist's hands and forearms, wearing dark, rustic sleeves, holding a glass beaker.", "The alchemist must be tilting the beaker, pouring a stream of glowing blue liquid from it.", "The stream of glowing liquid must flow into a large, dark, metallic cauldron that is positioned over a crackling wood fire.", "The cauldron must contain a green potion, and at the point where the blue stream meets the green liquid, there must be a bright flash of white light indicating a reaction.", "The green potion must be bubbling violently, with the bubbling most intense at the point of contact with the blue liquid.", "Plumes of purple smoke must be rising from the cauldron, curling upwards to partially obscure a background shelf filled with glass jars.", "The alchemist's hands and the beaker must be illuminated by the blue glow of the liquid, while the front of the cauldron is illuminated by the orange fire beneath it, 
casting complex shadows on the stone wall behind."], "prompt": "Write `svg` code for an image of an alchemist pouring a glowing blue liquid from a beaker into a cauldron, causing the green potion inside to bubble violently and emit purple smoke."} +{"requirements": ["Draw a recognizable Formula 1 race car, stationary and lifted off the ground on front and rear jacks inside a pit box.", "Show multiple pit crew members in team uniforms in dynamic poses of urgent, precise action around the car.", "At the front-left wheel, depict one crew member removing the old wheel while another stands ready, holding the new wheel.", "At the rear-right wheel, depict a crew member using a pneumatic wheel gun to tighten the nut on a newly fitted wheel, with sparks flying from the gun's impact.", "A crew member on the right side of the car must have a large refueling hose firmly connected to the car's fuel port.", "The driver, wearing a helmet, must be visible in the cockpit with hands on the steering wheel, looking intently towards the pit lane exit.", "A 'lollipop man' crew member must be standing directly in front of the car, holding a sign that indicates 'Brakes On'.", "The scene must be set in a pit lane with appropriate ground markings and a pit wall gantry in the background."], "prompt": "Write `svg` code for an image of a pit crew in a Formula 1 race, simultaneously changing all four tires and refueling the car while the driver waits."} +{"requirements": ["Focus on a close-up view of a barista's hands, with one hand holding a ceramic coffee cup by its handle.", "The other hand must be gripping a stainless steel milk pitcher from the side, with the thumb on the handle for stability.", "The pitcher must be tilted, with its spout positioned directly over the center of the cup.", "Show a thin, controlled stream of white, steamed milk pouring from the pitcher's spout into the cup.", "The cup must contain dark brown liquid representing espresso, with a creamy layer of crema on top.", 
"On the surface of the crema, there must be a detailed latte art pattern in the shape of a fern (rosetta), which is nearly complete.", "The stream of milk must be shown connecting to the top of the rosetta, forming the final, delicate leaf of the fern pattern.", "The composition should be tightly cropped, showing parts of the barista's forearms and apron, to emphasize the action of pouring and the creation of the art."], "prompt": "Write `svg` code for an image of a barista pouring steamed milk from a metal pitcher into a cup of espresso, creating detailed latte art in the shape of a fern."} +{"requirements": ["Use a cutaway style to show the internal workings of a single engine cylinder.", "Include the main components: cylinder wall, piston, connecting rod, a portion of the crankshaft, and the cylinder head.", "The crankshaft must be shown rotating, causing the connecting rod to pull the piston downwards within the cylinder, representing the intake stroke.", "A downward-pointing arrow must be attached to the top of the piston to indicate its direction of motion.", "In the cylinder head, the intake valve must be shown fully open, while the exhaust valve is fully closed.", "Show a carburetor connected to the cylinder's intake port.", "Represent the fuel-air mixture as a blue-colored gas being drawn from the carburetor, flowing past the open intake valve, and filling the expanding space above the descending piston.", "A spark plug must be screwed into the top of the cylinder head, with its electrode visible inside the combustion chamber.", "The image must have the clean, technical look of a diagram with clear outlines and labels for the piston, crankshaft, and intake valve."], "prompt": "Write `svg` code for a cutaway diagram of a car engine where the piston is moving down during the intake stroke, drawing a fuel-air mixture in from a carburetor."} +{"requirements": ["Depict a glassblower's hands and forearms as the primary subject, with one hand wearing a protective 
glove.", "The gloved hand must be holding and rotating a long metal blowpipe.", "At the far end of the blowpipe, show a glowing, red-orange, molten glass bubble.", "The other hand must be holding a shaping tool made of a thick, wet, folded wad of newspaper.", "The newspaper tool must be pressed firmly against the side of the molten glass bubble, creating a visible indentation.", "A thick cloud of steam must be shown billowing up from the exact point of contact where the wet newspaper touches the hot glass.", "The indentation on the glass bubble must directly correspond to the shape of the newspaper tool pressing into it.", "The background must clearly show the glowing orange opening of a furnace (the glory hole), which is the source of light in the scene."], "prompt": "Write `svg` code for an image of a glassblower at the end of a blowpipe, shaping a molten glass bubble with a wet wad of newspaper, causing steam to rise."} +{"requirements": ["The image must be a close-up focusing on a pair of hands and forearms, covered in a texture representing wet, brown clay.", "The hands must be positioned around a lump of clay, with one hand inside the opening and the other shaping the exterior wall.", "The clay must be perfectly centered on the circular head of a potter's wheel, which is surrounded by a splash pan.", "The clay must be formed into the recognizable, in-progress shape of a vase, with a defined base, a swelling body, and a narrowing neck.", "Include concentric circular lines on the clay and the wheel head to indicate a rapid spinning motion.", "Show drips of watery clay slip running down the exterior of the vase and the potter's hands, pooling at the base of the clay on the wheel head.", "The background must be a simple, dark, out-of-focus wall to keep the focus on the hands and the creative process."], "prompt": "Write `svg` code for an image of a potter's hands, covered in clay, shaping a vase on a spinning potter's wheel."} +{"requirements": ["Include a mother 
bird, recognizable by adult plumage, perched on the edge of a nest.", "The nest must be depicted with a woven texture of twigs and grass.", "The nest must be securely situated in the fork of a tree branch, with green leaves framing the scene.", "There must be exactly three baby chicks inside the nest.", "The chicks should appear young and fluffy, with underdeveloped feathers.", "All three chicks must have their beaks wide open, necks stretched, and pointing upwards towards the mother bird in a hungry posture.", "The mother bird must be leaning over, with her beak positioned directly above one of the chick's open beaks.", "A pink worm must be clearly visible, held at its midpoint in the mother's beak, with one end of the worm just entering the chick's beak.", "The scene should be brightly lit to convey a sense of a sunny springtime day."], "prompt": "Write `svg` code for an image of a mother bird at her nest, placing a worm into the wide-open beak of one of her three hungry chicks."} +{"requirements": ["Depict an astronaut in a standard white Extravehicular Mobility Unit (spacesuit) with a golden-tinted helmet visor.", "The astronaut must be attached to the end of a multi-jointed robotic arm via a foot restraint, floating in a zero-gravity pose.", "The robotic arm must be positioned to hold the astronaut steady next to a large solar panel array.", "The astronaut must be holding a specialized repair tool in their gloved hands and actively applying it to a visible tear in the solar panel.", "The solar panel must have a distinct tear, with jagged edges.", "A portion of the truss structure of the International Space Station (ISS) must be visible, serving as the base for the robotic arm.", "The blue and white curve of the Earth must be prominent in the background.", "The background must be the blackness of space with a scattering of stars.", "A coiled safety tether must be clearly visible, with one end clipped to the astronaut's suit and the other end attached to the ISS 
structure."], "prompt": "Write `svg` code for an image of an astronaut on a spacewalk, using a robotic arm to repair a damaged solar panel on the International Space Station, with the Earth visible below."} +{"requirements": ["Focus on the hands of a gardener, one of which is wearing a gardening glove.", "Show a mature branch of a tree, representing the rootstock, which has a clean 'V'-shaped notch cut into it.", "Include a smaller, separate twig (the scion) with several visible buds on it.", "The base of the scion must be cut into a wedge shape that fits perfectly into the rootstock's 'V'-shaped notch.", "Depict the gardener's hands holding the scion firmly in place within the rootstock's notch, ensuring the cambium layers align.", "Show a strip of green grafting tape being wrapped tightly by the gardener's fingers around the union point, holding the two pieces together.", "Show that some grafting wax has already been applied from a small tin to seal the top cut-end of the scion.", "A sharp, clean grafting knife must be visible resting on the rootstock branch next to the graft site.", "The background must show out-of-focus rows of other trees, suggesting an orchard setting."], "prompt": "Write `svg` code for an image of a gardener carefully grafting a branch from an apple tree onto a different rootstock tree, with grafting tape and wax visible."} +{"requirements": ["Depict a monk in a traditional brown, hooded medieval robe, seated at a slanted wooden writing desk.", "The monk's hand must be holding a white feather quill, poised over an open manuscript.", "The tip of the quill must be positioned just above the manuscript page, with a single, dark drop of ink visible on the nib, about to touch the parchment.", "On the desk, there must be an open manuscript page which features a large, ornate, 'illuminated' letter 'I' decorated with gold leaf and intricate vines.", "The quill must be positioned directly after the illuminated letter, ready to write the next character 
on a pre-drawn ruled line.", "The setting must be a stone room, and a distinct beam of dusty light from an arched window must be shown falling across the desk, illuminating the manuscript and the monk's hands.", "Next to the open manuscript, there must be an open inkwell and a small pile of stacked, leather-bound books."], "prompt": "Write `svg` code for an image of a medieval monk in a scriptorium, dipping a quill into an inkwell, about to write on an illuminated manuscript."} +{"requirements": ["Include exactly three children on top of a grassy hill.", "The children must be depicted in active, cooperative poses: one child is holding the kite string reel, a second child is guiding the taut string with their hands, and the third is pointing up excitedly.", "Show a large, elaborate kite in the shape of a green dragon high in the sky.", "The dragon kite must have a long, segmented tail that is flowing and rippling in the wind.", "A single, taut kite string must be visible, connecting from the kite down to the reel held by the first child.", "The setting is the crest of a green, rolling hill.", "The wind must be visually represented by having the children's hair, their loose clothing, the kite's tail, and the blades of grass all blowing in the same direction.", "The blue sky must have several puffy white clouds that appear to be moving quickly.", "All three children's expressions must be joyful and their gaze directed upwards at the kite."], "prompt": "Write `svg` code for an image of a group of children working together to fly a large, elaborate dragon kite on a windy day."} +{"requirements": ["Create a recognizable web browser window with a frame, three tabs (with the Google Drive tab being active), and an address bar showing a 'drive.google.com' URL.", "The content of the browser window must be the Google Drive user interface, showing a grid of folders with names like 'Photos', 'Work', and 'Vacation 2023'.", "A portion of a blurred nature photograph desktop 
background must be visible behind the browser window.", "Include a standard image file icon labeled 'Mountain.jpg' on the desktop area.", "Show a mouse cursor (arrow pointer) positioned over the browser window.", "The cursor must be depicted as 'dragging' the 'Mountain.jpg' file icon; the icon should appear semi-transparent and be located directly beneath the cursor's tip.", "The cursor and the dragged file icon must be positioned directly over the 'Photos' folder within the Google Drive interface.", "The 'Photos' folder must be highlighted with a blue border and a slightly changed background color to indicate it is the active drop zone, a direct result of the cursor's position.", "The overall image must clearly represent the user action of dragging a file from the desktop to a specific cloud storage folder."], "prompt": "Write `svg` code for a screenshot of a user dragging and dropping a file from their desktop into a Google Drive folder in a web browser."} +{"requirements": ["Include a person dressed in a veterinarian's lab coat over blue scrubs.", "The veterinarian should be holding a stethoscope, with the earpieces in their ears and the chest-piece placed on a dog's chest.", "The veterinarian's free hand must be resting reassuringly on the dog's back.", "The dog must be a golden retriever, positioned on a stainless steel veterinary examination table.", "The dog must appear calm, with its head turned towards its owner.", "Include a second person, the dog's owner, standing beside the table.", "The owner's right hand must be extended, holding a visible dog treat, which the dog is sniffing.", "The owner's left hand must be gently stroking the dog's head.", "The setting must be identifiable as a vet's office, with a clean, tiled background and a cabinet with medical supplies visible.", "The overall mood must be calm and caring, emphasized by the physical contact between all three subjects."], "prompt": "Write `svg` code for an image of a veterinarian listening to a 
dog's heartbeat with a stethoscope, while the dog's owner offers it a treat."} +{"requirements": ["Depict a red and black bowling ball at the end of a polished wooden bowling lane.", "The image must capture the exact moment of impact between the ball and the front-most (#1) pin.", "Show the full set of ten bowling pins, with the #1 pin shattering into pieces from the powerful impact.", "A dynamic starburst effect must emanate from the point of impact between the ball and the #1 pin.", "The impact must be shown causing the adjacent #2 and #3 pins to begin tilting backwards, starting a chain reaction.", "The other seven pins must be standing but showing slight vibrations.", "Incorporate sharp speed lines trailing the bowling ball to indicate it was thrown at high speed.", "The perspective must be low and close to the pins, looking down the lane, to heighten the drama.", "Include the pin deck and dark gutters of the bowling lane, with reflections from the polished wood."], "prompt": "Write `svg` code for an image of a bowling ball just as it makes impact with the front pin, sending it flying back into the others."} +{"requirements": ["The image must be a close-up on a bomb-like device composed of a bundle of dynamite sticks wrapped in tape.", "The device must feature a bundle of multi-colored wires (red, blue, green, yellow) leading to a timer.", "A prominent red digital timer must be part of the device, clearly displaying the numbers '0:07'.", "Show a pair of hands wearing thick, black, protective bomb-disposal gloves.", "One hand must be holding a pair of wire cutters, with its blades actively cutting the red wire.", "The red wire must be shown partially severed, with a small white and yellow spark at the point of the cut.", "The other gloved hand must be steadying the bundle, with its fingers separating the red wire from the adjacent blue and green wires.", "The scene must be tense and focused, with a dark, out-of-focus background to isolate the action.", "The 
composition must create a tight focal point on the interaction between the wire cutters, the sparking red wire, and the timer."], "prompt": "Write `svg` code for an image of a bomb disposal expert cutting the red wire on a complex-looking bomb with a timer that reads \"0:07\"."} +{"requirements": ["The image must be a diagrammatic cross-section of a plant, bisected by a horizontal line representing the ground.", "The below-ground section must show a root system spreading into dark brown soil containing blue water particles and brown nutrient particles.", "Blue arrows must originate at the water particles, enter the root tips, and travel up a channel (xylem) in the stem.", "The above-ground section must show the plant's stem, green leaves, and a yellow flower.", "Include a bright yellow sun in the top-left corner.", "Yellow arrows must represent sunlight traveling from the sun and pointing to the surfaces of the leaves.", "Gray arrows labeled 'CO2' must point from the air into the leaves.", "Light blue arrows labeled 'O2' must point from the leaves out into the air.", "The leaves must contain small green dots representing chloroplasts, where the sunlight arrows terminate.", "A second set of orange arrows labeled 'Sugars' must originate in the leaves and travel down a channel (phloem) in the stem towards the roots, showing the distribution of energy from photosynthesis."], "prompt": "Write `svg` code for a cross-section of a plant, showing the roots absorbing water from the soil and the leaves using sunlight for photosynthesis."} +{"requirements": ["Include the head and upper torso of a watchmaker, with a wrinkled brow to show intense concentration.", "A brass watchmaker's loupe must be fitted over the watchmaker's right eye.", "The reflection of the watch mechanism and tweezers must be visible on the lens of the loupe.", "The watchmaker's left hand must be steadying the casing of an open mechanical watch.", "The right hand must be holding a pair of fine-tipped 
tweezers, which are gripping a single, tiny brass watch gear.", "The open mechanical watch must reveal an intricate interior of interlocking silver and brass gears, springs, and red jewel bearings.", "The gear held by the tweezers must be positioned directly above an empty axle in the mechanism, fractions of a millimeter from being seated.", "A focused cone of light from an overhead desk lamp must illuminate the watch, the hands, and the tools.", "The background must be a dark, out-of-focus workshop to draw all attention to the detailed foreground action."], "prompt": "Write `svg` code for an image of a watchmaker using a loupe and fine tweezers to place a tiny gear into the intricate mechanism of a mechanical watch."} +{"requirements": ["Depict a child's face in three-quarter view, with puffed cheeks and puckered lips.", "The child must be holding a plastic bubble wand, with the ring held to their lips.", "A single, large bubble must be emerging from the wand, still connected to the soapy film in the wand's loop.", "The surface of the half-formed bubble must be transparent and show swirling, iridescent, rainbow-like colors.", "The distorted reflection of the child's face must be visible on the surface of the bubble.", "Use varying levels of transparency and opacity to make the bubble look delicate.", "The background must be a bright, sunny day in a grassy field with a clear blue sky.", "Include two fully-formed, iridescent bubbles floating away in the background, with distorted reflections of the sky and grass on their surfaces."], "prompt": "Write `svg` code for an image of a child blowing a bubble with a wand, with the bubble half-formed and showing iridescent reflections."} +{"requirements": ["Illustrate a blacksmith figure with muscular arms, wearing a heavy leather apron over a simple shirt.", "The blacksmith must be holding a long pair of tongs, gripping a sword by its tang (the part that goes into the hilt).", "The entire blade of the sword must be glowing 
a bright orange-yellow, indicating it is white-hot.", "The tip of the glowing sword is just clearing the opening of a brick forge, which is filled with glowing red and orange embers.", "An anvil must be positioned in front of the blacksmith, who is turned towards it, ready for the next action.", "Visual effects must include sparks flying from the sword, and a heat-haze shimmer effect around the blade.", "The surrounding workshop must be dimly lit, with the intense light from the forge and hot sword casting strong orange highlights on the blacksmith's face, arms, apron, and the face of the anvil.", "The blacksmith's posture must convey the effort of holding the hot metal, with tense muscles and a focused expression."], "prompt": "Write `svg` code for an image of a blacksmith pulling a glowing orange sword from a forge with a pair of tongs, ready to place it on an anvil."} +{"requirements": ["Draw a detailed, fuzzy bumblebee with black and yellow stripes and transparent, veined wings.", "The bumblebee must be positioned on a large pink flower petal, its weight causing the petal to bend downwards slightly.", "Draw the detailed pink flower, showing all its petals, a yellow pistil, and multiple stamens.", "The bee's proboscis (tongue) must be extended and physically touching the pistil in the center of the flower to collect nectar.", "Visible yellow pollen grains must be stuck to the bee's fuzzy legs and abdomen.", "The flower's stamens must also be covered in yellow pollen, and the bee's legs must be brushing against them, dislodging a few grains that are falling onto the petal below.", "Use a macro perspective to highlight the interaction between the bee and the flower's reproductive parts.", "Include the top of the flower's green stem and a single green leaf for context.", "Use vibrant, saturated colors for both the flower and the bee to create a lively scene."], "prompt": "Write `svg` code for an image of a bee pollinating a flower, with pollen grains visibly stuck 
to its legs as it collects nectar."} +{"requirements": ["Depict a person in a white lab coat, positioned at a lab bench.", "The scientist's right eye must be pressed to the eyepiece of a microscope.", "The scientist's left hand must be turning a focus knob on the side of the microscope.", "A clear petri dish must be on the microscope's stage, held by stage clips, directly under the objective lens.", "The petri dish must contain a yellow culture medium with several visible white colonies of bacteria.", "The microscope must be clearly drawn with an eyepiece, body, a turret with three objective lenses, a stage, and a light source at the base that is switched on.", "Include a circular inset view in the top-right corner, representing the microscope's view, which shows a magnified, stylized image of the bacteria from one of the colonies in the petri dish.", "A line must connect the inset view to the microscope's eyepiece to clarify the relationship.", "The background must show other laboratory equipment, such as beakers and test tube racks, to establish the setting."], "prompt": "Write `svg` code for an image of a scientist adjusting the focus on a microscope to look at bacteria in a petri dish."} +{"requirements": ["The central focus must be a large, fully decorated Christmas tree, topped with a glowing yellow star.", "Show a family of three people of different ages actively decorating the tree.", "An adult must be lifting a small child up, so the child can reach out and hang a red bauble ornament on a high branch.", "The third person, a teenager, must be kneeling down to arrange a string of colored lights around the lower part of the tree.", "The tree must be heavily decorated with a variety of ornaments, garlands, and the aforementioned lights, which are visibly plugged into a wall socket.", "The scene is set in a cozy living room with a fireplace in the background, in which a fire is burning.", "Include a pile of colorfully wrapped presents under the tree, some of 
which are being nudged by the teenager's feet.", "All characters must have joyful, smiling expressions, focused on their shared activity.", "The lighting must be warm and inviting, with the primary light source being the glow from the tree's lights and the fireplace, casting soft shadows in the room."], "prompt": "Write `svg` code for an image of a family decorating a Christmas tree together, with one person placing the star on top while others hang ornaments."} +{"requirements": ["Illustrate a large, vertical rock cliff face with texture and deep cracks.", "Show a climber figure halfway up the cliff, with their body twisted towards the rock.", "The climber must be wearing a red harness, a white helmet, and dark climbing shoes.", "The climber is in a dynamic pose: their left foot is on a small foothold, their right hand is gripping a side-pull handhold, and their left arm is fully extended, reaching for a chalk-dusted handhold just out of reach.", "A bright green climbing rope must be attached to the climber's harness via a visible figure-eight knot.", "The rope must run from the climber's harness, up through a quickdraw attached to the rock just above them, and then down in a straight, taut line to the belayer at the bottom of the cliff.", "The belayer must be anchored to the base of the cliff, looking up at the climber, with both hands on the rope as it feeds through a belay device attached to their harness.", "The handhold the climber is reaching for should be visibly smaller than the ones they are currently using.", "The background should be a simple blue sky with a few clouds below the climber's position to emphasize the height and exposure."], "prompt": "Write `svg` code for an image of a rock climber halfway up a cliff face, reaching for a handhold while their belayer manages the rope below."} +{"requirements": ["Use a cutaway, cross-sectional view to show the internal mechanism of a pin-tumbler lock, including the outer housing and the inner rotating 
plug.", "A brass key must be fully inserted into the lock's keyway.", "The key's bitting must be clearly visible, with five distinct cuts of varying depths.", "Depict exactly five vertical pin stacks inside the lock cylinder, passing through both the plug and the housing.", "Each stack must consist of a lower 'key pin' and an upper 'driver pin', with the key pins having different lengths corresponding to the key's cuts.", "Show compressed springs above each driver pin, pushing the entire stack downwards.", "The five cuts on the key's bitting must be shown lifting each corresponding key pin, causing the five gaps between the key pins and driver pins to align perfectly with the 'shear line'.", "The shear line (the gap between the inner rotating plug and the outer housing) must be a clearly defined, continuous horizontal line across all five pin channels.", "Include a rotational arrow to show that the key and the now-unobstructed plug are turning clockwise.", "Show a cam on the back of the rotating plug making contact with the lock's bolt mechanism, causing the bolt to retract into the lock housing."], "prompt": "Write `svg` code for a diagram showing a lock-and-key mechanism, with the key inserted and turning the tumblers to align them and unlock the bolt."} +{"requirements": ["Draw a large, heavy, spherical wrecking ball, slightly flattened on the side making contact with the wall.", "The ball must be attached to a thick, taut steel cable, which leads up and off-screen along a clear swing arc.", "Use motion blur lines that follow the arc of the swing to show the ball is in powerful motion from left to right.", "Depict a red brick wall that the ball is striking.", "The image must capture the exact moment of impact, with the ball embedded slightly into the wall.", "Show a concave crater forming on the wall that matches the curvature of the wrecking ball.", "Illustrate an explosion of debris flying outward from the impact point, with the trajectory of the fragments 
moving away from the ball's point of contact. The debris must include dust clouds, small brick fragments, and at least three whole bricks.", "The flying brick fragments must be shown frozen in mid-air, with some rotating.", "The rest of the wall must have cracks radiating out from the perimeter of the crater.", "The composition must convey a strong sense of force, with the motion lines of the ball and the trajectory of the debris creating a focused point of action."], "prompt": "Write `svg` code for an image of a wrecking ball in mid-swing, just making contact with a brick wall and sending debris flying."} +{"requirements": ["Show a person in a dynamic fishing stance on the grassy bank of a river.", "The fisherman should be holding a fishing rod, with their body twisted as if having just completed a cast.", "The fishing rod must be bent in a slight arc, as if it is un-flexing after being whipped forward.", "A thin fishing line must be shown unspooling from the reel, going through the guides on the rod, and extending from the very tip of the rod.", "The fishing line should be drawn in a long, graceful arc that starts at the rod tip and extends over the water.", "A red and white fishing lure must be clearly visible at the end of the line, positioned at the apex of the arc in mid-flight.", "The river should have gentle ripples, with a small disturbance in the water at the bank where the fisherman is standing.", "The background should be a natural outdoor scene with trees on the distant shore and a clear sky, establishing the direction of the cast.", "The fisherman's posture, the recovering bend of the rod, and the arc of the line must all work together to illustrate the single, fluid action of casting."], "prompt": "Write `svg` code for an image of a fisherman casting a line into a river, with the lure flying through the air at the end of the line."} +{"requirements": ["Depict a child, identifiable by youthful features, kneeling on wet sand.", "The child should be 
looking towards an incoming wave with a surprised expression, with one hand raised from their work.", "In front of the child, show a sandcastle with at least two towers and a connecting wall. A small plastic shovel should be stuck in the top of one tower.", "Include a large ocean wave, distinct from the calmer water, that is in the process of breaking.", "The white foam from the breaking wave must be shown actively washing over the base of the sandcastle, dissolving the lower part of the walls.", "Show the leading edge of the water and foam beginning to surround the child's knees.", "The setting must be a beach, with the sand around the castle being visibly darker and wet from the approaching water.", "Include a horizon line separating the sea and sky, with the wave rising above it to show its size.", "The motion of the wave, the dissolving sandcastle, and the child's reaction must be clearly linked to convey the moment of destruction."], "prompt": "Write `svg` code for an image of a child building a sandcastle at the beach, just as a wave is beginning to crash and wash it away."} +{"requirements": ["Include a person representing a DJ, positioned behind a table with equipment.", "Show two turntables, with a vinyl record on the one closer to the viewer.", "The record on the primary turntable must have radial motion lines to indicate it is spinning.", "One of the DJ's hands must be on the spinning record, with fingers curled, in a pose that suggests they are actively scratching the record back and forth.", "Include a DJ mixer positioned between the two turntables, with visible knobs and faders.", "The DJ's other hand must be shown gripping and moving the main horizontal crossfader on the mixer.", "The DJ should be wearing headphones, but with one earcup pushed back off their ear, allowing them to listen to the room.", "The overall scene should be dimly lit, with a single spotlight illuminating the DJ and their equipment, clearly distinguishing the turntables and the 
central mixer."], "prompt": "Write `svg` code for an image of a DJ at a turntable, with one hand on a spinning record and the other adjusting a slider on the mixer."} +{"requirements": ["Depict two figures: a tailor and a customer, central to the image.", "The tailor should be shown standing slightly to the side of the customer, focused intently on their work.", "The customer should be standing straight with arms held slightly out to their sides, wearing a form-fitting white dress shirt.", "A yellow, flexible measuring tape must be wrapped snugly around the customer's chest, under their arms.", "The tailor must be holding the measuring tape where the end overlaps the numbered scale, pinching it with their thumb and forefinger to mark the measurement. The tailor's eyes should be looking down at this exact point on the tape.", "The numbers on the measuring tape should be visible where the tailor is holding it.", "The background must be a tailor's shop, with a large three-way mirror behind the customer, reflecting the back of the customer and the tailor's action.", "The focused gaze of the tailor and the precise hold on the tape must clearly communicate the act of taking a measurement."], "prompt": "Write `svg` code for an image of a tailor taking measurements for a suit, wrapping a measuring tape around a customer's chest."} +{"requirements": ["The image must be a network diagram, not a realistic scene.", "Include a single, central circular node clearly labeled 'Server'. 
The server icon itself should appear cracked or fractured.", "Include at least five source nodes positioned in an arc around the server node.", "The source nodes must be styled to look malicious, each containing a skull and crossbones icon and colored dark grey.", "Draw a dense flood of lines representing traffic packets, so numerous that they almost merge into solid beams of light.", "All traffic packet lines must originate from the malicious nodes and converge on the central server, creating a visual bottleneck at the server's edge.", "Use arrows on the lines to indicate the unidirectional flow of data towards the server.", "The central server node must have a prominent circular status indicator on it.", "The server's status indicator must be glowing bright red to signify a critical overload or 'down' state, in stark contrast to the dark attacker nodes.", "The visual effect must be one of the central server being completely overwhelmed, with the incoming lines obscuring parts of the server node itself."], "prompt": "Write `svg` code for a diagram of a computer network under a DDoS attack, showing multiple malicious source nodes flooding a central server node with traffic packets, causing its status indicator to turn red."} +{"requirements": ["Depict a person in a stable archer's stance, positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their hand is anchored firmly under their chin, and the string touches their lips.", "Show a longbow that is visibly and deeply bent under the tension of the draw.", "The archer's left arm should be fully extended towards the target, holding the bow steady, with visible tension in the shoulder and arm muscles.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the archer's extended hand.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The composition must create a 
strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance."} +{"requirements": ["Depict a person in a stable archer's stance, with their body positioned sideways to the target.", "The archer must be at 'full draw,' with the bowstring pulled back so their drawing hand is anchored firmly at the corner of their mouth.", "Show a longbow that is visibly and deeply bent under the tension from the drawn string.", "The archer's other arm must be fully extended towards the target, holding the bow steady.", "An arrow, with fletching visible, must be nocked on the bowstring and resting on the bow, with its tip pointing directly at the target.", "Include a traditional circular target in the distant background, with concentric colored rings and a yellow bullseye.", "The composition must create a strong, clear line of sight, aligning the archer's dominant eye, the shaft of the arrow, and the bullseye of the distant target.", "The archer's gaze must be intensely focused along this line towards the target."], "prompt": "Write `svg` code for an image of an archer at full draw, aiming an arrow at a target in the distance."} +{"requirements": ["Show a person leaning into their work while holding and operating a leaf blower, bracing against its force.", "The leaf blower must be a recognizable shape with a main body, a handle the person is gripping, and a long nozzle aimed directly at a pile of leaves.", "Depict a large, dense pile of leaves on a suburban lawn. 
The side of the pile facing the leaf blower must be visibly caved in from the force of the air.", "The leaves must have a mix of autumn colors (red, orange, yellow, and brown).", "Show a powerful, visible stream of air, represented by transparent white motion lines, emanating from the blower's nozzle and directly hitting the caved-in side of the leaf pile.", "A cloud of individual leaves must be shown being lifted from the pile and propelled through the air, following the path of the air stream away from the nozzle.", "The setting must be a suburban lawn. A patch of grass where the leaves have been blown from must be clear, contrasting with the area still covered by the main pile.", "The distinction between the static, dense pile of leaves and the individual, airborne leaves must be clear and show a direct cause-and-effect relationship with the air stream from the blower."], "prompt": "Write `svg` code for an image of a person using a leaf blower to clear a large pile of autumn leaves from a suburban lawn."} +{"requirements": ["Depict a person in a camping environment, kneeling on the ground and leaning forward over a fire pit.", "The person must be holding a dark grey flint in one hand and a steel striker in the other.", "Show the flint and steel positioned directly over a tinder bundle, in the act of being struck together.", "A shower of bright yellow sparks must be visibly emanating from the point of contact, directed downwards towards the tinder.", "Include a tinder bundle made of fine wood shavings and dry grass, placed at the center of a stone fire pit on the ground.", "Show exactly two sparks landing on the top of the tinder bundle.", "Depict a small, bright orange glow and a thin wisp of white smoke rising from the exact spot where the two sparks have landed on the tinder.", "The person's head should be tilted down, with their gaze fixed on the glowing spot on the tinder.", "The setting must include the stone fire pit on dirt ground, with the dark 
silhouettes of several pine trees visible in the background."], "prompt": "Write `svg` code for an image of a camper starting a fire with a flint and steel, with the first sparks just catching on a tinder bundle."} +{"requirements": ["The image must be a diagram illustrating the process of nuclear fission with clear labels and arrows.", "Show a small blue circle labeled 'Neutron' with a solid black arrow indicating its trajectory towards a large, purple nucleus.", "The large nucleus must be labeled 'Uranium-235 Nucleus' and be depicted in the process of splitting into two smaller, unequal-sized nuclei.", "The two new nuclei, labeled 'Fission Fragment', must be shown moving in opposite directions away from the point of fission, each with a directional arrow.", "Show exactly three new blue circles, identical to the first and labeled 'Neutron', being ejected from the splitting nucleus.", "Each of the three released neutrons must have its own arrow indicating its outward trajectory, with one pointing towards the edge of the frame to suggest a continuing chain reaction.", "A bright yellow flash must emanate from the center of the splitting Uranium-235 Nucleus.", "Wavy red lines, representing energy, must radiate outwards from the yellow flash, passing between the departing fission fragments and neutrons."], "prompt": "Write `svg` code for an image of a nuclear fission reaction, showing a neutron striking a uranium nucleus, causing it to split into smaller elements and release more neutrons."} +{"requirements": ["Depict a person in professional sommelier attire, including a tastevin necklace.", "The sommelier must be holding a dark green wine bottle, tilted so that a thin stream of red wine flows from its mouth directly into the opening of a glass decanter positioned on a table below it.", "With their other hand, the sommelier must hold a single, lit white candle, positioning its yellow flame directly behind the neck of the wine bottle.", "The flame from the candle must 
cast a bright, glowing light through the bottle's neck.", "Inside the bottle, illuminated by the candle's flame, a small collection of dark specks representing sediment must be visible, gathered at the bottle's shoulder, prevented from being poured.", "The stream of wine flowing into the decanter must be clear and free of any sediment.", "The background must depict a wine cellar, with the curved tops of wooden wine barrels and a stone archway visible behind the sommelier."], "prompt": "Write `svg` code for an image of a sommelier pouring a small amount of wine from a bottle into a decanter, holding a candle behind the bottle's neck to check for sediment."} +{"requirements": ["Depict a ginger tabby cat lying on its back on a wooden floor in a playful pose.", "The cat must have its front paws actively batting a ball of blue yarn that is positioned directly above its chest.", "The ball of yarn must be partially unraveled, with a long, continuous strand of yarn trailing away from it.", "This loose strand of yarn must be tangled around one of the cat's rear legs before continuing to spread in a chaotic, looping mess across the floor.", "The cat must have wide, focused eyes and a slightly open mouth, indicating playful excitement directed at the yarn.", "The scene must take place on a light-colored wooden floor, with the parallel lines of the floorboards clearly visible beneath the cat and the yarn.", "At least one of the cat's claws must be visible, slightly snagged in the ball of yarn."], "prompt": "Write `svg` code for an image of a cat playfully batting at a ball of yarn, causing it to unravel across a wooden floor."} diff --git a/tests/pytest/data/svgbench_sample_dataset.jsonl b/tests/pytest/data/svgbench_sample_dataset.jsonl index d5c38370..40638814 100644 --- a/tests/pytest/data/svgbench_sample_dataset.jsonl +++ b/tests/pytest/data/svgbench_sample_dataset.jsonl @@ -1,3 +1,3 @@ -{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", 
"Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field.", "id": "cow_plowing"} -{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com).", "id": "google_homepage"} -{"requirements": ["Create a 
primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", "The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos.", "id": "dinner_table"} +{"requirements": ["Cow must be clearly recognizable with distinctive bovine features", "Include cow body, head, four legs, tail, and udder", "Add cow ears, eyes, and snout for facial recognition", "Cow should be positioned in a realistic plowing stance (pulling forward)", "Use appropriate cow coloring (black/white patches, brown, or solid color)", "Include a traditional plow with visible blade/share", "Show plow handles extending upward", "Depict connection mechanism between cow and plow (yoke, harness, or chains)", "Plow should appear to be cutting into the soil", "Show ground/soil with visible furrows behind the plow", "Include plowed and unplowed sections of field", "Add 
simple background elements (horizon line, sky)", "Include basic vegetation or crops"], "prompt": "Write `svg` code to draw an image of a cow plowing a field."} +{"requirements": ["The overall background of the SVG must be white", "All primary elements must be horizontally centered on the canvas", "Include the Google logo in the center, using its official multi-color scheme (blue, red, yellow, blue, green, red)", "Place a prominent search bar directly below the Google logo", "The search bar must be a rounded rectangle with a light gray border", "The search bar must contain a gray magnifying glass icon on the left side", "The search bar must contain a gray microphone icon on the right side", "Place two distinct buttons below the search bar", "The left button must be labeled 'Google Search'", "The right button must be labeled 'I'm Feeling Lucky'", "Buttons should have a light gray background, a thin border, and dark gray text", "Create a header section at the top right of the canvas", "The header must include text links for 'Gmail' and 'Images'", "The header must include a 3x3 grid icon (Google Apps launcher)", "The header must include a prominent 'Sign in' button, typically with a blue background and white text"], "prompt": "Write `svg` code for a screenshot of the [Google homepage](https://google.com)."} +{"requirements": ["Create a primary circular or elliptical shape for the top surface of a round dinner table", "The table should have a distinct color or a simple texture like wood grain", "Include exactly 4 sets of cutlery arranged around the table", "Each cutlery set must consist of a recognizable fork, knife, and spoon", "Position the 4 cutlery sets at distinct place settings (e.g., at 12, 3, 6, and 9 o'clock positions)", "Optionally, include a round dinner plate at each of the 4 place settings", "Place exactly 3 main food dishes on the surface of the table", "First dish: A recognizable roasted turkey, golden-brown in color, showing drumsticks and a plump body", 
"The turkey should be presented on its own platter or serving dish", "Second dish: A round pizza, cut into slices, with visible crust and toppings", "Third dish: A serving of tacos (at least two), with visible folded shells and fillings (e.g., lettuce, meat, cheese)", "The tacos should be on a plate or in a holder", "Arrange the three main dishes in the center of the table, ensuring they don't unnaturally overlap", "The overall perspective should be top-down or slightly isometric"], "prompt": "Write `svg` code for an image of a round dinner table with 4 sets of cutlery and 3 dishes on the table, including a turkey, pizza and tacos."} diff --git a/tests/pytest/test_livesvgbench.py b/tests/pytest/test_livesvgbench.py new file mode 100644 index 00000000..f105f8e0 --- /dev/null +++ b/tests/pytest/test_livesvgbench.py @@ -0,0 +1,580 @@ +""" +SVGBench evaluation test for EvalProtocol.io. + +This test evaluates LLM ability to generate SVG code that meets specific visual requirements. +The evaluation process includes: +1. SVG code generation from text prompts +2. SVG to PNG rendering using Selenium +3. LLM judge evaluation of requirement fulfillment +4. Scoring based on fulfilled requirements ratio +""" + +import base64 +import json +import logging +import os +import re +import tempfile +import time +from typing import Any, Dict, List, Optional + +import litellm +from pydantic import BaseModel + +from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message, MetricResult +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor + +logger = logging.getLogger(__name__) + + +class SVGBenchResponse(BaseModel): + reasoning: str + number_of_fulfilled_requirements: int + + +def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: + """ + Convert SVGBench dataset entries to EvaluationRow objects. 
+ + Args: + data: List of dictionaries containing prompt and requirements + + Returns: + List of EvaluationRow objects + """ + rows = [] + + for i, row in enumerate(data): + # Format requirements as numbered list + requirements = "\n".join([f"{i+1}. {req}" for i, req in enumerate(row["requirements"])]) + + # Create the generation prompt following SVGBench format + prompt = f"""{row['prompt']} Wrap the SVG code in an SVG code block following the example below. + +Example: +```svg + + + +``` + +Requirements: +{requirements}""" + + eval_row = EvaluationRow( + messages=[Message(role="user", content=prompt)], + input_metadata=InputMetadata( + row_id=f"row_{i}", + dataset_info={ + "original_prompt": row["prompt"], + "requirements": row["requirements"], + "total_requirements": len(row["requirements"]), + "formatted_prompt": prompt, + }, + ), + ) + + rows.append(eval_row) + + return rows + + +def extract_svg_code(text: str) -> Optional[str]: + """ + Extract SVG code from model response using SVGBench's extraction logic. + + Args: + text: Raw model response text + + Returns: + Extracted SVG code or None if not found + """ + # First try: Look for ```svg code blocks + if "```svg" in text: + svg_parts = text.split("```svg") + if len(svg_parts) > 1: + svg_code = svg_parts[1].split("```")[0].strip() + return svg_code + + # Second try: Look for ... tags + if "" in text: + start = text.find("") + 6 + svg_code = text[start:end].strip() + return svg_code + + return None + + +def render_svg_to_png(svg_code: str, output_path: str) -> bool: + """ + Render SVG code to PNG using Selenium WebDriver. 
+ + Args: + svg_code: Valid SVG code + output_path: Path where PNG should be saved + + Returns: + True if successful, False otherwise + """ + try: + # Check if selenium and webdriver are available + try: + from selenium import webdriver + from selenium.webdriver.chrome.options import Options + from selenium.webdriver.common.by import By + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.support.ui import WebDriverWait + except ImportError: + logger.error("Selenium not available. Install with: pip install selenium") + return False + + # Parse SVG dimensions + width, height = 800, 600 # Default dimensions + + # Try to extract dimensions from SVG + width_match = re.search(r'width="(\d+)"', svg_code) + height_match = re.search(r'height="(\d+)"', svg_code) + viewbox_match = re.search(r'viewBox="[^"]*?(\d+)\s+(\d+)"', svg_code) + + if width_match and height_match: + width, height = int(width_match.group(1)), int(height_match.group(1)) + elif viewbox_match: + width, height = int(viewbox_match.group(1)), int(viewbox_match.group(2)) + + # Create HTML wrapper + html_content = f""" + + + + + + + + {svg_code} + + + """ + + # Set up Chrome options + chrome_options = Options() + chrome_options.add_argument("--headless") + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument(f"--window-size={width+40},{height+40}") + + # Create temporary HTML file + with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f: + f.write(html_content) + html_path = f.name + + try: + # Initialize WebDriver + driver = webdriver.Chrome(options=chrome_options) + driver.get(f"file://{html_path}") + + # Wait for SVG to load + WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "svg"))) + + # Take screenshot + driver.save_screenshot(output_path) + driver.quit() + + return True + + finally: + # Clean 
up temporary file + os.unlink(html_path) + + except Exception as e: + logger.error(f"SVG rendering failed: {e}") + return False + + +def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]: + """ + Use LLM judge to evaluate how many requirements are fulfilled. + Uses GPT-4.1 for vision capabilities to match project's model preferences. (note original repo uses Gemini 2.5 flashs) + + Args: + image_path: Path to rendered PNG image + requirements: List of requirements to evaluate + + Returns: + Dictionary with evaluation results + """ + # Format requirements for evaluation (exactly as in original) + requirements_text = "\n".join([f"{i+1}. {req}" for i, req in enumerate(requirements)]) + + # Create evaluation prompt with JSON response format + evaluate_prompt = f"""Examine the generated image. How many of the following {len(requirements)} requirements were fulfilled? + +Be strict about the requirements and respond ONLY with a JSON object in this exact format: +{{"number_of_fulfilled_requirements": , +"reasoning": }} + +Where is a number between 0 and {len(requirements)}. 
+ +Requirements: +{requirements_text}""" + + # Read and encode image + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode("utf-8") + + # Prepare messages with image + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": evaluate_prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}, + ], + } + ] + + # Use GPT-4.1 for vision capabilities to match project's OpenAI model preference + response = litellm.completion( + model="gpt-4.1", + messages=messages, + temperature=0.0, + response_format={ + "type": "json_schema", + "json_schema": {"name": "SVGBenchResponse", "schema": SVGBenchResponse.model_json_schema()}, + }, + ) + + # Parse response + response_content = response.choices[0].message.content + + # Handle empty response + if not response_content or response_content.strip() == "": + raise ValueError("Empty response from LLM judge") + + result = json.loads(response_content) + + # Validate the result + if "number_of_fulfilled_requirements" in result: + return result + else: + raise ValueError("Missing required field in response") + + +class HumanPreferenceResponse(BaseModel): + """Response structure for human preference evaluation with detailed rubrics.""" + + intent_reasoning: str + intent_matching_score: float # 0-1: Does the content match the intended purpose? + + content_reasoning: str + content_recognizability_score: float # 0-1: Are key elements actually recognizable? + + spatial_reasoning: str + spatial_design_score: float # 0-1: Quality of layout, hierarchy, professional appearance + + ux_reasoning: str + user_experience_score: float # 0-1: Would humans find this usable/appropriate? + + coherence_reasoning: str + visual_coherence_score: float # 0-1: Do all elements work together harmoniously? 
+ + overall_reasoning: str + overall_human_preference_score: float # Weighted combination of above scores + + +def evaluate_with_human_preference_rubrics( + image_path: str, original_prompt: str, requirements: List[str] +) -> Dict[str, Any]: + """ + Evaluate image using human preference rubrics focusing on intent matching, + recognizability, spatial design, and user experience. + + This addresses issues like the Google logo being colored circles instead of actual letterforms. + """ + # Read and encode image + with open(image_path, "rb") as f: + image_data = base64.b64encode(f.read()).decode("utf-8") + + # Create comprehensive evaluation prompt focusing on human preference + evaluate_prompt = f"""You are evaluating an SVG image from a human preference perspective. + +Original Request: {original_prompt} + +Evaluate the image across these 5 key rubrics that matter to humans: + +**1. INTENT MATCHING (Weight: 30%)** +Does the content actually fulfill the intended purpose? Look beyond surface requirements. +- For logos: Are they actually recognizable as the intended brand/text, not just colored shapes? +- For UI: Does it look like a functional interface users would recognize? +- For objects: Would humans identify the main subject correctly? + +**2. CONTENT RECOGNIZABILITY (Weight: 25%)** +Are the key elements genuinely recognizable, not abstract representations? +- Text/logos: Can you read the actual letters/words, or are they just shapes? +- Objects: Are they clearly identifiable with proper features? +- Brands/icons: Do they match what humans would expect to see? + +**3. SPATIAL DESIGN QUALITY (Weight: 20%)** +Professional layout, visual hierarchy, and design principles: +- Visual hierarchy: Do important elements stand out appropriately? +- Layout balance: Is the composition well-balanced and professional? +- Spacing and alignment: Does it follow good design principles? +- Proportions: Are elements sized appropriately relative to each other? + +**4. 
USER EXPERIENCE (Weight: 15%)** +Would humans find this usable and appropriate? +- Functionality: For UI elements, do they look clickable/interactive? +- Clarity: Is the purpose and function immediately clear? +- Accessibility: Is text readable, elements distinguishable? +- Professional appearance: Does it meet basic quality standards? + +**5. VISUAL COHERENCE (Weight: 10%)** +Do all elements work together harmoniously? +- Style consistency: Do elements match in style and quality? +- Color harmony: Do colors work well together? +- Visual flow: Does the eye move through the design naturally? + +**CRITICAL: Be very strict about content that looks like abstract shapes instead of the intended content.** +For example, colored circles arranged in Google colors should score very low for intent matching and recognizability. + +Original Requirements (for context): +{chr(10).join([f"{i+1}. {req}" for i, req in enumerate(requirements)])} + +Respond with JSON in this exact format: +{{ + "intent_matching_score": <0.0-1.0>, + "intent_reasoning": "", + "content_recognizability_score": <0.0-1.0>, + "content_reasoning": "", + "spatial_design_score": <0.0-1.0>, + "spatial_reasoning": "", + "user_experience_score": <0.0-1.0>, + "ux_reasoning": "", + "visual_coherence_score": <0.0-1.0>, + "coherence_reasoning": "", + "overall_human_preference_score": , + "overall_reasoning": "" +}}""" + + # Prepare messages with image + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": evaluate_prompt}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}}, + ], + } + ] + + # Use GPT-4.1 for evaluation + response = litellm.completion( + model="gpt-4.1", + messages=messages, + temperature=0.0, + response_format={ + "type": "json_schema", + "json_schema": {"name": "HumanPreferenceResponse", "schema": HumanPreferenceResponse.model_json_schema()}, + }, + ) + + # Parse response + response_content = response.choices[0].message.content + + if not 
response_content or response_content.strip() == "": + raise ValueError("Empty response from human preference evaluator") + + result = json.loads(response_content) + + # Validate the result has required fields + required_fields = ["intent_matching_score", "content_recognizability_score", "overall_human_preference_score"] + for field in required_fields: + if field not in result: + raise ValueError(f"Missing required field in response: {field}") + + return result + + +@evaluation_test( + input_dataset=["tests/pytest/data/svgbench_dataset.jsonl"], + dataset_adapter=svgbench_to_evaluation_row, + completion_params=[ + {"temperature": 0.0, "model": "gpt-4.1"}, + { + "temperature": 0.8, + "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", + "extra_body": {"reasoning_effort": "high"}, + }, + ], + rollout_processor=SingleTurnRolloutProcessor(), + passed_threshold=0.6, # Higher threshold for combined evaluation + num_runs=1, + mode="pointwise", + max_concurrent_rollouts=50, +) +def test_svg_combined_evaluation(row: EvaluationRow) -> EvaluationRow: + """ + Combined SVG evaluation using both requirement fulfillment and human preference rubrics. + + This runs two evaluations: + 1. Original: Specific requirements per row (listwise) + 2. Human Preference: Universal rubrics for all rows (pointwise) + + Combines results to catch issues like Google logos that are just colored circles. 
+ """ + # Extract dataset info + requirements = row.input_metadata.dataset_info["requirements"] + total_requirements = row.input_metadata.dataset_info["total_requirements"] + original_prompt = row.input_metadata.dataset_info["original_prompt"] + row_id = row.input_metadata.row_id + + # Check if we should save debug files + save_debug_files = os.environ.get("SVGBENCH_SAVE_DEBUG_FILES", "false").lower() == "true" + + # Get model response + if not row.messages or len(row.messages) < 2: + row.evaluation_result = EvaluateResult(score=0.0, reason="No model response found") + return row + + model_response = row.messages[-1].content + + # Extract SVG code + try: + svg_code = extract_svg_code(model_response) + if not svg_code: + raise ValueError("No valid SVG code found in response") + except Exception as e: + logger.error(f"Error extracting SVG code for question {row_id}: {e}") + row.evaluation_result = EvaluateResult(score=0.0, reason=f"SVG extraction failed: {str(e)}") + return row + + # Setup file paths + if save_debug_files: + model = row.input_metadata.completion_params["model"] + safe_model_name = model.replace("/", "_").replace(":", "_") + debug_dir = "svgbench_debug_combined" + os.makedirs(debug_dir, exist_ok=True) + png_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.png") + svg_path = os.path.join(debug_dir, f"question_{row_id}_{safe_model_name}.svg") + with open(svg_path, "w") as f: + f.write(svg_code) + else: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + png_path = f.name + + try: + # Render SVG to PNG + if not render_svg_to_png(svg_code, png_path): + row.evaluation_result = EvaluateResult(score=0.0, reason="Failed to render SVG to PNG") + return row + + # Run BOTH evaluations + + # 1. 
Original requirements-based evaluation (listwise - different per row) + requirements_result = evaluate_with_llm_judge(png_path, requirements) + fulfilled_count = requirements_result.get("number_of_fulfilled_requirements", 0) + fulfilled_count = max(0, min(fulfilled_count, total_requirements)) + requirements_score = fulfilled_count / total_requirements + + # 2. Human preference evaluation (pointwise - same rubrics for all rows) + human_pref_result = evaluate_with_human_preference_rubrics(png_path, original_prompt, requirements) + human_pref_score = human_pref_result.get("overall_human_preference_score", 0.0) + + # Combine scores (you can adjust the weighting) + combined_score = (requirements_score * 0.3) + (human_pref_score * 0.7) # Emphasize human preference + + # Create comprehensive reasoning showing both evaluations + combined_reasoning = f"""COMBINED EVALUATION (Requirements 30% + Human Preference 70%): + +=== REQUIREMENTS EVALUATION (Listwise - Row-Specific) === +Score: {requirements_score:.3f} +{requirements_result.get('reasoning', 'No reasoning provided')} + +=== HUMAN PREFERENCE EVALUATION (Pointwise - Universal Rubrics) === +Score: {human_pref_score:.3f} + +🎯 Intent Matching: {human_pref_result.get('intent_matching_score', 0.0):.2f}/1.0 +{human_pref_result.get('intent_reasoning', 'No reasoning provided')} + +👁️ Content Recognizability: {human_pref_result.get('content_recognizability_score', 0.0):.2f}/1.0 +{human_pref_result.get('content_reasoning', 'No reasoning provided')} + +📐 Spatial Design Quality: {human_pref_result.get('spatial_design_score', 0.0):.2f}/1.0 +{human_pref_result.get('spatial_reasoning', 'No reasoning provided')} + +👤 User Experience: {human_pref_result.get('user_experience_score', 0.0):.2f}/1.0 +{human_pref_result.get('ux_reasoning', 'No reasoning provided')} + +🎨 Visual Coherence: {human_pref_result.get('visual_coherence_score', 0.0):.2f}/1.0 +{human_pref_result.get('coherence_reasoning', 'No reasoning provided')} + 
+{human_pref_result.get('overall_reasoning', 'No overall reasoning provided')} + +=== FINAL COMBINED SCORE === +Requirements: {requirements_score:.3f} × 30% = {requirements_score * 0.3:.3f} +Human Preference: {human_pref_score:.3f} × 70% = {human_pref_score * 0.7:.3f} +Combined: {combined_score:.3f} + +The human preference evaluation helps catch issues like unrecognizable content that meets technical requirements.""" + + # Store individual scores in metrics for analysis + metrics = { + "original_requirements_score": MetricResult( + score=requirements_score, + reason=f"Requirements fulfillment: {fulfilled_count}/{total_requirements} requirements met", + is_score_valid=True, + ), + "overall_human_preference_score": MetricResult( + score=human_pref_score, + reason=human_pref_result.get("overall_reasoning", "Human preference evaluation"), + is_score_valid=True, + ), + "intent_matching_score": MetricResult( + score=human_pref_result.get("intent_matching_score", 0.0), + reason=human_pref_result.get("intent_reasoning", "Intent matching evaluation"), + is_score_valid=True, + ), + "content_recognizability_score": MetricResult( + score=human_pref_result.get("content_recognizability_score", 0.0), + reason=human_pref_result.get("content_reasoning", "Content recognizability evaluation"), + is_score_valid=True, + ), + "spatial_design_score": MetricResult( + score=human_pref_result.get("spatial_design_score", 0.0), + reason=human_pref_result.get("spatial_reasoning", "Spatial design evaluation"), + is_score_valid=True, + ), + "user_experience_score": MetricResult( + score=human_pref_result.get("user_experience_score", 0.0), + reason=human_pref_result.get("ux_reasoning", "User experience evaluation"), + is_score_valid=True, + ), + "visual_coherence_score": MetricResult( + score=human_pref_result.get("visual_coherence_score", 0.0), + reason=human_pref_result.get("coherence_reasoning", "Visual coherence evaluation"), + is_score_valid=True, + ), + } + + row.evaluation_result = 
EvaluateResult(score=combined_score, reason=combined_reasoning, metrics=metrics) + + return row + + except Exception as e: + logger.error(f"Combined evaluation failed for question {row_id}: {e}") + row.evaluation_result = EvaluateResult(score=0.0, reason=f"Evaluation error: {str(e)}") + return row + + finally: + # Clean up temporary PNG file (only if not saving debug files) + if not save_debug_files: + try: + if os.path.exists(png_path): + os.unlink(png_path) + except Exception: + pass diff --git a/tests/pytest/test_svgbench.py b/tests/pytest/test_svgbench.py index 364db365..7ff08642 100644 --- a/tests/pytest/test_svgbench.py +++ b/tests/pytest/test_svgbench.py @@ -22,12 +22,13 @@ from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.default_single_turn_rollout_process import default_single_turn_rollout_processor +from eval_protocol.pytest.default_single_turn_rollout_process import SingleTurnRolloutProcessor logger = logging.getLogger(__name__) class SVGBenchResponse(BaseModel): + reasoning: str number_of_fulfilled_requirements: int @@ -43,7 +44,7 @@ def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow """ rows = [] - for row in data: + for i, row in enumerate(data): # Format requirements as numbered list requirements = "\n".join([f"{i+1}. 
{req}" for i, req in enumerate(row["requirements"])]) @@ -63,7 +64,7 @@ def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow eval_row = EvaluationRow( messages=[Message(role="user", content=prompt)], input_metadata=InputMetadata( - row_id=row["id"], + row_id=f"row_{i}", dataset_info={ "original_prompt": row["prompt"], "requirements": row["requirements"], @@ -197,7 +198,7 @@ def render_svg_to_png(svg_code: str, output_path: str) -> bool: def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[str, Any]: """ Use LLM judge to evaluate how many requirements are fulfilled. - Uses GPT-4o for vision capabilities to match project's model preferences. (note original repo uses Gemini 2.5 flashs) + Uses GPT-4.1 for vision capabilities to match project's model preferences. (note original repo uses Gemini 2.5 flashs) Args: image_path: Path to rendered PNG image @@ -213,7 +214,8 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[st evaluate_prompt = f"""Examine the generated image. How many of the following {len(requirements)} requirements were fulfilled? Be strict about the requirements and respond ONLY with a JSON object in this exact format: -{{"number_of_fulfilled_requirements": }} +{{"number_of_fulfilled_requirements": , +"reasoning": }} Where is a number between 0 and {len(requirements)}. 
@@ -235,12 +237,11 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[st } ] - # Use GPT-4o for vision capabilities to match project's OpenAI model preference + # Use GPT-4.1 for vision capabilities to match project's OpenAI model preference response = litellm.completion( - model="gpt-4o", + model="gpt-4.1", messages=messages, temperature=0.0, - max_tokens=200, response_format={ "type": "json_schema", "json_schema": {"name": "SVGBenchResponse", "schema": SVGBenchResponse.model_json_schema()}, @@ -267,18 +268,18 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[st input_dataset=["tests/pytest/data/svgbench_dataset.jsonl"], dataset_adapter=svgbench_to_evaluation_row, completion_params=[ - {"temperature": 0.0, "max_tokens": 4096, "model": "gpt-4.1"}, + {"temperature": 0.0, "model": "gpt-4.1"}, { "temperature": 0.8, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "extra_body": {"reasoning_effort": "high"}, }, ], - rollout_processor=default_single_turn_rollout_processor, + rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=0.5, # 50% average score to pass num_runs=1, mode="pointwise", - max_concurrent_rollouts=3, + max_concurrent_rollouts=50, ) def test_svg_generation_evaluation(row: EvaluationRow) -> EvaluationRow: """ @@ -359,7 +360,7 @@ def test_svg_generation_evaluation(row: EvaluationRow) -> EvaluationRow: row.evaluation_result = EvaluateResult( score=score, - reason=f"Fulfilled {fulfilled_count}/{total_requirements} requirements ({score:.1%}) for prompt: '{original_prompt}'", + reason=judge_result.get("reasoning", ""), ) return row diff --git a/tests/test_retry_mechanism.py b/tests/test_retry_mechanism.py index a483f0e1..f00be4fc 100644 --- a/tests/test_retry_mechanism.py +++ b/tests/test_retry_mechanism.py @@ -7,7 +7,7 @@ import os from collections import Counter from typing import List -from unittest.mock import Mock +from unittest.mock import Mock, patch import pytest @@ 
-16,8 +16,6 @@ from eval_protocol.pytest.rollout_processor import RolloutProcessor from eval_protocol.pytest.types import RolloutProcessorConfig -os.environ["EP_MAX_RETRY"] = "2" # Allow up to 2 retries - class MockRolloutProcessorWithRetries(RolloutProcessor): """Mock rollout processor that fails second task alphabetically on first attempt, succeeds on retry""" @@ -78,6 +76,7 @@ async def process_single_row( shared_processor = MockRolloutProcessorWithRetries() +@patch.dict(os.environ, {"EP_MAX_RETRY": "2"}) @evaluation_test( completion_params=[{"model": "gpt-4o-mini", "temperature": 0}], input_messages=[ @@ -104,6 +103,7 @@ def test_retry_mechanism(row: EvaluationRow) -> EvaluationRow: return row +@patch.dict(os.environ, {"EP_MAX_RETRY": "2"}) def test_retry_mechanism_mock_verification(): """Test that verifies the retry mechanism worked by checking the mock calls""" # Get our mock tracker From 44654a5af55e755692a17b76d7b10b8b170a7175 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Sat, 16 Aug 2025 12:38:51 -0700 Subject: [PATCH 21/26] chart export (#87) * works * vite build / fix warnings * don't show totals / fix warnings / vite build * styling * no black border / vite build --- vite-app/CHART_EXPORT_README.md | 139 ++++++++++ vite-app/dist/assets/index-CbFeqRvW.js | 131 +++++++++ vite-app/dist/assets/index-CbFeqRvW.js.map | 1 + vite-app/dist/assets/index-D1ErODUS.js | 93 ------- vite-app/dist/assets/index-D1ErODUS.js.map | 1 - vite-app/dist/assets/index-D5KxcfFQ.css | 1 - vite-app/dist/assets/index-SA8VJz3D.css | 1 + vite-app/dist/index.html | 4 +- vite-app/package.json | 3 + vite-app/pnpm-lock.yaml | 69 +++++ vite-app/src/components/ChartExport.tsx | 304 +++++++++++++++++++++ vite-app/src/components/PivotTab.tsx | 144 ++++------ vite-app/src/components/PivotTable.tsx | 1 - vite-app/src/hooks/usePivotData.ts | 87 ++++++ vite-app/src/util/field-processors.ts | 121 ++++++++ 15 files changed, 915 insertions(+), 185 deletions(-) create mode 100644 
vite-app/CHART_EXPORT_README.md create mode 100644 vite-app/dist/assets/index-CbFeqRvW.js create mode 100644 vite-app/dist/assets/index-CbFeqRvW.js.map delete mode 100644 vite-app/dist/assets/index-D1ErODUS.js delete mode 100644 vite-app/dist/assets/index-D1ErODUS.js.map delete mode 100644 vite-app/dist/assets/index-D5KxcfFQ.css create mode 100644 vite-app/dist/assets/index-SA8VJz3D.css create mode 100644 vite-app/src/components/ChartExport.tsx create mode 100644 vite-app/src/hooks/usePivotData.ts create mode 100644 vite-app/src/util/field-processors.ts diff --git a/vite-app/CHART_EXPORT_README.md b/vite-app/CHART_EXPORT_README.md new file mode 100644 index 00000000..15d4fea7 --- /dev/null +++ b/vite-app/CHART_EXPORT_README.md @@ -0,0 +1,139 @@ +# Chart Export Functionality + +This document describes the new Chart Export feature that allows users to export pivot table data as interactive charts and save them as high-resolution PNG images. + +## Overview + +The Chart Export component (`ChartExport.tsx`) integrates with Chart.js to provide visualization capabilities for pivot table data. Users can: + +- Choose from multiple chart types (Bar, Line, Doughnut, Pie) +- View real-time chart updates as pivot table configuration changes +- Export charts as high-resolution PNG images +- Customize chart appearance and data representation + +## Features + +### Chart Types + +1. **Bar Chart**: Best for comparing values across categories +2. **Line Chart**: Ideal for showing trends over time or sequences +3. **Doughnut Chart**: Good for showing proportions of a whole +4. 
**Pie Chart**: Like the doughnut chart, but drawn as a full circle with no center cutout + +### Data Visualization + +- **Row-based grouping**: Row fields become chart labels +- **Column-based datasets**: Each column field combination becomes a separate dataset +- **Totals integration**: Row totals can be included as an additional dataset +- **Dynamic coloring**: Automatic color generation for different datasets +- **Responsive design**: Charts adapt to container size + +### Export Capabilities + +- **High-resolution output**: 2x scale for crisp images +- **PNG format**: Lossless image format suitable for presentations and reports +- **Automatic naming**: Files include chart type and timestamp +- **Background handling**: Clean white background for professional appearance + +## Technical Implementation + +### Dependencies + +- `chart.js` (v4.5.0): Core charting library +- `react-chartjs-2` (v5.3.0): React wrapper for Chart.js +- `html2canvas` (v1.4.1): HTML to canvas conversion for image export + +### Component Structure + +```tsx + +``` + +### Data Flow + +1. **Pivot Data**: Raw pivot table computation results +2. **Chart Conversion**: Data transformation for Chart.js format +3. **Rendering**: Chart display using react-chartjs-2 +4. **Export**: HTML to canvas conversion and PNG download + +## Usage + +### Basic Setup + +1. Ensure the pivot table has both row and column fields selected +2. The Chart Export component will automatically appear above the pivot table +3. Select the desired chart type from the dropdown +4. Click "Export as Image" to download the chart + +### Chart Type Selection + +- **Bar/Line**: Best for comparing multiple categories with multiple datasets +- **Pie/Doughnut**: Best for showing proportions when you have one main dimension + +### Export Process + +1. Click the "Export as Image" button +2. Wait for processing (button shows "Exporting...") +3. The browser automatically downloads the PNG file +4. 
File is named: `pivot-chart-{type}-{timestamp}.png` + +## Integration + +The component is automatically integrated into the PivotTab and only appears when: +- At least one row field is selected +- At least one column field is selected +- Valid pivot data exists + +## Styling + +- Follows the existing design system with minimal, clean appearance +- Uses Tailwind CSS classes for consistent styling +- Responsive design that works on different screen sizes +- Colorblind-friendly color generation using HSL color space + +## Performance Considerations + +- Charts are rendered only when pivot data changes +- Export process uses `html2canvas` for reliable image generation +- Chart data is memoized to prevent unnecessary re-renders +- Responsive design maintains good performance on various devices + +## Browser Compatibility + +- Modern browsers with ES6+ support +- Canvas API support required for image export +- File download API support required for automatic downloads + +## Troubleshooting + +### Common Issues + +1. **Chart not appearing**: Ensure both row and column fields are selected +2. **Export fails**: Check browser console for errors, ensure canvas is properly rendered +3. **Poor image quality**: Export uses 2x scale by default for high resolution +4. **Chart data missing**: Verify pivot table configuration and data availability + +### Debug Information + +- Check browser console for any JavaScript errors +- Verify pivot data structure matches expected format +- Ensure all required dependencies are properly installed + +## Future Enhancements + +Potential improvements for future versions: +- Additional chart types (scatter, radar, etc.) +- Custom color schemes +- Chart configuration options (axes, legends, etc.) 
+- Multiple export formats (SVG, PDF) +- Chart templates and presets +- Batch export capabilities diff --git a/vite-app/dist/assets/index-CbFeqRvW.js b/vite-app/dist/assets/index-CbFeqRvW.js new file mode 100644 index 00000000..37226d6f --- /dev/null +++ b/vite-app/dist/assets/index-CbFeqRvW.js @@ -0,0 +1,131 @@ +(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const i of document.querySelectorAll('link[rel="modulepreload"]'))n(i);new MutationObserver(i=>{for(const a of i)if(a.type==="childList")for(const o of a.addedNodes)o.tagName==="LINK"&&o.rel==="modulepreload"&&n(o)}).observe(document,{childList:!0,subtree:!0});function A(i){const a={};return i.integrity&&(a.integrity=i.integrity),i.referrerPolicy&&(a.referrerPolicy=i.referrerPolicy),i.crossOrigin==="use-credentials"?a.credentials="include":i.crossOrigin==="anonymous"?a.credentials="omit":a.credentials="same-origin",a}function n(i){if(i.ep)return;i.ep=!0;const a=A(i);fetch(i.href,a)}})();function Y_(e){return e&&e.__esModule&&Object.prototype.hasOwnProperty.call(e,"default")?e.default:e}var _g={exports:{}},ho={};/** + * @license React + * react-jsx-runtime.production.js + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var ab;function l1(){if(ab)return ho;ab=1;var e=Symbol.for("react.transitional.element"),t=Symbol.for("react.fragment");function A(n,i,a){var o=null;if(a!==void 0&&(o=""+a),i.key!==void 0&&(o=""+i.key),"key"in i){a={};for(var u in i)u!=="key"&&(a[u]=i[u])}else a=i;return i=a.ref,{$$typeof:e,type:n,key:o,ref:i!==void 0?i:null,props:a}}return ho.Fragment=t,ho.jsx=A,ho.jsxs=A,ho}var ob;function u1(){return ob||(ob=1,_g.exports=l1()),_g.exports}var x=u1(),Qg={exports:{}},Ct={};/** + * @license React + * react.production.js + * + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var lb;function c1(){if(lb)return Ct;lb=1;var e=Symbol.for("react.transitional.element"),t=Symbol.for("react.portal"),A=Symbol.for("react.fragment"),n=Symbol.for("react.strict_mode"),i=Symbol.for("react.profiler"),a=Symbol.for("react.consumer"),o=Symbol.for("react.context"),u=Symbol.for("react.forward_ref"),c=Symbol.for("react.suspense"),h=Symbol.for("react.memo"),g=Symbol.for("react.lazy"),B=Symbol.iterator;function m(H){return H===null||typeof H!="object"?null:(H=B&&H[B]||H["@@iterator"],typeof H=="function"?H:null)}var v={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},b=Object.assign,y={};function _(H,X,tt){this.props=H,this.context=X,this.refs=y,this.updater=tt||v}_.prototype.isReactComponent={},_.prototype.setState=function(H,X){if(typeof H!="object"&&typeof H!="function"&&H!=null)throw Error("takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,H,X,"setState")},_.prototype.forceUpdate=function(H){this.updater.enqueueForceUpdate(this,H,"forceUpdate")};function C(){}C.prototype=_.prototype;function U(H,X,tt){this.props=H,this.context=X,this.refs=y,this.updater=tt||v}var E=U.prototype=new C;E.constructor=U,b(E,_.prototype),E.isPureReactComponent=!0;var O=Array.isArray,F={H:null,A:null,T:null,S:null,V:null},T=Object.prototype.hasOwnProperty;function k(H,X,tt,q,Y,ot){return tt=ot.ref,{$$typeof:e,type:H,key:X,ref:tt!==void 0?tt:null,props:ot}}function P(H,X){return k(H.type,X,void 0,void 0,void 0,H.props)}function N(H){return typeof H=="object"&&H!==null&&H.$$typeof===e}function J(H){var X={"=":"=0",":":"=2"};return"$"+H.replace(/[=:]/g,function(tt){return X[tt]})}var et=/\/+/g;function nt(H,X){return typeof 
H=="object"&&H!==null&&H.key!=null?J(""+H.key):X.toString(36)}function ct(){}function bt(H){switch(H.status){case"fulfilled":return H.value;case"rejected":throw H.reason;default:switch(typeof H.status=="string"?H.then(ct,ct):(H.status="pending",H.then(function(X){H.status==="pending"&&(H.status="fulfilled",H.value=X)},function(X){H.status==="pending"&&(H.status="rejected",H.reason=X)})),H.status){case"fulfilled":return H.value;case"rejected":throw H.reason}}throw H}function ut(H,X,tt,q,Y){var ot=typeof H;(ot==="undefined"||ot==="boolean")&&(H=null);var st=!1;if(H===null)st=!0;else switch(ot){case"bigint":case"string":case"number":st=!0;break;case"object":switch(H.$$typeof){case e:case t:st=!0;break;case g:return st=H._init,ut(st(H._payload),X,tt,q,Y)}}if(st)return Y=Y(H),st=q===""?"."+nt(H,0):q,O(Y)?(tt="",st!=null&&(tt=st.replace(et,"$&/")+"/"),ut(Y,X,tt,"",function(FA){return FA})):Y!=null&&(N(Y)&&(Y=P(Y,tt+(Y.key==null||H&&H.key===Y.key?"":(""+Y.key).replace(et,"$&/")+"/")+st)),X.push(Y)),1;st=0;var be=q===""?".":q+":";if(O(H))for(var qt=0;qt>>1,H=I[ht];if(0>>1;hti(q,rt))Yi(ot,q)?(I[ht]=ot,I[Y]=rt,ht=Y):(I[ht]=q,I[tt]=rt,ht=tt);else if(Yi(ot,rt))I[ht]=ot,I[Y]=rt,ht=Y;else break t}}return W}function i(I,W){var rt=I.sortIndex-W.sortIndex;return rt!==0?rt:I.id-W.id}if(e.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var a=performance;e.unstable_now=function(){return a.now()}}else{var o=Date,u=o.now();e.unstable_now=function(){return o.now()-u}}var c=[],h=[],g=1,B=null,m=3,v=!1,b=!1,y=!1,_=!1,C=typeof setTimeout=="function"?setTimeout:null,U=typeof clearTimeout=="function"?clearTimeout:null,E=typeof setImmediate<"u"?setImmediate:null;function O(I){for(var W=A(h);W!==null;){if(W.callback===null)n(h);else if(W.startTime<=I)n(h),W.sortIndex=W.expirationTime,t(c,W);else break;W=A(h)}}function F(I){if(y=!1,O(I),!b)if(A(c)!==null)b=!0,T||(T=!0,nt());else{var W=A(h);W!==null&&ut(F,W.startTime-I)}}var T=!1,k=-1,P=5,N=-1;function 
J(){return _?!0:!(e.unstable_now()-NI&&J());){var ht=B.callback;if(typeof ht=="function"){B.callback=null,m=B.priorityLevel;var H=ht(B.expirationTime<=I);if(I=e.unstable_now(),typeof H=="function"){B.callback=H,O(I),W=!0;break e}B===A(c)&&n(c),O(I)}else n(c);B=A(c)}if(B!==null)W=!0;else{var X=A(h);X!==null&&ut(F,X.startTime-I),W=!1}}break t}finally{B=null,m=rt,v=!1}W=void 0}}finally{W?nt():T=!1}}}var nt;if(typeof E=="function")nt=function(){E(et)};else if(typeof MessageChannel<"u"){var ct=new MessageChannel,bt=ct.port2;ct.port1.onmessage=et,nt=function(){bt.postMessage(null)}}else nt=function(){C(et,0)};function ut(I,W){k=C(function(){I(e.unstable_now())},W)}e.unstable_IdlePriority=5,e.unstable_ImmediatePriority=1,e.unstable_LowPriority=4,e.unstable_NormalPriority=3,e.unstable_Profiling=null,e.unstable_UserBlockingPriority=2,e.unstable_cancelCallback=function(I){I.callback=null},e.unstable_forceFrameRate=function(I){0>I||125ht?(I.sortIndex=rt,t(h,I),A(c)===null&&I===A(h)&&(y?(U(k),k=-1):y=!0,ut(F,rt-ht))):(I.sortIndex=H,t(c,I),b||v||(b=!0,T||(T=!0,nt()))),I},e.unstable_shouldYield=J,e.unstable_wrapCallback=function(I){var W=m;return function(){var rt=m;m=W;try{return I.apply(this,arguments)}finally{m=rt}}}}(Fg)),Fg}var fb;function h1(){return fb||(fb=1,Ug.exports=f1()),Ug.exports}var Eg={exports:{}},Ve={};/** + * @license React + * react-dom.production.js + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var hb;function d1(){if(hb)return Ve;hb=1;var e=Hf();function t(c){var h="https://react.dev/errors/"+c;if(1"u"||typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE!="function"))try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(e)}catch(t){console.error(t)}}return e(),Eg.exports=d1(),Eg.exports}/** + * @license React + * react-dom-client.production.js + * + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var gb;function g1(){if(gb)return go;gb=1;var e=h1(),t=Hf(),A=W_();function n(r){var s="https://react.dev/errors/"+r;if(1H||(r.current=ht[H],ht[H]=null,H--)}function q(r,s){H++,ht[H]=r.current,r.current=s}var Y=X(null),ot=X(null),st=X(null),be=X(null);function qt(r,s){switch(q(st,s),q(ot,r),q(Y,null),s.nodeType){case 9:case 11:r=(r=s.documentElement)&&(r=r.namespaceURI)?Mw(r):0;break;default:if(r=s.tagName,s=s.namespaceURI)s=Mw(s),r=Lw(s,r);else switch(r){case"svg":r=1;break;case"math":r=2;break;default:r=0}}tt(Y),q(Y,r)}function FA(){tt(Y),tt(ot),tt(st)}function da(r){r.memoizedState!==null&&q(be,r);var s=Y.current,l=Lw(s,r.type);s!==l&&(q(ot,r),q(Y,l))}function $r(r){ot.current===r&&(tt(Y),tt(ot)),be.current===r&&(tt(be),oo._currentValue=rt)}var ui=Object.prototype.hasOwnProperty,ci=e.unstable_scheduleCallback,ga=e.unstable_cancelCallback,dm=e.unstable_shouldYield,zF=e.unstable_requestPaint,an=e.unstable_now,VF=e.unstable_getCurrentPriorityLevel,gm=e.unstable_ImmediatePriority,pm=e.unstable_UserBlockingPriority,Ul=e.unstable_NormalPriority,PF=e.unstable_LowPriority,Bm=e.unstable_IdlePriority,jF=e.log,GF=e.unstable_setDisableYieldValue,pa=null,fA=null;function fi(r){if(typeof jF=="function"&&GF(r),fA&&typeof fA.setStrictMode=="function")try{fA.setStrictMode(pa,r)}catch{}}var hA=Math.clz32?Math.clz32:YF,XF=Math.log,ZF=Math.LN2;function YF(r){return r>>>=0,r===0?32:31-(XF(r)/ZF|0)|0}var Fl=256,El=4194304;function nr(r){var s=r&42;if(s!==0)return s;switch(r&-r){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return r&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return 
r&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 0;default:return r}}function Sl(r,s,l){var f=r.pendingLanes;if(f===0)return 0;var d=0,p=r.suspendedLanes,w=r.pingedLanes;r=r.warmLanes;var Q=f&134217727;return Q!==0?(f=Q&~p,f!==0?d=nr(f):(w&=Q,w!==0?d=nr(w):l||(l=Q&~r,l!==0&&(d=nr(l))))):(Q=f&~p,Q!==0?d=nr(Q):w!==0?d=nr(w):l||(l=f&~r,l!==0&&(d=nr(l)))),d===0?0:s!==0&&s!==d&&(s&p)===0&&(p=d&-d,l=s&-s,p>=l||p===32&&(l&4194048)!==0)?s:d}function Ba(r,s){return(r.pendingLanes&~(r.suspendedLanes&~r.pingedLanes)&s)===0}function WF(r,s){switch(r){case 1:case 2:case 4:case 8:case 64:return s+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return s+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function mm(){var r=Fl;return Fl<<=1,(Fl&4194048)===0&&(Fl=256),r}function vm(){var r=El;return El<<=1,(El&62914560)===0&&(El=4194304),r}function hh(r){for(var s=[],l=0;31>l;l++)s.push(r);return s}function ma(r,s){r.pendingLanes|=s,s!==268435456&&(r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0)}function $F(r,s,l,f,d,p){var w=r.pendingLanes;r.pendingLanes=l,r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0,r.expiredLanes&=l,r.entangledLanes&=l,r.errorRecoveryDisabledLanes&=l,r.shellSuspendCounter=0;var Q=r.entanglements,S=r.expirationTimes,K=r.hiddenUpdates;for(l=w&~l;0)":-1d||S[f]!==K[d]){var j=` +`+S[f].replace(" at new "," at ");return r.displayName&&j.includes("")&&(j=j.replace("",r.displayName)),j}while(1<=f&&0<=d);break}}}finally{vh=!1,Error.prepareStackTrace=l}return(l=r?r.displayName||r.name:"")?ns(l):""}function nE(r){switch(r.tag){case 26:case 27:case 5:return ns(r.type);case 16:return ns("Lazy");case 13:return 
ns("Suspense");case 19:return ns("SuspenseList");case 0:case 15:return wh(r.type,!1);case 11:return wh(r.type.render,!1);case 1:return wh(r.type,!0);case 31:return ns("Activity");default:return""}}function Em(r){try{var s="";do s+=nE(r),r=r.return;while(r);return s}catch(l){return` +Error generating stack: `+l.message+` +`+l.stack}}function EA(r){switch(typeof r){case"bigint":case"boolean":case"number":case"string":case"undefined":return r;case"object":return r;default:return""}}function Sm(r){var s=r.type;return(r=r.nodeName)&&r.toLowerCase()==="input"&&(s==="checkbox"||s==="radio")}function iE(r){var s=Sm(r)?"checked":"value",l=Object.getOwnPropertyDescriptor(r.constructor.prototype,s),f=""+r[s];if(!r.hasOwnProperty(s)&&typeof l<"u"&&typeof l.get=="function"&&typeof l.set=="function"){var d=l.get,p=l.set;return Object.defineProperty(r,s,{configurable:!0,get:function(){return d.call(this)},set:function(w){f=""+w,p.call(this,w)}}),Object.defineProperty(r,s,{enumerable:l.enumerable}),{getValue:function(){return f},setValue:function(w){f=""+w},stopTracking:function(){r._valueTracker=null,delete r[s]}}}}function Tl(r){r._valueTracker||(r._valueTracker=iE(r))}function Hm(r){if(!r)return!1;var s=r._valueTracker;if(!s)return!0;var l=s.getValue(),f="";return r&&(f=Sm(r)?r.checked?"true":"false":r.value),r=f,r!==l?(s.setValue(r),!0):!1}function Dl(r){if(r=r||(typeof document<"u"?document:void 0),typeof r>"u")return null;try{return r.activeElement||r.body}catch{return r.body}}var rE=/[\n"\\]/g;function SA(r){return r.replace(rE,function(s){return"\\"+s.charCodeAt(0).toString(16)+" "})}function bh(r,s,l,f,d,p,w,Q){r.name="",w!=null&&typeof w!="function"&&typeof w!="symbol"&&typeof 
w!="boolean"?r.type=w:r.removeAttribute("type"),s!=null?w==="number"?(s===0&&r.value===""||r.value!=s)&&(r.value=""+EA(s)):r.value!==""+EA(s)&&(r.value=""+EA(s)):w!=="submit"&&w!=="reset"||r.removeAttribute("value"),s!=null?yh(r,w,EA(s)):l!=null?yh(r,w,EA(l)):f!=null&&r.removeAttribute("value"),d==null&&p!=null&&(r.defaultChecked=!!p),d!=null&&(r.checked=d&&typeof d!="function"&&typeof d!="symbol"),Q!=null&&typeof Q!="function"&&typeof Q!="symbol"&&typeof Q!="boolean"?r.name=""+EA(Q):r.removeAttribute("name")}function Om(r,s,l,f,d,p,w,Q){if(p!=null&&typeof p!="function"&&typeof p!="symbol"&&typeof p!="boolean"&&(r.type=p),s!=null||l!=null){if(!(p!=="submit"&&p!=="reset"||s!=null))return;l=l!=null?""+EA(l):"",s=s!=null?""+EA(s):l,Q||s===r.value||(r.value=s),r.defaultValue=s}f=f??d,f=typeof f!="function"&&typeof f!="symbol"&&!!f,r.checked=Q?r.checked:!!f,r.defaultChecked=!!f,w!=null&&typeof w!="function"&&typeof w!="symbol"&&typeof w!="boolean"&&(r.name=w)}function yh(r,s,l){s==="number"&&Dl(r.ownerDocument)===r||r.defaultValue===""+l||(r.defaultValue=""+l)}function is(r,s,l,f){if(r=r.options,s){s={};for(var d=0;d"u"||typeof window.document>"u"||typeof window.document.createElement>"u"),Uh=!1;if(Fn)try{var ya={};Object.defineProperty(ya,"passive",{get:function(){Uh=!0}}),window.addEventListener("test",ya,ya),window.removeEventListener("test",ya,ya)}catch{Uh=!1}var di=null,Fh=null,Ll=null;function Nm(){if(Ll)return Ll;var r,s=Fh,l=s.length,f,d="value"in di?di.value:di.textContent,p=d.length;for(r=0;r=Qa),jm=" ",Gm=!1;function Xm(r,s){switch(r){case"keyup":return TE.indexOf(s.keyCode)!==-1;case"keydown":return s.keyCode!==229;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function Zm(r){return r=r.detail,typeof r=="object"&&"data"in r?r.data:null}var os=!1;function ME(r,s){switch(r){case"compositionend":return Zm(s);case"keypress":return s.which!==32?null:(Gm=!0,jm);case"textInput":return r=s.data,r===jm&&Gm?null:r;default:return 
null}}function LE(r,s){if(os)return r==="compositionend"||!Th&&Xm(r,s)?(r=Nm(),Ll=Fh=di=null,os=!1,r):null;switch(r){case"paste":return null;case"keypress":if(!(s.ctrlKey||s.altKey||s.metaKey)||s.ctrlKey&&s.altKey){if(s.char&&1=s)return{node:l,offset:s-r};r=f}t:{for(;l;){if(l.nextSibling){l=l.nextSibling;break t}l=l.parentNode}l=void 0}l=A0(l)}}function i0(r,s){return r&&s?r===s?!0:r&&r.nodeType===3?!1:s&&s.nodeType===3?i0(r,s.parentNode):"contains"in r?r.contains(s):r.compareDocumentPosition?!!(r.compareDocumentPosition(s)&16):!1:!1}function r0(r){r=r!=null&&r.ownerDocument!=null&&r.ownerDocument.defaultView!=null?r.ownerDocument.defaultView:window;for(var s=Dl(r.document);s instanceof r.HTMLIFrameElement;){try{var l=typeof s.contentWindow.location.href=="string"}catch{l=!1}if(l)r=s.contentWindow;else break;s=Dl(r.document)}return s}function Lh(r){var s=r&&r.nodeName&&r.nodeName.toLowerCase();return s&&(s==="input"&&(r.type==="text"||r.type==="search"||r.type==="tel"||r.type==="url"||r.type==="password")||s==="textarea"||r.contentEditable==="true")}var PE=Fn&&"documentMode"in document&&11>=document.documentMode,ls=null,Ih=null,Ea=null,Rh=!1;function s0(r,s,l){var f=l.window===l?l.document:l.nodeType===9?l:l.ownerDocument;Rh||ls==null||ls!==Dl(f)||(f=ls,"selectionStart"in f&&Lh(f)?f={start:f.selectionStart,end:f.selectionEnd}:(f=(f.ownerDocument&&f.ownerDocument.defaultView||window).getSelection(),f={anchorNode:f.anchorNode,anchorOffset:f.anchorOffset,focusNode:f.focusNode,focusOffset:f.focusOffset}),Ea&&Fa(Ea,f)||(Ea=f,f=xu(Ih,"onSelect"),0>=w,d-=w,Sn=1<<32-hA(s)+d|l<p?p:8;var w=I.T,Q={};I.T=Q,Cd(r,!1,s,l);try{var S=d(),K=I.S;if(K!==null&&K(Q,S),S!==null&&typeof S=="object"&&typeof S.then=="function"){var j=qE(S,f);Pa(r,s,j,vA(r))}else Pa(r,s,f,vA(r))}catch($){Pa(r,s,{then:function(){},status:"rejected",reason:$},vA())}finally{W.p=p,I.T=w}}function iS(){}function bd(r,s,l,f){if(r.tag!==5)throw Error(n(476));var 
d=av(r).queue;sv(r,d,s,rt,l===null?iS:function(){return ov(r),l(f)})}function av(r){var s=r.memoizedState;if(s!==null)return s;s={memoizedState:rt,baseState:rt,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Dn,lastRenderedState:rt},next:null};var l={};return s.next={memoizedState:l,baseState:l,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Dn,lastRenderedState:l},next:null},r.memoizedState=s,r=r.alternate,r!==null&&(r.memoizedState=s),s}function ov(r){var s=av(r).next.queue;Pa(r,s,{},vA())}function yd(){return ze(oo)}function lv(){return Ce().memoizedState}function uv(){return Ce().memoizedState}function rS(r){for(var s=r.return;s!==null;){switch(s.tag){case 24:case 3:var l=vA();r=Bi(l);var f=mi(s,r,l);f!==null&&(wA(f,s,l),Ra(f,s,l)),s={cache:Jh()},r.payload=s;return}s=s.return}}function sS(r,s,l){var f=vA();l={lane:f,revertLane:0,action:l,hasEagerState:!1,eagerState:null,next:null},su(r)?fv(s,l):(l=zh(r,s,l,f),l!==null&&(wA(l,r,f),hv(l,s,f)))}function cv(r,s,l){var f=vA();Pa(r,s,l,f)}function Pa(r,s,l,f){var d={lane:f,revertLane:0,action:l,hasEagerState:!1,eagerState:null,next:null};if(su(r))fv(s,d);else{var p=r.alternate;if(r.lanes===0&&(p===null||p.lanes===0)&&(p=s.lastRenderedReducer,p!==null))try{var w=s.lastRenderedState,Q=p(w,l);if(d.hasEagerState=!0,d.eagerState=Q,dA(Q,w))return Vl(r,s,d,0),Yt===null&&zl(),!1}catch{}finally{}if(l=zh(r,s,d,f),l!==null)return wA(l,r,f),hv(l,s,f),!0}return!1}function Cd(r,s,l,f){if(f={lane:2,revertLane:eg(),action:f,hasEagerState:!1,eagerState:null,next:null},su(r)){if(s)throw Error(n(479))}else s=zh(r,l,f,2),s!==null&&wA(s,r,2)}function su(r){var s=r.alternate;return r===_t||s!==null&&s===_t}function fv(r,s){vs=tu=!0;var l=r.pending;l===null?s.next=s:(s.next=l.next,l.next=s),r.pending=s}function hv(r,s,l){if((l&4194048)!==0){var f=s.lanes;f&=r.pendingLanes,l|=f,s.lanes=l,bm(r,l)}}var 
au={readContext:ze,use:Au,useCallback:pe,useContext:pe,useEffect:pe,useImperativeHandle:pe,useLayoutEffect:pe,useInsertionEffect:pe,useMemo:pe,useReducer:pe,useRef:pe,useState:pe,useDebugValue:pe,useDeferredValue:pe,useTransition:pe,useSyncExternalStore:pe,useId:pe,useHostTransitionStatus:pe,useFormState:pe,useActionState:pe,useOptimistic:pe,useMemoCache:pe,useCacheRefresh:pe},dv={readContext:ze,use:Au,useCallback:function(r,s){return AA().memoizedState=[r,s===void 0?null:s],r},useContext:ze,useEffect:$0,useImperativeHandle:function(r,s,l){l=l!=null?l.concat([r]):null,ru(4194308,4,ev.bind(null,s,r),l)},useLayoutEffect:function(r,s){return ru(4194308,4,r,s)},useInsertionEffect:function(r,s){ru(4,2,r,s)},useMemo:function(r,s){var l=AA();s=s===void 0?null:s;var f=r();if(pr){fi(!0);try{r()}finally{fi(!1)}}return l.memoizedState=[f,s],f},useReducer:function(r,s,l){var f=AA();if(l!==void 0){var d=l(s);if(pr){fi(!0);try{l(s)}finally{fi(!1)}}}else d=s;return f.memoizedState=f.baseState=d,r={pending:null,lanes:0,dispatch:null,lastRenderedReducer:r,lastRenderedState:d},f.queue=r,r=r.dispatch=sS.bind(null,_t,r),[f.memoizedState,r]},useRef:function(r){var s=AA();return r={current:r},s.memoizedState=r},useState:function(r){r=Bd(r);var s=r.queue,l=cv.bind(null,_t,s);return s.dispatch=l,[r.memoizedState,l]},useDebugValue:vd,useDeferredValue:function(r,s){var l=AA();return wd(l,r,s)},useTransition:function(){var r=Bd(!1);return r=sv.bind(null,_t,r.queue,!0,!1),AA().memoizedState=r,[!1,r]},useSyncExternalStore:function(r,s,l){var f=_t,d=AA();if(Lt){if(l===void 0)throw Error(n(407));l=l()}else{if(l=s(),Yt===null)throw Error(n(349));(Ht&124)!==0||M0(f,s,l)}d.memoizedState=l;var p={value:l,getSnapshot:s};return d.queue=p,$0(I0.bind(null,f,p,r),[r]),f.flags|=2048,bs(9,iu(),L0.bind(null,f,p,l,s),null),l},useId:function(){var r=AA(),s=Yt.identifierPrefix;if(Lt){var l=Hn,f=Sn;l=(f&~(1<<32-hA(f)-1)).toString(32)+l,s="«"+s+"R"+l,l=eu++,0mt?(He=pt,pt=null):He=pt.sibling;var 
Tt=z(L,pt,R[mt],Z);if(Tt===null){pt===null&&(pt=He);break}r&&pt&&Tt.alternate===null&&s(L,pt),D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt,pt=He}if(mt===R.length)return l(L,pt),Lt&&ur(L,mt),lt;if(pt===null){for(;mtmt?(He=pt,pt=null):He=pt.sibling;var Mi=z(L,pt,Tt.value,Z);if(Mi===null){pt===null&&(pt=He);break}r&&pt&&Mi.alternate===null&&s(L,pt),D=p(Mi,D,mt),Qt===null?lt=Mi:Qt.sibling=Mi,Qt=Mi,pt=He}if(Tt.done)return l(L,pt),Lt&&ur(L,mt),lt;if(pt===null){for(;!Tt.done;mt++,Tt=R.next())Tt=$(L,Tt.value,Z),Tt!==null&&(D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt);return Lt&&ur(L,mt),lt}for(pt=f(pt);!Tt.done;mt++,Tt=R.next())Tt=V(pt,L,mt,Tt.value,Z),Tt!==null&&(r&&Tt.alternate!==null&&pt.delete(Tt.key===null?mt:Tt.key),D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt);return r&&pt.forEach(function(o1){return s(L,o1)}),Lt&&ur(L,mt),lt}function Vt(L,D,R,Z){if(typeof R=="object"&&R!==null&&R.type===b&&R.key===null&&(R=R.props.children),typeof R=="object"&&R!==null){switch(R.$$typeof){case m:t:{for(var lt=R.key;D!==null;){if(D.key===lt){if(lt=R.type,lt===b){if(D.tag===7){l(L,D.sibling),Z=d(D,R.props.children),Z.return=L,L=Z;break t}}else if(D.elementType===lt||typeof lt=="object"&<!==null&<.$$typeof===P&&pv(lt)===D.type){l(L,D.sibling),Z=d(D,R.props),Ga(Z,R),Z.return=L,L=Z;break t}l(L,D);break}else s(L,D);D=D.sibling}R.type===b?(Z=or(R.props.children,L.mode,Z,R.key),Z.return=L,L=Z):(Z=jl(R.type,R.key,R.props,null,L.mode,Z),Ga(Z,R),Z.return=L,L=Z)}return w(L);case v:t:{for(lt=R.key;D!==null;){if(D.key===lt)if(D.tag===4&&D.stateNode.containerInfo===R.containerInfo&&D.stateNode.implementation===R.implementation){l(L,D.sibling),Z=d(D,R.children||[]),Z.return=L,L=Z;break t}else{l(L,D);break}else s(L,D);D=D.sibling}Z=jh(R,L.mode,Z),Z.return=L,L=Z}return w(L);case P:return lt=R._init,R=lt(R._payload),Vt(L,D,R,Z)}if(ut(R))return vt(L,D,R,Z);if(nt(R)){if(lt=nt(R),typeof lt!="function")throw Error(n(150));return R=lt.call(R),Bt(L,D,R,Z)}if(typeof R.then=="function")return 
Vt(L,D,ou(R),Z);if(R.$$typeof===E)return Vt(L,D,Yl(L,R),Z);lu(L,R)}return typeof R=="string"&&R!==""||typeof R=="number"||typeof R=="bigint"?(R=""+R,D!==null&&D.tag===6?(l(L,D.sibling),Z=d(D,R),Z.return=L,L=Z):(l(L,D),Z=Ph(R,L.mode,Z),Z.return=L,L=Z),w(L)):l(L,D)}return function(L,D,R,Z){try{ja=0;var lt=Vt(L,D,R,Z);return ys=null,lt}catch(pt){if(pt===La||pt===$l)throw pt;var Qt=gA(29,pt,null,L.mode);return Qt.lanes=Z,Qt.return=L,Qt}finally{}}}var Cs=Bv(!0),mv=Bv(!1),MA=X(null),ln=null;function wi(r){var s=r.alternate;q(Qe,Qe.current&1),q(MA,r),ln===null&&(s===null||ms.current!==null||s.memoizedState!==null)&&(ln=r)}function vv(r){if(r.tag===22){if(q(Qe,Qe.current),q(MA,r),ln===null){var s=r.alternate;s!==null&&s.memoizedState!==null&&(ln=r)}}else bi()}function bi(){q(Qe,Qe.current),q(MA,MA.current)}function Mn(r){tt(MA),ln===r&&(ln=null),tt(Qe)}var Qe=X(0);function uu(r){for(var s=r;s!==null;){if(s.tag===13){var l=s.memoizedState;if(l!==null&&(l=l.dehydrated,l===null||l.data==="$?"||hg(l)))return s}else if(s.tag===19&&s.memoizedProps.revealOrder!==void 0){if((s.flags&128)!==0)return s}else if(s.child!==null){s.child.return=s,s=s.child;continue}if(s===r)break;for(;s.sibling===null;){if(s.return===null||s.return===r)return null;s=s.return}s.sibling.return=s.return,s=s.sibling}return null}function _d(r,s,l,f){s=r.memoizedState,l=l(f,s),l=l==null?s:g({},s,l),r.memoizedState=l,r.lanes===0&&(r.updateQueue.baseState=l)}var Qd={enqueueSetState:function(r,s,l){r=r._reactInternals;var f=vA(),d=Bi(f);d.payload=s,l!=null&&(d.callback=l),s=mi(r,d,f),s!==null&&(wA(s,r,f),Ra(s,r,f))},enqueueReplaceState:function(r,s,l){r=r._reactInternals;var f=vA(),d=Bi(f);d.tag=1,d.payload=s,l!=null&&(d.callback=l),s=mi(r,d,f),s!==null&&(wA(s,r,f),Ra(s,r,f))},enqueueForceUpdate:function(r,s){r=r._reactInternals;var l=vA(),f=Bi(l);f.tag=2,s!=null&&(f.callback=s),s=mi(r,f,l),s!==null&&(wA(s,r,l),Ra(s,r,l))}};function wv(r,s,l,f,d,p,w){return r=r.stateNode,typeof 
r.shouldComponentUpdate=="function"?r.shouldComponentUpdate(f,p,w):s.prototype&&s.prototype.isPureReactComponent?!Fa(l,f)||!Fa(d,p):!0}function bv(r,s,l,f){r=s.state,typeof s.componentWillReceiveProps=="function"&&s.componentWillReceiveProps(l,f),typeof s.UNSAFE_componentWillReceiveProps=="function"&&s.UNSAFE_componentWillReceiveProps(l,f),s.state!==r&&Qd.enqueueReplaceState(s,s.state,null)}function Br(r,s){var l=s;if("ref"in s){l={};for(var f in s)f!=="ref"&&(l[f]=s[f])}if(r=r.defaultProps){l===s&&(l=g({},l));for(var d in r)l[d]===void 0&&(l[d]=r[d])}return l}var cu=typeof reportError=="function"?reportError:function(r){if(typeof window=="object"&&typeof window.ErrorEvent=="function"){var s=new window.ErrorEvent("error",{bubbles:!0,cancelable:!0,message:typeof r=="object"&&r!==null&&typeof r.message=="string"?String(r.message):String(r),error:r});if(!window.dispatchEvent(s))return}else if(typeof process=="object"&&typeof process.emit=="function"){process.emit("uncaughtException",r);return}console.error(r)};function yv(r){cu(r)}function Cv(r){console.error(r)}function _v(r){cu(r)}function fu(r,s){try{var l=r.onUncaughtError;l(s.value,{componentStack:s.stack})}catch(f){setTimeout(function(){throw f})}}function Qv(r,s,l){try{var f=r.onCaughtError;f(l.value,{componentStack:l.stack,errorBoundary:s.tag===1?s.stateNode:null})}catch(d){setTimeout(function(){throw d})}}function xd(r,s,l){return l=Bi(l),l.tag=3,l.payload={element:null},l.callback=function(){fu(r,s)},l}function xv(r){return r=Bi(r),r.tag=3,r}function Uv(r,s,l,f){var d=l.type.getDerivedStateFromError;if(typeof d=="function"){var p=f.value;r.payload=function(){return d(p)},r.callback=function(){Qv(s,l,f)}}var w=l.stateNode;w!==null&&typeof w.componentDidCatch=="function"&&(r.callback=function(){Qv(s,l,f),typeof d!="function"&&(Ui===null?Ui=new Set([this]):Ui.add(this));var Q=f.stack;this.componentDidCatch(f.value,{componentStack:Q!==null?Q:""})})}function oS(r,s,l,f,d){if(l.flags|=32768,f!==null&&typeof 
f=="object"&&typeof f.then=="function"){if(s=l.alternate,s!==null&&Ta(s,l,d,!0),l=MA.current,l!==null){switch(l.tag){case 13:return ln===null?Wd():l.alternate===null&&he===0&&(he=3),l.flags&=-257,l.flags|=65536,l.lanes=d,f===ed?l.flags|=16384:(s=l.updateQueue,s===null?l.updateQueue=new Set([f]):s.add(f),Jd(r,f,d)),!1;case 22:return l.flags|=65536,f===ed?l.flags|=16384:(s=l.updateQueue,s===null?(s={transitions:null,markerInstances:null,retryQueue:new Set([f])},l.updateQueue=s):(l=s.retryQueue,l===null?s.retryQueue=new Set([f]):l.add(f)),Jd(r,f,d)),!1}throw Error(n(435,l.tag))}return Jd(r,f,d),Wd(),!1}if(Lt)return s=MA.current,s!==null?((s.flags&65536)===0&&(s.flags|=256),s.flags|=65536,s.lanes=d,f!==Zh&&(r=Error(n(422),{cause:f}),Oa(HA(r,l)))):(f!==Zh&&(s=Error(n(423),{cause:f}),Oa(HA(s,l))),r=r.current.alternate,r.flags|=65536,d&=-d,r.lanes|=d,f=HA(f,l),d=xd(r.stateNode,f,d),id(r,d),he!==4&&(he=2)),!1;var p=Error(n(520),{cause:f});if(p=HA(p,l),qa===null?qa=[p]:qa.push(p),he!==4&&(he=2),s===null)return!0;f=HA(f,l),l=s;do{switch(l.tag){case 3:return l.flags|=65536,r=d&-d,l.lanes|=r,r=xd(l.stateNode,f,r),id(l,r),!1;case 1:if(s=l.type,p=l.stateNode,(l.flags&128)===0&&(typeof s.getDerivedStateFromError=="function"||p!==null&&typeof p.componentDidCatch=="function"&&(Ui===null||!Ui.has(p))))return l.flags|=65536,d&=-d,l.lanes|=d,d=xv(d),Uv(d,r,l,f),id(l,d),!1}l=l.return}while(l!==null);return!1}var Fv=Error(n(461)),Ee=!1;function Me(r,s,l,f){s.child=r===null?mv(s,null,l,f):Cs(s,r.child,l,f)}function Ev(r,s,l,f,d){l=l.render;var p=s.ref;if("ref"in f){var w={};for(var Q in f)Q!=="ref"&&(w[Q]=f[Q])}else w=f;return dr(s),f=ld(r,s,l,w,p,d),Q=ud(),r!==null&&!Ee?(cd(r,s,d),Ln(r,s,d)):(Lt&&Q&&Gh(s),s.flags|=1,Me(r,s,f,d),s.child)}function Sv(r,s,l,f,d){if(r===null){var p=l.type;return typeof p=="function"&&!Vh(p)&&p.defaultProps===void 0&&l.compare===null?(s.tag=15,s.type=p,Hv(r,s,p,f,d)):(r=jl(l.type,null,f,s,s.mode,d),r.ref=s.ref,r.return=s,s.child=r)}if(p=r.child,!Dd(r,d)){var 
w=p.memoizedProps;if(l=l.compare,l=l!==null?l:Fa,l(w,f)&&r.ref===s.ref)return Ln(r,s,d)}return s.flags|=1,r=En(p,f),r.ref=s.ref,r.return=s,s.child=r}function Hv(r,s,l,f,d){if(r!==null){var p=r.memoizedProps;if(Fa(p,f)&&r.ref===s.ref)if(Ee=!1,s.pendingProps=f=p,Dd(r,d))(r.flags&131072)!==0&&(Ee=!0);else return s.lanes=r.lanes,Ln(r,s,d)}return Ud(r,s,l,f,d)}function Ov(r,s,l){var f=s.pendingProps,d=f.children,p=r!==null?r.memoizedState:null;if(f.mode==="hidden"){if((s.flags&128)!==0){if(f=p!==null?p.baseLanes|l:l,r!==null){for(d=s.child=r.child,p=0;d!==null;)p=p|d.lanes|d.childLanes,d=d.sibling;s.childLanes=p&~f}else s.childLanes=0,s.child=null;return Tv(r,s,f,l)}if((l&536870912)!==0)s.memoizedState={baseLanes:0,cachePool:null},r!==null&&Wl(s,p!==null?p.cachePool:null),p!==null?H0(s,p):sd(),vv(s);else return s.lanes=s.childLanes=536870912,Tv(r,s,p!==null?p.baseLanes|l:l,l)}else p!==null?(Wl(s,p.cachePool),H0(s,p),bi(),s.memoizedState=null):(r!==null&&Wl(s,null),sd(),bi());return Me(r,s,d,l),s.child}function Tv(r,s,l,f){var d=td();return d=d===null?null:{parent:_e._currentValue,pool:d},s.memoizedState={baseLanes:l,cachePool:d},r!==null&&Wl(s,null),sd(),vv(s),r!==null&&Ta(r,s,f,!0),null}function hu(r,s){var l=s.ref;if(l===null)r!==null&&r.ref!==null&&(s.flags|=4194816);else{if(typeof l!="function"&&typeof l!="object")throw Error(n(284));(r===null||r.ref!==l)&&(s.flags|=4194816)}}function Ud(r,s,l,f,d){return dr(s),l=ld(r,s,l,f,void 0,d),f=ud(),r!==null&&!Ee?(cd(r,s,d),Ln(r,s,d)):(Lt&&f&&Gh(s),s.flags|=1,Me(r,s,l,d),s.child)}function Dv(r,s,l,f,d,p){return dr(s),s.updateQueue=null,l=T0(s,f,l,d),O0(r),f=ud(),r!==null&&!Ee?(cd(r,s,p),Ln(r,s,p)):(Lt&&f&&Gh(s),s.flags|=1,Me(r,s,l,p),s.child)}function Mv(r,s,l,f,d){if(dr(s),s.stateNode===null){var p=hs,w=l.contextType;typeof w=="object"&&w!==null&&(p=ze(w)),p=new l(f,p),s.memoizedState=p.state!==null&&p.state!==void 
0?p.state:null,p.updater=Qd,s.stateNode=p,p._reactInternals=s,p=s.stateNode,p.props=f,p.state=s.memoizedState,p.refs={},Ad(s),w=l.contextType,p.context=typeof w=="object"&&w!==null?ze(w):hs,p.state=s.memoizedState,w=l.getDerivedStateFromProps,typeof w=="function"&&(_d(s,l,w,f),p.state=s.memoizedState),typeof l.getDerivedStateFromProps=="function"||typeof p.getSnapshotBeforeUpdate=="function"||typeof p.UNSAFE_componentWillMount!="function"&&typeof p.componentWillMount!="function"||(w=p.state,typeof p.componentWillMount=="function"&&p.componentWillMount(),typeof p.UNSAFE_componentWillMount=="function"&&p.UNSAFE_componentWillMount(),w!==p.state&&Qd.enqueueReplaceState(p,p.state,null),Ka(s,f,p,d),Na(),p.state=s.memoizedState),typeof p.componentDidMount=="function"&&(s.flags|=4194308),f=!0}else if(r===null){p=s.stateNode;var Q=s.memoizedProps,S=Br(l,Q);p.props=S;var K=p.context,j=l.contextType;w=hs,typeof j=="object"&&j!==null&&(w=ze(j));var $=l.getDerivedStateFromProps;j=typeof $=="function"||typeof p.getSnapshotBeforeUpdate=="function",Q=s.pendingProps!==Q,j||typeof p.UNSAFE_componentWillReceiveProps!="function"&&typeof p.componentWillReceiveProps!="function"||(Q||K!==w)&&bv(s,p,f,w),pi=!1;var z=s.memoizedState;p.state=z,Ka(s,f,p,d),Na(),K=s.memoizedState,Q||z!==K||pi?(typeof $=="function"&&(_d(s,l,$,f),K=s.memoizedState),(S=pi||wv(s,l,S,f,z,K,w))?(j||typeof p.UNSAFE_componentWillMount!="function"&&typeof p.componentWillMount!="function"||(typeof p.componentWillMount=="function"&&p.componentWillMount(),typeof p.UNSAFE_componentWillMount=="function"&&p.UNSAFE_componentWillMount()),typeof p.componentDidMount=="function"&&(s.flags|=4194308)):(typeof p.componentDidMount=="function"&&(s.flags|=4194308),s.memoizedProps=f,s.memoizedState=K),p.props=f,p.state=K,p.context=w,f=S):(typeof p.componentDidMount=="function"&&(s.flags|=4194308),f=!1)}else{p=s.stateNode,nd(r,s),w=s.memoizedProps,j=Br(l,w),p.props=j,$=s.pendingProps,z=p.context,K=l.contextType,S=hs,typeof 
K=="object"&&K!==null&&(S=ze(K)),Q=l.getDerivedStateFromProps,(K=typeof Q=="function"||typeof p.getSnapshotBeforeUpdate=="function")||typeof p.UNSAFE_componentWillReceiveProps!="function"&&typeof p.componentWillReceiveProps!="function"||(w!==$||z!==S)&&bv(s,p,f,S),pi=!1,z=s.memoizedState,p.state=z,Ka(s,f,p,d),Na();var V=s.memoizedState;w!==$||z!==V||pi||r!==null&&r.dependencies!==null&&Zl(r.dependencies)?(typeof Q=="function"&&(_d(s,l,Q,f),V=s.memoizedState),(j=pi||wv(s,l,j,f,z,V,S)||r!==null&&r.dependencies!==null&&Zl(r.dependencies))?(K||typeof p.UNSAFE_componentWillUpdate!="function"&&typeof p.componentWillUpdate!="function"||(typeof p.componentWillUpdate=="function"&&p.componentWillUpdate(f,V,S),typeof p.UNSAFE_componentWillUpdate=="function"&&p.UNSAFE_componentWillUpdate(f,V,S)),typeof p.componentDidUpdate=="function"&&(s.flags|=4),typeof p.getSnapshotBeforeUpdate=="function"&&(s.flags|=1024)):(typeof p.componentDidUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=4),typeof p.getSnapshotBeforeUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=1024),s.memoizedProps=f,s.memoizedState=V),p.props=f,p.state=V,p.context=S,f=j):(typeof p.componentDidUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=4),typeof p.getSnapshotBeforeUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=1024),f=!1)}return p=f,hu(r,s),f=(s.flags&128)!==0,p||f?(p=s.stateNode,l=f&&typeof l.getDerivedStateFromError!="function"?null:p.render(),s.flags|=1,r!==null&&f?(s.child=Cs(s,r.child,null,d),s.child=Cs(s,null,l,d)):Me(r,s,l,d),s.memoizedState=p.state,r=s.child):r=Ln(r,s,d),r}function Lv(r,s,l,f){return Ha(),s.flags|=256,Me(r,s,l,f),s.child}var Fd={dehydrated:null,treeContext:null,retryLane:0,hydrationErrors:null};function Ed(r){return{baseLanes:r,cachePool:C0()}}function Sd(r,s,l){return r=r!==null?r.childLanes&~l:0,s&&(r|=LA),r}function Iv(r,s,l){var 
f=s.pendingProps,d=!1,p=(s.flags&128)!==0,w;if((w=p)||(w=r!==null&&r.memoizedState===null?!1:(Qe.current&2)!==0),w&&(d=!0,s.flags&=-129),w=(s.flags&32)!==0,s.flags&=-33,r===null){if(Lt){if(d?wi(s):bi(),Lt){var Q=fe,S;if(S=Q){t:{for(S=Q,Q=on;S.nodeType!==8;){if(!Q){Q=null;break t}if(S=ZA(S.nextSibling),S===null){Q=null;break t}}Q=S}Q!==null?(s.memoizedState={dehydrated:Q,treeContext:lr!==null?{id:Sn,overflow:Hn}:null,retryLane:536870912,hydrationErrors:null},S=gA(18,null,null,0),S.stateNode=Q,S.return=s,s.child=S,We=s,fe=null,S=!0):S=!1}S||fr(s)}if(Q=s.memoizedState,Q!==null&&(Q=Q.dehydrated,Q!==null))return hg(Q)?s.lanes=32:s.lanes=536870912,null;Mn(s)}return Q=f.children,f=f.fallback,d?(bi(),d=s.mode,Q=du({mode:"hidden",children:Q},d),f=or(f,d,l,null),Q.return=s,f.return=s,Q.sibling=f,s.child=Q,d=s.child,d.memoizedState=Ed(l),d.childLanes=Sd(r,w,l),s.memoizedState=Fd,f):(wi(s),Hd(s,Q))}if(S=r.memoizedState,S!==null&&(Q=S.dehydrated,Q!==null)){if(p)s.flags&256?(wi(s),s.flags&=-257,s=Od(r,s,l)):s.memoizedState!==null?(bi(),s.child=r.child,s.flags|=128,s=null):(bi(),d=f.fallback,Q=s.mode,f=du({mode:"visible",children:f.children},Q),d=or(d,Q,l,null),d.flags|=2,f.return=s,d.return=s,f.sibling=d,s.child=f,Cs(s,r.child,null,l),f=s.child,f.memoizedState=Ed(l),f.childLanes=Sd(r,w,l),s.memoizedState=Fd,s=d);else if(wi(s),hg(Q)){if(w=Q.nextSibling&&Q.nextSibling.dataset,w)var K=w.dgst;w=K,f=Error(n(419)),f.stack="",f.digest=w,Oa({value:f,source:null,stack:null}),s=Od(r,s,l)}else if(Ee||Ta(r,s,l,!1),w=(l&r.childLanes)!==0,Ee||w){if(w=Yt,w!==null&&(f=l&-l,f=(f&42)!==0?1:dh(f),f=(f&(w.suspendedLanes|l))!==0?0:f,f!==0&&f!==S.retryLane))throw S.retryLane=f,fs(r,f),wA(w,r,f),Fv;Q.data==="$?"||Wd(),s=Od(r,s,l)}else Q.data==="$?"?(s.flags|=192,s.child=r.child,s=null):(r=S.treeContext,fe=ZA(Q.nextSibling),We=s,Lt=!0,cr=null,on=!1,r!==null&&(TA[DA++]=Sn,TA[DA++]=Hn,TA[DA++]=lr,Sn=r.id,Hn=r.overflow,lr=s),s=Hd(s,f.children),s.flags|=4096);return s}return 
d?(bi(),d=f.fallback,Q=s.mode,S=r.child,K=S.sibling,f=En(S,{mode:"hidden",children:f.children}),f.subtreeFlags=S.subtreeFlags&65011712,K!==null?d=En(K,d):(d=or(d,Q,l,null),d.flags|=2),d.return=s,f.return=s,f.sibling=d,s.child=f,f=d,d=s.child,Q=r.child.memoizedState,Q===null?Q=Ed(l):(S=Q.cachePool,S!==null?(K=_e._currentValue,S=S.parent!==K?{parent:K,pool:K}:S):S=C0(),Q={baseLanes:Q.baseLanes|l,cachePool:S}),d.memoizedState=Q,d.childLanes=Sd(r,w,l),s.memoizedState=Fd,f):(wi(s),l=r.child,r=l.sibling,l=En(l,{mode:"visible",children:f.children}),l.return=s,l.sibling=null,r!==null&&(w=s.deletions,w===null?(s.deletions=[r],s.flags|=16):w.push(r)),s.child=l,s.memoizedState=null,l)}function Hd(r,s){return s=du({mode:"visible",children:s},r.mode),s.return=r,r.child=s}function du(r,s){return r=gA(22,r,null,s),r.lanes=0,r.stateNode={_visibility:1,_pendingMarkers:null,_retryCache:null,_transitions:null},r}function Od(r,s,l){return Cs(s,r.child,null,l),r=Hd(s,s.pendingProps.children),r.flags|=2,s.memoizedState=null,r}function Rv(r,s,l){r.lanes|=s;var f=r.alternate;f!==null&&(f.lanes|=s),Wh(r.return,s,l)}function Td(r,s,l,f,d){var p=r.memoizedState;p===null?r.memoizedState={isBackwards:s,rendering:null,renderingStartTime:0,last:f,tail:l,tailMode:d}:(p.isBackwards=s,p.rendering=null,p.renderingStartTime=0,p.last=f,p.tail=l,p.tailMode=d)}function Nv(r,s,l){var f=s.pendingProps,d=f.revealOrder,p=f.tail;if(Me(r,s,f.children,l),f=Qe.current,(f&2)!==0)f=f&1|2,s.flags|=128;else{if(r!==null&&(r.flags&128)!==0)t:for(r=s.child;r!==null;){if(r.tag===13)r.memoizedState!==null&&Rv(r,l,s);else if(r.tag===19)Rv(r,l,s);else if(r.child!==null){r.child.return=r,r=r.child;continue}if(r===s)break t;for(;r.sibling===null;){if(r.return===null||r.return===s)break 
t;r=r.return}r.sibling.return=r.return,r=r.sibling}f&=1}switch(q(Qe,f),d){case"forwards":for(l=s.child,d=null;l!==null;)r=l.alternate,r!==null&&uu(r)===null&&(d=l),l=l.sibling;l=d,l===null?(d=s.child,s.child=null):(d=l.sibling,l.sibling=null),Td(s,!1,d,l,p);break;case"backwards":for(l=null,d=s.child,s.child=null;d!==null;){if(r=d.alternate,r!==null&&uu(r)===null){s.child=d;break}r=d.sibling,d.sibling=l,l=d,d=r}Td(s,!0,l,null,p);break;case"together":Td(s,!1,null,null,void 0);break;default:s.memoizedState=null}return s.child}function Ln(r,s,l){if(r!==null&&(s.dependencies=r.dependencies),xi|=s.lanes,(l&s.childLanes)===0)if(r!==null){if(Ta(r,s,l,!1),(l&s.childLanes)===0)return null}else return null;if(r!==null&&s.child!==r.child)throw Error(n(153));if(s.child!==null){for(r=s.child,l=En(r,r.pendingProps),s.child=l,l.return=s;r.sibling!==null;)r=r.sibling,l=l.sibling=En(r,r.pendingProps),l.return=s;l.sibling=null}return s.child}function Dd(r,s){return(r.lanes&s)!==0?!0:(r=r.dependencies,!!(r!==null&&Zl(r)))}function lS(r,s,l){switch(s.tag){case 3:qt(s,s.stateNode.containerInfo),gi(s,_e,r.memoizedState.cache),Ha();break;case 27:case 5:da(s);break;case 4:qt(s,s.stateNode.containerInfo);break;case 10:gi(s,s.type,s.memoizedProps.value);break;case 13:var f=s.memoizedState;if(f!==null)return f.dehydrated!==null?(wi(s),s.flags|=128,null):(l&s.child.childLanes)!==0?Iv(r,s,l):(wi(s),r=Ln(r,s,l),r!==null?r.sibling:null);wi(s);break;case 19:var d=(r.flags&128)!==0;if(f=(l&s.childLanes)!==0,f||(Ta(r,s,l,!1),f=(l&s.childLanes)!==0),d){if(f)return Nv(r,s,l);s.flags|=128}if(d=s.memoizedState,d!==null&&(d.rendering=null,d.tail=null,d.lastEffect=null),q(Qe,Qe.current),f)break;return null;case 22:case 23:return s.lanes=0,Ov(r,s,l);case 24:gi(s,_e,r.memoizedState.cache)}return Ln(r,s,l)}function Kv(r,s,l){if(r!==null)if(r.memoizedProps!==s.pendingProps)Ee=!0;else{if(!Dd(r,l)&&(s.flags&128)===0)return Ee=!1,lS(r,s,l);Ee=(r.flags&131072)!==0}else 
Ee=!1,Lt&&(s.flags&1048576)!==0&&p0(s,Xl,s.index);switch(s.lanes=0,s.tag){case 16:t:{r=s.pendingProps;var f=s.elementType,d=f._init;if(f=d(f._payload),s.type=f,typeof f=="function")Vh(f)?(r=Br(f,r),s.tag=1,s=Mv(null,s,f,r,l)):(s.tag=0,s=Ud(null,s,f,r,l));else{if(f!=null){if(d=f.$$typeof,d===O){s.tag=11,s=Ev(null,s,f,r,l);break t}else if(d===k){s.tag=14,s=Sv(null,s,f,r,l);break t}}throw s=bt(f)||f,Error(n(306,s,""))}}return s;case 0:return Ud(r,s,s.type,s.pendingProps,l);case 1:return f=s.type,d=Br(f,s.pendingProps),Mv(r,s,f,d,l);case 3:t:{if(qt(s,s.stateNode.containerInfo),r===null)throw Error(n(387));f=s.pendingProps;var p=s.memoizedState;d=p.element,nd(r,s),Ka(s,f,null,l);var w=s.memoizedState;if(f=w.cache,gi(s,_e,f),f!==p.cache&&$h(s,[_e],l,!0),Na(),f=w.element,p.isDehydrated)if(p={element:f,isDehydrated:!1,cache:w.cache},s.updateQueue.baseState=p,s.memoizedState=p,s.flags&256){s=Lv(r,s,f,l);break t}else if(f!==d){d=HA(Error(n(424)),s),Oa(d),s=Lv(r,s,f,l);break t}else{switch(r=s.stateNode.containerInfo,r.nodeType){case 9:r=r.body;break;default:r=r.nodeName==="HTML"?r.ownerDocument.body:r}for(fe=ZA(r.firstChild),We=s,Lt=!0,cr=null,on=!0,l=mv(s,null,f,l),s.child=l;l;)l.flags=l.flags&-3|4096,l=l.sibling}else{if(Ha(),f===d){s=Ln(r,s,l);break t}Me(r,s,f,l)}s=s.child}return s;case 26:return hu(r,s),r===null?(l=Pw(s.type,null,s.pendingProps,null))?s.memoizedState=l:Lt||(l=s.type,r=s.pendingProps,f=Fu(st.current).createElement(l),f[ke]=s,f[tA]=r,Ie(f,l,r),Fe(f),s.stateNode=f):s.memoizedState=Pw(s.type,r.memoizedProps,s.pendingProps,r.memoizedState),null;case 27:return da(s),r===null&&Lt&&(f=s.stateNode=kw(s.type,s.pendingProps,st.current),We=s,on=!0,d=fe,Si(s.type)?(dg=d,fe=ZA(f.firstChild)):fe=d),Me(r,s,s.pendingProps.children,l),hu(r,s),r===null&&(s.flags|=4194304),s.child;case 5:return 
r===null&&Lt&&((d=f=fe)&&(f=IS(f,s.type,s.pendingProps,on),f!==null?(s.stateNode=f,We=s,fe=ZA(f.firstChild),on=!1,d=!0):d=!1),d||fr(s)),da(s),d=s.type,p=s.pendingProps,w=r!==null?r.memoizedProps:null,f=p.children,ug(d,p)?f=null:w!==null&&ug(d,w)&&(s.flags|=32),s.memoizedState!==null&&(d=ld(r,s,eS,null,null,l),oo._currentValue=d),hu(r,s),Me(r,s,f,l),s.child;case 6:return r===null&&Lt&&((r=l=fe)&&(l=RS(l,s.pendingProps,on),l!==null?(s.stateNode=l,We=s,fe=null,r=!0):r=!1),r||fr(s)),null;case 13:return Iv(r,s,l);case 4:return qt(s,s.stateNode.containerInfo),f=s.pendingProps,r===null?s.child=Cs(s,null,f,l):Me(r,s,f,l),s.child;case 11:return Ev(r,s,s.type,s.pendingProps,l);case 7:return Me(r,s,s.pendingProps,l),s.child;case 8:return Me(r,s,s.pendingProps.children,l),s.child;case 12:return Me(r,s,s.pendingProps.children,l),s.child;case 10:return f=s.pendingProps,gi(s,s.type,f.value),Me(r,s,f.children,l),s.child;case 9:return d=s.type._context,f=s.pendingProps.children,dr(s),d=ze(d),f=f(d),s.flags|=1,Me(r,s,f,l),s.child;case 14:return Sv(r,s,s.type,s.pendingProps,l);case 15:return Hv(r,s,s.type,s.pendingProps,l);case 19:return Nv(r,s,l);case 31:return f=s.pendingProps,l=s.mode,f={mode:f.mode,children:f.children},r===null?(l=du(f,l),l.ref=s.ref,s.child=l,l.return=s,s=l):(l=En(r.child,f),l.ref=s.ref,s.child=l,l.return=s,s=l),s;case 22:return Ov(r,s,l);case 24:return dr(s),f=ze(_e),r===null?(d=td(),d===null&&(d=Yt,p=Jh(),d.pooledCache=p,p.refCount++,p!==null&&(d.pooledCacheLanes|=l),d=p),s.memoizedState={parent:f,cache:d},Ad(s),gi(s,_e,d)):((r.lanes&l)!==0&&(nd(r,s),Ka(s,null,null,l),Na()),d=r.memoizedState,p=s.memoizedState,d.parent!==f?(d={parent:f,cache:f},s.memoizedState=d,s.lanes===0&&(s.memoizedState=s.updateQueue.baseState=d),gi(s,_e,f)):(f=p.cache,gi(s,_e,f),f!==d.cache&&$h(s,[_e],l,!0))),Me(r,s,s.pendingProps.children,l),s.child;case 29:throw s.pendingProps}throw Error(n(156,s.tag))}function In(r){r.flags|=4}function 
kv(r,s){if(s.type!=="stylesheet"||(s.state.loading&4)!==0)r.flags&=-16777217;else if(r.flags|=16777216,!Yw(s)){if(s=MA.current,s!==null&&((Ht&4194048)===Ht?ln!==null:(Ht&62914560)!==Ht&&(Ht&536870912)===0||s!==ln))throw Ia=ed,_0;r.flags|=8192}}function gu(r,s){s!==null&&(r.flags|=4),r.flags&16384&&(s=r.tag!==22?vm():536870912,r.lanes|=s,Us|=s)}function Xa(r,s){if(!Lt)switch(r.tailMode){case"hidden":s=r.tail;for(var l=null;s!==null;)s.alternate!==null&&(l=s),s=s.sibling;l===null?r.tail=null:l.sibling=null;break;case"collapsed":l=r.tail;for(var f=null;l!==null;)l.alternate!==null&&(f=l),l=l.sibling;f===null?s||r.tail===null?r.tail=null:r.tail.sibling=null:f.sibling=null}}function le(r){var s=r.alternate!==null&&r.alternate.child===r.child,l=0,f=0;if(s)for(var d=r.child;d!==null;)l|=d.lanes|d.childLanes,f|=d.subtreeFlags&65011712,f|=d.flags&65011712,d.return=r,d=d.sibling;else for(d=r.child;d!==null;)l|=d.lanes|d.childLanes,f|=d.subtreeFlags,f|=d.flags,d.return=r,d=d.sibling;return r.subtreeFlags|=f,r.childLanes=l,s}function uS(r,s,l){var f=s.pendingProps;switch(Xh(s),s.tag){case 31:case 16:case 15:case 0:case 11:case 7:case 8:case 12:case 9:case 14:return le(s),null;case 1:return le(s),null;case 3:return l=s.stateNode,f=null,r!==null&&(f=r.memoizedState.cache),s.memoizedState.cache!==f&&(s.flags|=2048),Tn(_e),FA(),l.pendingContext&&(l.context=l.pendingContext,l.pendingContext=null),(r===null||r.child===null)&&(Sa(s)?In(s):r===null||r.memoizedState.isDehydrated&&(s.flags&256)===0||(s.flags|=1024,v0())),le(s),null;case 26:return l=s.memoizedState,r===null?(In(s),l!==null?(le(s),kv(s,l)):(le(s),s.flags&=-16777217)):l?l!==r.memoizedState?(In(s),le(s),kv(s,l)):(le(s),s.flags&=-16777217):(r.memoizedProps!==f&&In(s),le(s),s.flags&=-16777217),null;case 27:$r(s),l=st.current;var d=s.type;if(r!==null&&s.stateNode!=null)r.memoizedProps!==f&&In(s);else{if(!f){if(s.stateNode===null)throw Error(n(166));return 
le(s),null}r=Y.current,Sa(s)?B0(s):(r=kw(d,f,l),s.stateNode=r,In(s))}return le(s),null;case 5:if($r(s),l=s.type,r!==null&&s.stateNode!=null)r.memoizedProps!==f&&In(s);else{if(!f){if(s.stateNode===null)throw Error(n(166));return le(s),null}if(r=Y.current,Sa(s))B0(s);else{switch(d=Fu(st.current),r){case 1:r=d.createElementNS("http://www.w3.org/2000/svg",l);break;case 2:r=d.createElementNS("http://www.w3.org/1998/Math/MathML",l);break;default:switch(l){case"svg":r=d.createElementNS("http://www.w3.org/2000/svg",l);break;case"math":r=d.createElementNS("http://www.w3.org/1998/Math/MathML",l);break;case"script":r=d.createElement("div"),r.innerHTML=" - + +
diff --git a/vite-app/package.json b/vite-app/package.json index 5b120e20..bdb187e5 100644 --- a/vite-app/package.json +++ b/vite-app/package.json @@ -13,9 +13,12 @@ "test:coverage": "vitest run --coverage" }, "dependencies": { + "chart.js": "^4.5.0", + "html2canvas-oklch": "1.5.0-alpha.0", "mobx": "^6.13.7", "mobx-react": "^9.2.0", "react": "^19.1.0", + "react-chartjs-2": "^5.3.0", "react-dom": "^19.1.0", "react-router-dom": "^7.7.1", "zod": "^4.0.14" diff --git a/vite-app/pnpm-lock.yaml b/vite-app/pnpm-lock.yaml index 60e1ea98..0ec51189 100644 --- a/vite-app/pnpm-lock.yaml +++ b/vite-app/pnpm-lock.yaml @@ -8,6 +8,12 @@ importers: .: dependencies: + chart.js: + specifier: ^4.5.0 + version: 4.5.0 + html2canvas-oklch: + specifier: 1.5.0-alpha.0 + version: 1.5.0-alpha.0 mobx: specifier: ^6.13.7 version: 6.13.7 @@ -17,6 +23,9 @@ importers: react: specifier: ^19.1.0 version: 19.1.1 + react-chartjs-2: + specifier: ^5.3.0 + version: 5.3.0(chart.js@4.5.0)(react@19.1.1) react-dom: specifier: ^19.1.0 version: 19.1.1(react@19.1.1) @@ -408,6 +417,9 @@ packages: '@jridgewell/trace-mapping@0.3.29': resolution: {integrity: sha512-uw6guiW/gcAGPDhLmd77/6lW8QLeiV5RUTsAX46Db6oLhGaVj4lhnPwb184s1bkc8kdVg/+h988dro8GRDpmYQ==} + '@kurkle/color@0.3.4': + resolution: {integrity: sha512-M5UknZPHRu3DEDWoipU6sE8PdkZ6Z/S+v4dD+Ke8IaNlpdSQah50lz1KtcFBa2vsdOnwbbnxJwVM4wty6udA5w==} + '@nodelib/fs.scandir@2.1.5': resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} engines: {node: '>= 8'} @@ -797,6 +809,10 @@ packages: balanced-match@1.0.2: resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} + base64-arraybuffer@1.0.2: + resolution: {integrity: sha512-I3yl4r9QB5ZRY3XuJVEPfc2XhZO6YweFPI+UovAzn+8/hb3oJ6lnysaFcjVpkCPfVWFUDvoZ8kmVDP7WyRtYtQ==} + engines: {node: '>= 0.6.0'} + brace-expansion@1.1.12: resolution: {integrity: 
sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==} @@ -831,6 +847,10 @@ packages: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} + chart.js@4.5.0: + resolution: {integrity: sha512-aYeC/jDgSEx8SHWZvANYMioYMZ2KX02W6f6uVfyteuCGcadDLcYVHdfdygsTQkQ4TKn5lghoojAsPj5pu0SnvQ==} + engines: {pnpm: '>=8'} + check-error@2.1.1: resolution: {integrity: sha512-OAlb+T7V4Op9OwdkjmguYRqncdlx5JiofwOAUkmTF+jNdHwzTaTs4sRAGpzLF3oOz5xAyDGrPgeIDFQmDOTiJw==} engines: {node: '>= 16'} @@ -860,6 +880,9 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + css-line-break@2.1.0: + resolution: {integrity: sha512-FHcKFCZcAha3LwfVBhCQbW2nCNbkZXn7KVUJcsT5/P8YmfsVja0FMPJr0B903j/E69HUphKiV9iQArX8SDYA4w==} + csstype@3.1.3: resolution: {integrity: sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==} @@ -1064,6 +1087,10 @@ packages: html-escaper@2.0.2: resolution: {integrity: sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==} + html2canvas-oklch@1.5.0-alpha.0: + resolution: {integrity: sha512-7cp1ODcbd+lkwi+t3igDIMf7TzV8YIRgG7Nt3XzjSkVCxUDUB94m/RPtb/wO2/EhX80tUTFFbVf0Ap75uQQx8w==} + engines: {node: '>=8.0.0'} + ignore@5.3.2: resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} engines: {node: '>= 4'} @@ -1383,6 +1410,12 @@ packages: queue-microtask@1.2.3: resolution: {integrity: sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==} + react-chartjs-2@5.3.0: + resolution: {integrity: sha512-UfZZFnDsERI3c3CZGxzvNJd02SHjaSJ8kgW1djn65H1KK8rehwTjyrRKOG3VTMG8wtHZ5rgAO5oTHtHi9GCCmw==} + peerDependencies: + chart.js: ^4.1.1 + react: ^16.8.0 || ^17.0.0 || ^18.0.0 || 
^19.0.0 + react-dom@19.1.1: resolution: {integrity: sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==} peerDependencies: @@ -1511,6 +1544,9 @@ packages: resolution: {integrity: sha512-pFYqmTw68LXVjeWJMST4+borgQP2AyMNbg1BpZh9LbyhUeNkeaPF9gzfPGUAnSMV3qPYdWUwDIjjCLiSDOl7vg==} engines: {node: '>=18'} + text-segmentation@1.0.3: + resolution: {integrity: sha512-iOiPUo/BGnZ6+54OsWxZidGCsdU8YbE4PSpdPinp7DeMtUJNJBoJ/ouUSTJjHkh1KntHaltHl/gDs2FC4i5+Nw==} + tinybench@2.9.0: resolution: {integrity: sha512-0+DUvqWMValLmha6lr4kD8iAMK1HzV0/aKnCtWb9v9641TnP/MFb7Pc2bxoxQjTXAErryXVgUOfv2YqNllqGeg==} @@ -1576,6 +1612,9 @@ packages: peerDependencies: react: ^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0 + utrie@1.0.2: + resolution: {integrity: sha512-1MLa5ouZiOmQzUbjbu9VmjLzn1QLXBhwpUa7kdLUQK+KQ5KA9I1vk5U4YHe/X2Ch7PYnJfWuWT+VbuxbGwljhw==} + vite-node@3.2.4: resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -1970,6 +2009,8 @@ snapshots: '@jridgewell/resolve-uri': 3.1.2 '@jridgewell/sourcemap-codec': 1.5.4 + '@kurkle/color@0.3.4': {} + '@nodelib/fs.scandir@2.1.5': dependencies: '@nodelib/fs.stat': 2.0.5 @@ -2362,6 +2403,8 @@ snapshots: balanced-match@1.0.2: {} + base64-arraybuffer@1.0.2: {} + brace-expansion@1.1.12: dependencies: balanced-match: 1.0.2 @@ -2401,6 +2444,10 @@ snapshots: ansi-styles: 4.3.0 supports-color: 7.2.0 + chart.js@4.5.0: + dependencies: + '@kurkle/color': 0.3.4 + check-error@2.1.1: {} chownr@3.0.0: {} @@ -2423,6 +2470,10 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + css-line-break@2.1.0: + dependencies: + utrie: 1.0.2 + csstype@3.1.3: {} debug@4.4.1: @@ -2647,6 +2698,11 @@ snapshots: html-escaper@2.0.2: {} + html2canvas-oklch@1.5.0-alpha.0: + dependencies: + css-line-break: 2.1.0 + text-segmentation: 1.0.3 + ignore@5.3.2: {} ignore@7.0.5: {} @@ -2902,6 +2958,11 @@ snapshots: queue-microtask@1.2.3: {} 
+ react-chartjs-2@5.3.0(chart.js@4.5.0)(react@19.1.1): + dependencies: + chart.js: 4.5.0 + react: 19.1.1 + react-dom@19.1.1(react@19.1.1): dependencies: react: 19.1.1 @@ -3032,6 +3093,10 @@ snapshots: glob: 10.4.5 minimatch: 9.0.5 + text-segmentation@1.0.3: + dependencies: + utrie: 1.0.2 + tinybench@2.9.0: {} tinyexec@0.3.2: {} @@ -3088,6 +3153,10 @@ snapshots: dependencies: react: 19.1.1 + utrie@1.0.2: + dependencies: + base64-arraybuffer: 1.0.2 + vite-node@3.2.4(@types/node@24.2.1)(jiti@2.5.1)(lightningcss@1.30.1): dependencies: cac: 6.7.14 diff --git a/vite-app/src/components/ChartExport.tsx b/vite-app/src/components/ChartExport.tsx new file mode 100644 index 00000000..755f7201 --- /dev/null +++ b/vite-app/src/components/ChartExport.tsx @@ -0,0 +1,304 @@ +import { useRef, useCallback, useState } from "react"; +import { Chart as ChartJS, registerables } from "chart.js"; +import { Chart } from "react-chartjs-2"; +import html2canvas from "html2canvas-oklch"; +import Button from "./Button"; +import Select from "./Select"; + +// Register Chart.js components +ChartJS.register(...registerables); + +interface ChartExportProps> { + /** + * Pivot table data structure + */ + pivotData: { + rowKeyTuples: unknown[][]; + colKeyTuples: unknown[][]; + cells: Record>; + rowTotals: Record; + colTotals: Record; + grandTotal: number; + }; + /** + * Row fields configuration + */ + rowFields: (keyof T)[]; + /** + * Column fields configuration + */ + columnFields: (keyof T)[]; + /** + * Value field configuration + */ + valueField?: keyof T; + /** + * Aggregator type + */ + aggregator: string; + /** + * Chart type to render + */ + chartType?: "bar" | "line" | "doughnut" | "pie"; + /** + * Whether to show row totals + */ + showRowTotals?: boolean; + /** + * Whether to show column totals + */ + showColumnTotals?: boolean; +} + +type ChartType = "bar" | "line" | "doughnut" | "pie"; + +const ChartExport = >({ + pivotData, + rowFields, + columnFields, + valueField, + aggregator, + chartType 
= "bar", +}: ChartExportProps) => { + const chartRef = useRef(null); + const [selectedChartType, setSelectedChartType] = + useState(chartType); + const [isExporting, setIsExporting] = useState(false); + + // Convert pivot data to Chart.js format + const getChartData = useCallback(() => { + const { rowKeyTuples, colKeyTuples, cells } = pivotData; + + if (selectedChartType === "bar" || selectedChartType === "line") { + // For bar/line charts, use row groups as labels and columns as datasets + const labels = rowKeyTuples.map((tuple) => + tuple.map((v) => String(v ?? "")).join(" / ") + ); + + const datasets = colKeyTuples.map((colTuple, colIdx) => { + const colKey = colTuple.map((v) => String(v ?? "")).join(" / "); + const colLabel = + columnFields.length > 0 ? colKey : `Column ${colIdx + 1}`; + + const data = rowKeyTuples.map((rowTuple) => { + const rowKey = rowTuple.map((v) => String(v ?? "")).join("||"); + const cell = cells[rowKey]?.[colKey]; + return cell ? cell.value : 0; + }); + + // Generate a color for each dataset + const hue = (colIdx * 137.5) % 360; + const color = `hsl(${hue}, 70%, 60%)`; + + return { + label: colLabel, + data, + backgroundColor: selectedChartType === "line" ? "transparent" : color, + borderColor: color, + borderWidth: selectedChartType === "line" ? 2 : 1, + fill: selectedChartType === "line" ? false : true, + tension: selectedChartType === "line" ? 
0.1 : undefined, + }; + }); + + return { labels, datasets }; + } else { + // For pie/doughnut charts, aggregate all data into a single dataset + const aggregatedData: { [key: string]: number } = {}; + + // Sum up all cell values + Object.values(cells).forEach((colCells) => { + Object.values(colCells).forEach((cell) => { + const colKey = Object.keys(colCells).find( + (key) => colCells[key] === cell + ); + if (colKey) { + const label = colKey || "Unknown"; + aggregatedData[label] = (aggregatedData[label] || 0) + cell.value; + } + }); + }); + + const labels = Object.keys(aggregatedData); + const data = Object.values(aggregatedData); + const backgroundColor = labels.map((_, idx) => { + const hue = (idx * 137.5) % 360; + return `hsl(${hue}, 60%, 60%)`; + }); + + return { + labels, + datasets: [ + { + data, + backgroundColor, + borderColor: backgroundColor.map((color) => color), + borderWidth: 1, + }, + ], + }; + } + }, [pivotData, rowFields, columnFields, selectedChartType]); + + const chartData = getChartData(); + + // Don't render chart if no data + if (!chartData.labels.length || !chartData.datasets.length) { + return ( +
+
+ No data available for chart visualization. Please select row and + column fields. +
+
+ ); + } + + // Additional safety check for line charts + if ( + selectedChartType === "line" && + chartData.datasets.some((dataset) => dataset.data.length === 0) + ) { + return ( +
+
+ Line charts require data in all datasets. Please check your pivot + configuration. +
+
+ ); + } + + const chartOptions = { + responsive: true, + maintainAspectRatio: false, + plugins: { + title: { + display: true, + text: `Pivot Table: ${aggregator} of ${String( + valueField || "records" + )}`, + font: { + size: 16, + weight: "bold" as const, + }, + }, + legend: { + display: true, + position: "top" as const, + }, + tooltip: { + enabled: true, + }, + }, + scales: + selectedChartType === "bar" || selectedChartType === "line" + ? { + y: { + type: "linear" as const, + beginAtZero: true, + title: { + display: true, + text: aggregator === "count" ? "Count" : "Value", + }, + }, + x: { + type: "category" as const, + title: { + display: true, + text: rowFields.map((f) => String(f)).join(" / "), + }, + }, + } + : undefined, + elements: + selectedChartType === "line" + ? { + line: { + tension: 0.1, + }, + point: { + radius: 3, + hoverRadius: 5, + }, + } + : undefined, + }; + + const exportChartAsImage = useCallback(async () => { + if (!chartRef.current) return; + + setIsExporting(true); + try { + const canvas = await html2canvas(chartRef.current, { + backgroundColor: "#ffffff", + scale: 2, // Higher resolution + useCORS: true, + allowTaint: true, + }); + + // Create download link + const link = document.createElement("a"); + link.download = `pivot-chart-${selectedChartType}-${Date.now()}.png`; + link.href = canvas.toDataURL("image/png"); + link.click(); + } catch (error) { + console.error("Error exporting chart:", error); + } finally { + setIsExporting(false); + } + }, [selectedChartType]); + + const chartTypes: { value: ChartType; label: string }[] = [ + { value: "bar", label: "Bar Chart" }, + { value: "line", label: "Line Chart" }, + { value: "doughnut", label: "Doughnut Chart" }, + { value: "pie", label: "Pie Chart" }, + ]; + + return ( +
+
+

Chart Export

+
+ + +
+
+ +
+ Visualize your pivot table data as a chart and export it as a + high-resolution PNG image. You can adjust your browser window size to + change the exported image dimensions. +
+ +
+ +
+
+ ); +}; + +export default ChartExport; diff --git a/vite-app/src/components/PivotTab.tsx b/vite-app/src/components/PivotTab.tsx index 0daa4fc3..4a928998 100644 --- a/vite-app/src/components/PivotTab.tsx +++ b/vite-app/src/components/PivotTab.tsx @@ -1,11 +1,22 @@ import { observer } from "mobx-react"; import PivotTable from "./PivotTable"; +import ChartExport from "./ChartExport"; import SearchableSelect from "./SearchableSelect"; import Button from "./Button"; import FilterSelector from "./FilterSelector"; -import { state } from "../App"; import { type FilterGroup } from "../types/filters"; -import { createFilterFunction } from "../util/filter-utils"; +import { usePivotData } from "../hooks/usePivotData"; +import { + createFieldHandlerSet, + getAvailableKeys, + getPivotConfig, + updatePivotConfig, + resetPivotConfig, + updateFilterConfig, + getFlattenedDataset, + createFilterFunction, + getFilterConfig, +} from "../util/field-processors"; interface FieldSelectorProps { title: string; @@ -125,62 +136,36 @@ const AggregatorSelector = ({ ); const PivotTab = observer(() => { - const { pivotConfig } = state; + const pivotConfig = getPivotConfig(); + const availableKeys = getAvailableKeys(); - const updateRowFields = (index: number, value: string) => { - const newRowFields = [...pivotConfig.selectedRowFields]; - newRowFields[index] = value; - state.updatePivotConfig({ selectedRowFields: newRowFields }); - }; - - const updateColumnFields = (index: number, value: string) => { - const newColumnFields = [...pivotConfig.selectedColumnFields]; - newColumnFields[index] = value; - state.updatePivotConfig({ selectedColumnFields: newColumnFields }); - }; + // Use the pivot data hook + const pivotData = usePivotData({ + rowFields: pivotConfig.selectedRowFields, + columnFields: pivotConfig.selectedColumnFields, + valueField: pivotConfig.selectedValueField, + aggregator: pivotConfig.selectedAggregator as + | "count" + | "sum" + | "avg" + | "min" + | "max", + showRowTotals: true, + 
showColumnTotals: true, + }); const updateValueField = (value: string) => { - state.updatePivotConfig({ selectedValueField: value }); + updatePivotConfig({ selectedValueField: value }); }; const updateAggregator = (value: string) => { - state.updatePivotConfig({ selectedAggregator: value }); + updatePivotConfig({ selectedAggregator: value }); }; const updateFilters = (filters: FilterGroup[]) => { - state.updateFilterConfig(filters); - }; - - const createFieldHandler = ( - updater: (index: number, value: string) => void - ) => { - return (index: number, value: string) => { - updater(index, value); - }; - }; - - const createAddHandler = ( - fields: string[], - updater: (fields: string[]) => void - ) => { - return () => { - if (fields.length < 3) { - updater([...fields, ""]); - } - }; + updateFilterConfig(filters); }; - const createRemoveHandler = ( - fields: string[], - updater: (fields: string[]) => void - ) => { - return (index: number) => { - updater(fields.filter((_, i) => i !== index)); - }; - }; - - const availableKeys = state.flattenedDatasetKeys; - return (
@@ -194,7 +179,7 @@ const PivotTab = observer(() => { {/* Controls Section with Reset Button */}
); diff --git a/vite-app/src/components/PivotTable.tsx b/vite-app/src/components/PivotTable.tsx index 52b2213d..af6063de 100644 --- a/vite-app/src/components/PivotTable.tsx +++ b/vite-app/src/components/PivotTable.tsx @@ -106,7 +106,6 @@ export function PivotTable>({ filter, }); - debugger; return ( diff --git a/vite-app/src/hooks/usePivotData.ts b/vite-app/src/hooks/usePivotData.ts new file mode 100644 index 00000000..ad170a15 --- /dev/null +++ b/vite-app/src/hooks/usePivotData.ts @@ -0,0 +1,87 @@ +import { useMemo } from 'react'; +import { computePivot } from '../util/pivot'; +import { createFilterFunction } from '../util/filter-utils'; +import { state } from '../App'; + +export interface PivotDataConfig { + rowFields: string[]; + columnFields: string[]; + valueField?: string; + aggregator: 'count' | 'sum' | 'avg' | 'min' | 'max'; + showRowTotals?: boolean; + showColumnTotals?: boolean; +} + +export interface ProcessedPivotData { + rowFields: string[]; + columnFields: string[]; + valueField?: string; + aggregator: 'count' | 'sum' | 'avg' | 'min' | 'max'; + pivotResult: ReturnType>; + hasValidConfiguration: boolean; +} + +/** + * Custom hook that processes pivot configuration and computes pivot data + * Centralizes all pivot-related logic to avoid duplication + */ +export function usePivotData( + config: PivotDataConfig +): ProcessedPivotData { + const { rowFields, columnFields, valueField, aggregator, showRowTotals = true, showColumnTotals = true } = config; + + // Filter out empty fields and cast to proper types + const processedRowFields = useMemo( + () => rowFields.filter((field) => field !== '') as string[], + [rowFields] + ); + + const processedColumnFields = useMemo( + () => columnFields.filter((field) => field !== '') as string[], + [columnFields] + ); + + const processedValueField = useMemo( + () => (valueField && valueField !== '' ? 
valueField : undefined) as string | undefined, + [valueField] + ); + + // Check if we have a valid configuration for pivot computation + const hasValidConfiguration = useMemo( + () => processedRowFields.length > 0 && processedColumnFields.length > 0, + [processedRowFields, processedColumnFields] + ); + + // Compute pivot data only when configuration is valid + const pivotResult = useMemo(() => { + if (!hasValidConfiguration) { + // Return empty pivot result structure + return { + rowKeyTuples: [], + colKeyTuples: [], + cells: {}, + rowTotals: {}, + colTotals: {}, + grandTotal: 0, + } as ReturnType>; + } + + return computePivot({ + data: state.flattenedDataset, + rowFields: processedRowFields, + columnFields: processedColumnFields, + valueField: processedValueField, + aggregator, + filter: createFilterFunction(state.filterConfig), + }); + }, [hasValidConfiguration, processedRowFields, processedColumnFields, processedValueField, aggregator, state.filterConfig]); + + return { + rowFields: processedRowFields, + columnFields: processedColumnFields, + valueField: processedValueField, + aggregator, + pivotResult, + hasValidConfiguration, + }; +} diff --git a/vite-app/src/util/field-processors.ts b/vite-app/src/util/field-processors.ts new file mode 100644 index 00000000..e1851f15 --- /dev/null +++ b/vite-app/src/util/field-processors.ts @@ -0,0 +1,121 @@ +import { state } from '../App'; +import { createFilterFunction as createFilterFunctionUtil } from '../util/filter-utils'; +import { type FilterGroup } from '../types/filters'; + +/** + * Utility functions for processing field configurations and creating handlers + * Centralizes common field manipulation logic + */ + +/** + * Creates a field change handler for a specific index + */ +export function createFieldHandler( + updater: (index: number, value: string) => void +) { + return (index: number, value: string) => { + updater(index, value); + }; +} + +/** + * Creates an add field handler that respects the maximum limit + 
*/ +export function createAddHandler( + fields: string[], + updater: (fields: string[]) => void, + maxFields: number = 3 +) { + return () => { + if (fields.length < maxFields) { + updater([...fields, '']); + } + }; +} + +/** + * Creates a remove field handler + */ +export function createRemoveHandler( + fields: string[], + updater: (fields: string[]) => void +) { + return (index: number) => { + updater(fields.filter((_, i) => i !== index)); + }; +} + +/** + * Creates a complete field handler set for a field array + */ +export function createFieldHandlerSet( + fields: string[], + updater: (fields: string[]) => void, + maxFields: number = 3 +) { + return { + onFieldChange: createFieldHandler((index: number, value: string) => { + const newFields = [...fields]; + newFields[index] = value; + updater(newFields); + }), + onAddField: createAddHandler(fields, updater, maxFields), + onRemoveField: createRemoveHandler(fields, updater), + }; +} + +/** + * Gets available keys from the current dataset state + */ +export function getAvailableKeys(): string[] { + return state.flattenedDatasetKeys; +} + +/** + * Processes pivot configuration from state + */ +export function getPivotConfig() { + return state.pivotConfig; +} + +/** + * Updates pivot configuration + */ +export function updatePivotConfig(updates: Partial) { + state.updatePivotConfig(updates); +} + +/** + * Resets pivot configuration to defaults + */ +export function resetPivotConfig() { + state.resetPivotConfig(); +} + +/** + * Updates filter configuration + */ +export function updateFilterConfig(filters: FilterGroup[]) { + state.updateFilterConfig(filters); +} + +/** + * Gets the flattened dataset from state + */ +export function getFlattenedDataset() { + return state.flattenedDataset; +} + +/** + * Creates a filter function using the current filter config + */ +export function createFilterFunction() { + return createFilterFunctionUtil(state.filterConfig); +} + +/** + * Gets the current filter configuration + */ 
+export function getFilterConfig() { + return state.filterConfig; +} From 11330f5ca4d73b6e481c2d13f9278fa26b40943c Mon Sep 17 00:00:00 2001 From: Derek Xu <32891260+xzrderek@users.noreply.github.com> Date: Sat, 16 Aug 2025 14:06:15 -0700 Subject: [PATCH 22/26] BigQuery Adapter (#86) * BigQuery * removing unneeded --- eval_protocol/adapters/__init__.py | 38 +- eval_protocol/adapters/bigquery.py | 285 +++++++++++++ examples/adapters/README.md | 46 ++- pyproject.toml | 8 + tests/test_adapters_e2e.py | 630 ++++++++++++++++++++++------- uv.lock | 233 ++++++++++- 6 files changed, 1069 insertions(+), 171 deletions(-) create mode 100644 eval_protocol/adapters/bigquery.py diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index fc04237b..57757901 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -6,6 +6,7 @@ Available adapters: - LangfuseAdapter: Pull data from Langfuse deployments - HuggingFaceAdapter: Load datasets from HuggingFace Hub +- BigQueryAdapter: Query data from Google BigQuery - Braintrust integration (legacy) - TRL integration (legacy) """ @@ -13,35 +14,56 @@ # Conditional imports based on available dependencies try: from .langfuse import LangfuseAdapter, create_langfuse_adapter + __all__ = ["LangfuseAdapter", "create_langfuse_adapter"] except ImportError: __all__ = [] try: from .huggingface import ( - HuggingFaceAdapter, - create_huggingface_adapter, + HuggingFaceAdapter, create_gsm8k_adapter, + create_huggingface_adapter, create_math_adapter, ) - __all__.extend([ - "HuggingFaceAdapter", - "create_huggingface_adapter", - "create_gsm8k_adapter", - "create_math_adapter", - ]) + + __all__.extend( + [ + "HuggingFaceAdapter", + "create_huggingface_adapter", + "create_gsm8k_adapter", + "create_math_adapter", + ] + ) +except ImportError: + pass + +try: + from .bigquery import ( + BigQueryAdapter, + create_bigquery_adapter, + ) + + __all__.extend( + [ + "BigQueryAdapter", + 
"create_bigquery_adapter", + ] + ) except ImportError: pass # Legacy adapters (always available) try: from .braintrust import reward_fn_to_scorer, scorer_to_reward_fn + __all__.extend(["scorer_to_reward_fn", "reward_fn_to_scorer"]) except ImportError: pass try: from .trl import create_trl_adapter + __all__.extend(["create_trl_adapter"]) except ImportError: pass
diff --git a/eval_protocol/adapters/bigquery.py b/eval_protocol/adapters/bigquery.py
new file mode 100644
index 00000000..1275b1e1
--- /dev/null
+++ b/eval_protocol/adapters/bigquery.py
@@ -0,0 +1,290 @@
+"""Google BigQuery adapter for Eval Protocol.
+
+This adapter allows querying data from Google BigQuery tables and converting it
+to EvaluationRow format for use in evaluation pipelines.
+"""
+
+import logging
+from typing import Any, Callable, Dict, Iterator, List, Optional, Union
+
+from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message
+
+logger = logging.getLogger(__name__)
+
+try:
+    from google.auth.exceptions import DefaultCredentialsError
+    from google.cloud import bigquery
+    from google.cloud.exceptions import Forbidden, NotFound
+    from google.oauth2 import service_account
+
+    BIGQUERY_AVAILABLE = True
+except ImportError:
+    BIGQUERY_AVAILABLE = False
+    logger.warning("Google Cloud BigQuery not installed. Install with: pip install 'eval-protocol[bigquery]'")
+
+# Type alias for transformation function
+TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]]
+
+
+class BigQueryAdapter:
+    """Adapter to query data from Google BigQuery and convert to EvaluationRow format.
+
+    This adapter connects to Google BigQuery, executes SQL queries, and applies
+    a user-provided transformation function to convert each row to the format
+    expected by EvaluationRow.
+
+    The transformation function should take a BigQuery row dictionary and return:
+    {
+        'messages': List[Dict] - list of message dictionaries with 'role' and 'content'
+        'ground_truth': Optional[str] - expected answer/output
+        'metadata': Optional[Dict] - any additional metadata to preserve
+        'tools': Optional[List[Dict]] - tool definitions for tool calling scenarios
+    }
+    """
+
+    def __init__(
+        self,
+        transform_fn: TransformFunction,
+        dataset_id: Optional[str] = None,
+        credentials_path: Optional[str] = None,
+        location: Optional[str] = None,
+        **client_kwargs,
+    ):
+        """Initialize the BigQuery adapter.
+
+        Args:
+            transform_fn: Function to transform BigQuery rows to evaluation format
+            dataset_id: Google Cloud project ID (if None, uses default from environment)
+            credentials_path: Path to service account JSON file (if None, uses default auth)
+            location: Default location for BigQuery jobs
+            **client_kwargs: Additional arguments to pass to BigQuery client
+
+        Raises:
+            ImportError: If google-cloud-bigquery is not installed
+            DefaultCredentialsError: If authentication fails
+        """
+        if not BIGQUERY_AVAILABLE:
+            raise ImportError(
+                "Google Cloud BigQuery not installed. Install with: pip install 'eval-protocol[bigquery]'"
+            )
+
+        self.transform_fn = transform_fn
+        self.dataset_id = dataset_id
+        self.location = location
+
+        # Initialize BigQuery client
+        try:
+            client_args = {}
+            if dataset_id:
+                client_args["project"] = dataset_id
+            if credentials_path:
+                credentials = service_account.Credentials.from_service_account_file(credentials_path)
+                client_args["credentials"] = credentials
+            if location:
+                client_args["location"] = location
+
+            client_args.update(client_kwargs)
+            self.client = bigquery.Client(**client_args)
+
+        except DefaultCredentialsError as e:
+            logger.error("Failed to authenticate with BigQuery: %s", e)
+            raise
+        except Exception as e:
+            logger.error("Failed to initialize BigQuery client: %s", e)
+            raise
+
+    def get_evaluation_rows(
+        self,
+        query: str,
+        # Quoted so the annotation is not evaluated at import time: `bigquery` is
+        # unbound (NameError, not ImportError) when the optional dependency is missing.
+        query_params: Optional[List[Union["bigquery.ScalarQueryParameter", "bigquery.ArrayQueryParameter"]]] = None,
+        limit: Optional[int] = None,
+        offset: int = 0,
+        model_name: str = "gpt-3.5-turbo",
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+        **completion_params_kwargs,
+    ) -> Iterator[EvaluationRow]:
+        """Execute BigQuery query and convert results to EvaluationRow format.
+
+        Args:
+            query: SQL query to execute
+            query_params: Optional list of query parameters for parameterized queries
+            limit: Maximum number of rows to return (applied after BigQuery query)
+            offset: Number of rows to skip (applied after BigQuery query)
+            model_name: Model name for completion parameters
+            temperature: Temperature for completion parameters
+            max_tokens: Max tokens for completion parameters
+            **completion_params_kwargs: Additional completion parameters
+
+        Yields:
+            EvaluationRow: Converted evaluation rows
+
+        Raises:
+            NotFound: If the query references non-existent tables/datasets
+            Forbidden: If insufficient permissions
+        """
+        try:
+            # Configure query job
+            job_config = bigquery.QueryJobConfig()
+            if query_params:
+                job_config.query_parameters = query_params
+            if self.location:
+                job_config.location = self.location
+
+            query_job = self.client.query(query, job_config=job_config)
+
+            results = query_job.result()
+
+            completion_params: CompletionParams = {
+                "model": model_name,
+                "temperature": temperature,
+                "max_tokens": max_tokens,
+                **completion_params_kwargs,
+            }
+
+            # Convert rows with offset/limit
+            row_count = 0
+            processed_count = 0
+
+            for raw_row in results:
+                # Apply offset
+                if row_count < offset:
+                    row_count += 1
+                    continue
+
+                # Apply limit
+                if limit is not None and processed_count >= limit:
+                    break
+
+                try:
+                    eval_row = self._convert_row_to_evaluation_row(raw_row, processed_count, completion_params)
+                    if eval_row:
+                        yield eval_row
+                        processed_count += 1
+
+                except (AttributeError, ValueError, KeyError) as e:
+                    logger.warning("Failed to convert row %d: %s", row_count, e)
+
+                row_count += 1
+
+        except (NotFound, Forbidden) as e:
+            logger.error("BigQuery access error: %s", e)
+            raise
+        except Exception as e:
+            logger.error("Error executing BigQuery query: %s", e)
+            raise
+
+    def _convert_row_to_evaluation_row(
+        self,
+        raw_row: Dict[str, Any],
+        row_index: int,
+        completion_params: CompletionParams,
+    ) -> EvaluationRow:
+        """Convert a single BigQuery row to EvaluationRow format.
+
+        Args:
+            raw_row: BigQuery row dictionary
+            row_index: Index of the row in the result set
+            completion_params: Completion parameters to use
+
+        Returns:
+            EvaluationRow built from the transformed row
+
+        Raises:
+            ValueError: If the transform output lacks 'messages' or a message is malformed
+        """
+        # Apply user transformation
+        transformed = self.transform_fn(raw_row)
+
+        # Validate required fields
+        if "messages" not in transformed:
+            raise ValueError("Transform function must return 'messages' field")
+
+        # Convert message dictionaries to Message objects
+        messages = []
+        for msg_dict in transformed["messages"]:
+            if not isinstance(msg_dict, dict):
+                raise ValueError("Each message must be a dictionary")
+            if "role" not in msg_dict:
+                raise ValueError("Each message must have a 'role' field")
+
+            messages.append(
+                Message(
+                    role=msg_dict["role"],
+                    content=msg_dict.get("content"),
+                    name=msg_dict.get("name"),
+                    tool_call_id=msg_dict.get("tool_call_id"),
+                    tool_calls=msg_dict.get("tool_calls"),
+                    function_call=msg_dict.get("function_call"),
+                )
+            )
+
+        # Extract other fields
+        ground_truth = transformed.get("ground_truth")
+        tools = transformed.get("tools")
+        user_metadata = transformed.get("metadata") or {}
+
+        # Create dataset info
+        dataset_info = {
+            "source": "bigquery",
+            "dataset_id": self.dataset_id or self.client.project,
+            "row_index": row_index,
+            "transform_function": (
+                self.transform_fn.__name__ if hasattr(self.transform_fn, "__name__") else "anonymous"
+            ),
+        }
+
+        # Add user metadata
+        dataset_info.update(user_metadata)
+
+        # Add original row data (with prefix to avoid conflicts)
+        for key, value in raw_row.items():
+            # NOTE: values are stored exactly as BigQuery returned them; no type conversion is applied
+            dataset_info[f"original_{key}"] = value
+
+        # Create input metadata (following HuggingFace pattern)
+        input_metadata = InputMetadata(
+            row_id=f"{self.dataset_id or self.client.project}_{row_index}",
+            completion_params=completion_params,
+            dataset_info=dataset_info,
+            session_data={
+                "dataset_source": "bigquery",
+            },
+        )
+
+        return EvaluationRow(
+            messages=messages,
+            tools=tools,
+            input_metadata=input_metadata,
+            ground_truth=str(ground_truth) if ground_truth is not None else None,
+        )
+
+
+def create_bigquery_adapter(
+    transform_fn: TransformFunction,
+    dataset_id: Optional[str] = None,
+    credentials_path: Optional[str] = None,
+    location: Optional[str] = None,
+    **client_kwargs,
+) -> BigQueryAdapter:
+    """Factory function to create a BigQuery adapter.
+
+    Args:
+        transform_fn: Function to transform BigQuery rows to evaluation format
+        dataset_id: Google Cloud project ID
+        credentials_path: Path to service account JSON file
+        location: Default location for BigQuery jobs
+        **client_kwargs: Additional arguments for BigQuery client
+
+    Returns:
+        BigQueryAdapter instance
+    """
+    return BigQueryAdapter(
+        transform_fn=transform_fn,
+        dataset_id=dataset_id,
+        credentials_path=credentials_path,
+        location=location,
+        **client_kwargs,
+    )
diff --git a/examples/adapters/README.md b/examples/adapters/README.md index f51cd387..591bbbb2 100644 --- a/examples/adapters/README.md +++ b/examples/adapters/README.md @@ -43,6 +43,34 @@ Loads datasets from HuggingFace Hub and converts them to EvaluationRow format. pip install 'eval-protocol[huggingface]' ``` +### 3. BigQuery Adapter (`bigquery_example.py`) + +Queries data from Google BigQuery tables and converts them to EvaluationRow format. 
+ +**Features:** +- Execute custom SQL queries against BigQuery datasets +- Support for parameterized queries and batch processing +- Built-in convenience adapters for conversation and Q&A data +- Rich metadata preservation including query information +- Integration with Google Cloud authentication +- Schema introspection and dataset exploration + +**Prerequisites:** +```bash +pip install 'eval-protocol[bigquery]' +``` + +**Environment Variables:** +```bash +export GOOGLE_CLOUD_PROJECT="your-project-id" +export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json" # optional +``` + +**Alternative Authentication:** +```bash +gcloud auth application-default login +``` + ## Running the Examples ### Basic Usage @@ -51,9 +79,12 @@ pip install 'eval-protocol[huggingface]' # Run Langfuse example python examples/adapters/langfuse_example.py -# Run HuggingFace example +# Run HuggingFace example python examples/adapters/huggingface_example.py +# Run BigQuery example +python examples/adapters/bigquery_example.py + # Run GSM8K replacement example python examples/adapters/gsm8k_replacement_example.py ``` @@ -66,6 +97,11 @@ export LANGFUSE_PUBLIC_KEY="pk_..." export LANGFUSE_SECRET_KEY="sk_..." 
python examples/adapters/langfuse_example.py +# Set up Google Cloud credentials for BigQuery +export GOOGLE_CLOUD_PROJECT="your-project-id" +export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json" # optional +python examples/adapters/bigquery_example.py + # HuggingFace works without credentials for public datasets python examples/adapters/huggingface_example.py ``` @@ -100,7 +136,7 @@ def custom_gsm8k_transform(row): from eval_protocol.adapters.huggingface import create_huggingface_adapter custom_adapter = create_huggingface_adapter( dataset_id="gsm8k", - config_name="main", + config_name="main", transform_fn=custom_gsm8k_transform ) ``` @@ -150,7 +186,7 @@ rows = list(adapter.get_evaluation_rows(limit=10)) for row in rows: # Add model response (you would generate this) row.messages.append(Message(role="assistant", content="...")) - + # Evaluate result = math_reward(messages=row.messages, ground_truth=row.ground_truth) print(f"Score: {result.score}") @@ -222,7 +258,7 @@ class MyCustomAdapter: def __init__(self, **config): # Initialize your data source connection pass - + def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]: # Fetch data and convert to EvaluationRow format pass @@ -272,4 +308,4 @@ We welcome contributions of new adapters! Popular integrations that would be val - **File format adapters**: Parquet, Excel, etc. - **Monitoring platform adapters**: DataDog, New Relic, etc. -See the adapter contributing guide for detailed instructions. \ No newline at end of file +See the adapter contributing guide for detailed instructions. 
diff --git a/pyproject.toml b/pyproject.toml index 4026ce9e..73105fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,10 +109,18 @@ huggingface = [ "datasets>=2.0.0", "transformers>=4.0.0", ] +bigquery = [ + "google-cloud-bigquery>=3.0.0", + "google-auth>=2.0.0", + "google-auth-oauthlib>=1.0.0", +] adapters = [ "langfuse>=2.0.0", "datasets>=2.0.0", "transformers>=4.0.0", + "google-cloud-bigquery>=3.0.0", + "google-auth>=2.0.0", + "google-auth-oauthlib>=1.0.0", ] svgbench = [ "selenium>=4.0.0", diff --git a/tests/test_adapters_e2e.py b/tests/test_adapters_e2e.py index a598dff7..72449e8b 100644 --- a/tests/test_adapters_e2e.py +++ b/tests/test_adapters_e2e.py @@ -6,31 +6,34 @@ """ import os -import pytest from datetime import datetime, timedelta -from typing import Dict, Any +from typing import Any, Dict + +import pytest -from eval_protocol.models import EvaluationRow, Message, InputMetadata +from eval_protocol.models import EvaluationRow, InputMetadata, Message class TestLangfuseAdapterE2E: """End-to-end tests for Langfuse adapter with real deployment.""" - + def _get_langfuse_credentials(self): """Get Langfuse credentials from environment.""" public_key = os.getenv("LANGFUSE_PUBLIC_KEY") secret_key = os.getenv("LANGFUSE_SECRET_KEY") host = os.getenv("LANGFUSE_HOST", "https://langfuse-web-prod-zfdbl7ykrq-uc.a.run.app") project_id = os.getenv("LANGFUSE_PROJECT_ID", "cmdj5yxhk0006s6022cyi0prv") - + return public_key, secret_key, host, project_id - + @pytest.mark.skipif( - not all([ - os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - ]), - reason="Langfuse credentials not available in environment" + not all( + [ + os.getenv("LANGFUSE_PUBLIC_KEY"), + os.getenv("LANGFUSE_SECRET_KEY"), + ] + ), + reason="Langfuse credentials not available in environment", ) def test_langfuse_adapter_real_connection(self): """Test that we can connect to real Langfuse deployment and pull data.""" @@ -38,9 +41,9 @@ def 
test_langfuse_adapter_real_connection(self): from eval_protocol.adapters.langfuse import create_langfuse_adapter except ImportError: pytest.skip("Langfuse dependencies not installed") - + public_key, secret_key, host, project_id = self._get_langfuse_credentials() - + # Create adapter adapter = create_langfuse_adapter( public_key=public_key, @@ -48,40 +51,47 @@ def test_langfuse_adapter_real_connection(self): host=host, project_id=project_id, ) - + # Test basic connection by trying to get a small number of traces rows = list(adapter.get_evaluation_rows(limit=3)) - + # Verify we got some data assert isinstance(rows, list), "Should return a list of rows" print(f"Retrieved {len(rows)} evaluation rows from Langfuse") - + # Verify each row is properly formatted for i, row in enumerate(rows): assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" assert isinstance(row.messages, list), f"Row {i} should have messages list" assert len(row.messages) > 0, f"Row {i} should have at least one message" - + # Verify messages are properly formatted for j, msg in enumerate(row.messages): assert isinstance(msg, Message), f"Row {i} message {j} should be Message object" - assert hasattr(msg, 'role'), f"Row {i} message {j} should have role" - assert msg.role in ['user', 'assistant', 'system', 'tool'], f"Row {i} message {j} has invalid role: {msg.role}" - + assert hasattr(msg, "role"), f"Row {i} message {j} should have role" + assert msg.role in [ + "user", + "assistant", + "system", + "tool", + ], f"Row {i} message {j} has invalid role: {msg.role}" + # Verify metadata if row.input_metadata: assert isinstance(row.input_metadata, InputMetadata), f"Row {i} should have InputMetadata" assert row.input_metadata.row_id, f"Row {i} should have row_id" print(f" Row {i}: ID={row.input_metadata.row_id}, Messages={len(row.messages)}") - + print(f" Row {i}: {len(row.messages)} messages, Tools={'Yes' if row.tools else 'No'}") - + @pytest.mark.skipif( - not all([ - 
os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - ]), - reason="Langfuse credentials not available" + not all( + [ + os.getenv("LANGFUSE_PUBLIC_KEY"), + os.getenv("LANGFUSE_SECRET_KEY"), + ] + ), + reason="Langfuse credentials not available", ) def test_langfuse_adapter_with_filters(self): """Test Langfuse adapter with various filters.""" @@ -89,46 +99,52 @@ def test_langfuse_adapter_with_filters(self): from eval_protocol.adapters.langfuse import create_langfuse_adapter except ImportError: pytest.skip("Langfuse dependencies not installed") - + public_key, secret_key, host, project_id = self._get_langfuse_credentials() - + adapter = create_langfuse_adapter( public_key=public_key, secret_key=secret_key, host=host, project_id=project_id, ) - + # Test with time filter (last 7 days) - recent_rows = list(adapter.get_evaluation_rows( - limit=5, - from_timestamp=datetime.now() - timedelta(days=7), - include_tool_calls=True, - )) - + recent_rows = list( + adapter.get_evaluation_rows( + limit=5, + from_timestamp=datetime.now() - timedelta(days=7), + include_tool_calls=True, + ) + ) + print(f"Recent rows (last 7 days): {len(recent_rows)}") - + # Verify tool calling data is preserved tool_calling_rows = [row for row in recent_rows if row.tools] print(f"Rows with tool definitions: {len(tool_calling_rows)}") - + # Test specific filtering try: # This might not return data if no traces match, which is fine - tagged_rows = list(adapter.get_evaluation_rows( - limit=2, - tags=["production"], # May not exist, that's OK - )) + tagged_rows = list( + adapter.get_evaluation_rows( + limit=2, + tags=["production"], # May not exist, that's OK + ) + ) print(f"Tagged rows: {len(tagged_rows)}") except Exception as e: print(f"Tagged query failed (expected if no tags): {e}") - + @pytest.mark.skipif( - not all([ - os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - ]), - reason="Langfuse credentials not available" + not all( + [ + 
os.getenv("LANGFUSE_PUBLIC_KEY"), + os.getenv("LANGFUSE_SECRET_KEY"), + ] + ), + reason="Langfuse credentials not available", ) def test_langfuse_conversation_analysis(self): """Test analysis of conversation types from Langfuse.""" @@ -136,51 +152,51 @@ def test_langfuse_conversation_analysis(self): from eval_protocol.adapters.langfuse import create_langfuse_adapter except ImportError: pytest.skip("Langfuse dependencies not installed") - + public_key, secret_key, host, project_id = self._get_langfuse_credentials() - + adapter = create_langfuse_adapter( public_key=public_key, secret_key=secret_key, host=host, project_id=project_id, ) - + # Get more data for analysis rows = list(adapter.get_evaluation_rows(limit=10, include_tool_calls=True)) - + # Analyze conversation patterns chat_only = [] tool_calling = [] multi_turn = [] - + for row in rows: # Check for tool calling has_tools = ( - row.tools or - any(hasattr(msg, 'tool_calls') and msg.tool_calls for msg in row.messages) or - any(msg.role == 'tool' for msg in row.messages) + row.tools + or any(hasattr(msg, "tool_calls") and msg.tool_calls for msg in row.messages) + or any(msg.role == "tool" for msg in row.messages) ) - + if has_tools: tool_calling.append(row) else: chat_only.append(row) - + # Check for multi-turn conversations if len(row.messages) > 2: # More than user + assistant multi_turn.append(row) - + print(f"Analysis of {len(rows)} conversations:") print(f" Chat-only: {len(chat_only)}") - print(f" Tool calling: {len(tool_calling)}") + print(f" Tool calling: {len(tool_calling)}") print(f" Multi-turn: {len(multi_turn)}") - + # Show example of each type if available if chat_only: row = chat_only[0] print(f" Example chat: {len(row.messages)} messages") - + if tool_calling: row = tool_calling[0] print(f" Example tool calling: {len(row.messages)} messages, {len(row.tools or [])} tools") @@ -188,220 +204,514 @@ def test_langfuse_conversation_analysis(self): class TestHuggingFaceAdapterE2E: """End-to-end tests for 
HuggingFace adapter with real datasets.""" - + def test_gsm8k_adapter_real_data(self): """Test loading real GSM8K data and converting to EvaluationRow.""" try: from eval_protocol.adapters.huggingface import create_huggingface_adapter except ImportError: pytest.skip("HuggingFace dependencies not installed") - + def gsm8k_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Transform GSM8K row to our format.""" return { - 'messages': [ - {'role': 'system', 'content': 'You are a helpful assistant that solves math problems step by step.'}, - {'role': 'user', 'content': row['question']}, + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant that solves math problems step by step.", + }, + {"role": "user", "content": row["question"]}, ], - 'ground_truth': row['answer'], - 'metadata': { - 'dataset': 'gsm8k', - 'original_question': row['question'], - 'original_answer': row['answer'], - } + "ground_truth": row["answer"], + "metadata": { + "dataset": "gsm8k", + "original_question": row["question"], + "original_answer": row["answer"], + }, } - + # Create adapter with transform function adapter = create_huggingface_adapter( dataset_id="gsm8k", config_name="main", transform_fn=gsm8k_transform, ) - + # Test loading data rows = list(adapter.get_evaluation_rows(split="test", limit=5)) - + # Verify we got data assert len(rows) > 0, "Should retrieve some GSM8K data" print(f"Retrieved {len(rows)} GSM8K evaluation rows") - + # Verify each row is properly formatted for i, row in enumerate(rows): assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" assert isinstance(row.messages, list), f"Row {i} should have messages" assert len(row.messages) >= 2, f"Row {i} should have system + user messages" - + # Check system prompt system_msg = row.messages[0] - assert system_msg.role == 'system', f"Row {i} first message should be system" - assert 'math problems' in system_msg.content.lower(), f"Row {i} should have math system prompt" - + assert 
system_msg.role == "system", f"Row {i} first message should be system" + assert "math problems" in system_msg.content.lower(), f"Row {i} should have math system prompt" + # Check user question user_msg = row.messages[1] - assert user_msg.role == 'user', f"Row {i} second message should be user" + assert user_msg.role == "user", f"Row {i} second message should be user" assert len(user_msg.content) > 0, f"Row {i} should have non-empty question" - + # Check ground truth assert row.ground_truth, f"Row {i} should have ground truth answer" - + # Check metadata assert row.input_metadata, f"Row {i} should have metadata" assert row.input_metadata.dataset_info, f"Row {i} should have dataset info" - + print(f" Row {i}: Question length={len(user_msg.content)}, Answer length={len(row.ground_truth)}") - + def test_math_dataset_real_data(self): """Test loading real MATH competition dataset.""" try: from eval_protocol.adapters.huggingface import create_huggingface_adapter except ImportError: pytest.skip("HuggingFace dependencies not installed") - + def math_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Transform MATH dataset row.""" return { - 'messages': [ - {'role': 'system', 'content': 'You are an expert mathematician. Solve this step by step.'}, - {'role': 'user', 'content': row['problem']}, + "messages": [ + {"role": "system", "content": "You are an expert mathematician. 
Solve this step by step."}, + {"role": "user", "content": row["problem"]}, ], - 'ground_truth': row['solution'], - 'metadata': { - 'dataset': 'hendrycks_math', - 'type': row.get('type', 'unknown'), - 'level': row.get('level', 'unknown'), - 'original_problem': row['problem'], - 'original_solution': row['solution'], - } + "ground_truth": row["solution"], + "metadata": { + "dataset": "hendrycks_math", + "type": row.get("type", "unknown"), + "level": row.get("level", "unknown"), + "original_problem": row["problem"], + "original_solution": row["solution"], + }, } - + # Create adapter adapter = create_huggingface_adapter( dataset_id="SuperSecureHuman/competition_math_hf_dataset", transform_fn=math_transform, ) - + # Test loading data rows = list(adapter.get_evaluation_rows(split="test", limit=3)) - + # Verify data assert len(rows) > 0, "Should retrieve MATH dataset data" print(f"Retrieved {len(rows)} MATH dataset evaluation rows") - + for i, row in enumerate(rows): assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" assert len(row.messages) >= 2, f"Row {i} should have system + user messages" assert row.ground_truth, f"Row {i} should have solution" - + # Check for MATH-specific metadata dataset_info = row.input_metadata.dataset_info - assert 'type' in dataset_info, f"Row {i} should have problem type" - assert 'level' in dataset_info, f"Row {i} should have difficulty level" - + assert "type" in dataset_info, f"Row {i} should have problem type" + assert "level" in dataset_info, f"Row {i} should have difficulty level" + print(f" Row {i}: Type={dataset_info.get('type')}, Level={dataset_info.get('level')}") - + def test_custom_dataset_transform(self): """Test adapter with a completely custom transformation.""" try: from eval_protocol.adapters.huggingface import create_huggingface_adapter except ImportError: pytest.skip("HuggingFace dependencies not installed") - + def squad_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Custom transform for SQuAD 
dataset.""" - context = row['context'] - question = row['question'] - answers = row['answers'] - + context = row["context"] + question = row["question"] + answers = row["answers"] + # Get first answer - answer_text = answers['text'][0] if answers['text'] else "No answer" - + answer_text = answers["text"][0] if answers["text"] else "No answer" + return { - 'messages': [ - {'role': 'system', 'content': 'Answer the question based on the given context.'}, - {'role': 'user', 'content': f"Context: {context}\n\nQuestion: {question}"}, + "messages": [ + {"role": "system", "content": "Answer the question based on the given context."}, + {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}, ], - 'ground_truth': answer_text, - 'metadata': { - 'dataset': 'squad', - 'context_length': len(context), - 'question_length': len(question), - 'num_answers': len(answers['text']), - } + "ground_truth": answer_text, + "metadata": { + "dataset": "squad", + "context_length": len(context), + "question_length": len(question), + "num_answers": len(answers["text"]), + }, } - + # Create adapter for SQuAD adapter = create_huggingface_adapter( dataset_id="squad", transform_fn=squad_transform, ) - + # Test loading rows = list(adapter.get_evaluation_rows(split="validation", limit=2)) - + assert len(rows) > 0, "Should retrieve SQuAD data" print(f"Retrieved {len(rows)} SQuAD evaluation rows") - + for i, row in enumerate(rows): assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" - user_msg = next(msg for msg in row.messages if msg.role == 'user') - assert 'Context:' in user_msg.content, f"Row {i} should have context" - assert 'Question:' in user_msg.content, f"Row {i} should have question" - + user_msg = next(msg for msg in row.messages if msg.role == "user") + assert "Context:" in user_msg.content, f"Row {i} should have context" + assert "Question:" in user_msg.content, f"Row {i} should have question" + dataset_info = row.input_metadata.dataset_info print(f" 
Row {i}: Context length={dataset_info.get('context_length')}") +class TestBigQueryAdapterE2E: + """End-to-end tests for BigQuery adapter with real data sources.""" + + def _get_bigquery_credentials(self): + """Get BigQuery credentials from environment.""" + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") + + return project_id, credentials_path + + @pytest.mark.skipif( + not os.getenv("GOOGLE_CLOUD_PROJECT"), reason="Google Cloud project not configured in environment" + ) + def test_bigquery_adapter_real_connection(self): + """Test that we can connect to real BigQuery and execute queries.""" + try: + from eval_protocol.adapters.bigquery import create_bigquery_adapter + except ImportError: + pytest.skip("BigQuery dependencies not installed") + + project_id, credentials_path = self._get_bigquery_credentials() + + # Define a simple transform for testing + def test_transform(row: Dict[str, Any]) -> Dict[str, Any]: + """Transform test query results to evaluation format.""" + return { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": str(row.get("text", "Test query"))}, + ], + "ground_truth": str(row.get("label", "test")), + "metadata": { + "source": "bigquery", + "row_data": dict(row), + }, + } + + # Create adapter + adapter = create_bigquery_adapter( + transform_fn=test_transform, + dataset_id=project_id, + credentials_path=credentials_path, + ) + + # Test with a simple query that should work on any BigQuery project + # Using INFORMATION_SCHEMA which is available in all projects + query = """ + SELECT + 'test_text' as text, + 'test_label' as label, + CURRENT_TIMESTAMP() as created_at, + 1 as id + LIMIT 3 + """ + + # Execute query and get rows + rows = list( + adapter.get_evaluation_rows( + query=query, + limit=2, + model_name="gpt-3.5-turbo", + temperature=0.0, + ) + ) + + # Verify we got data + assert len(rows) > 0, "Should retrieve data from 
BigQuery" + print(f"Retrieved {len(rows)} evaluation rows from BigQuery") + + # Verify each row is properly formatted + for i, row in enumerate(rows): + assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" + assert isinstance(row.messages, list), f"Row {i} should have messages list" + assert len(row.messages) >= 2, f"Row {i} should have system + user messages" + + # Check system and user messages + system_msg = row.messages[0] + user_msg = row.messages[1] + assert system_msg.role == "system", f"Row {i} first message should be system" + assert user_msg.role == "user", f"Row {i} second message should be user" + + # Verify metadata + assert row.input_metadata, f"Row {i} should have metadata" + assert row.input_metadata.row_id, f"Row {i} should have row_id" + + # Check BigQuery-specific metadata + dataset_info = row.input_metadata.dataset_info + assert dataset_info["source"] == "bigquery", f"Row {i} should have BigQuery source" + + print(f" Row {i}: ID={row.input_metadata.row_id}, Messages={len(row.messages)}") + + @pytest.mark.skipif(not os.getenv("GOOGLE_CLOUD_PROJECT"), reason="Google Cloud project not configured") + def test_bigquery_advanced_features(self): + """Test advanced BigQuery adapter features like parameterized queries.""" + try: + from google.cloud import bigquery + + from eval_protocol.adapters.bigquery import create_bigquery_adapter + except ImportError: + pytest.skip("BigQuery dependencies not installed") + + project_id, credentials_path = self._get_bigquery_credentials() + + def transform_fn(row): + return { + "messages": [{"role": "user", "content": str(row["content"])}], + "ground_truth": str(row["label"]), + "metadata": {"category": row.get("category", "unknown")}, + } + + adapter = create_bigquery_adapter( + transform_fn=transform_fn, + dataset_id=project_id, + credentials_path=credentials_path, + ) + + # Test parameterized query + query = """ + SELECT + @prefix || ' example content' as content, + 'test_label' as label, + 
@category as category + """ + + query_params = [ + bigquery.ScalarQueryParameter("prefix", "STRING", "BigQuery"), + bigquery.ScalarQueryParameter("category", "STRING", "test_data"), + ] + + rows = list( + adapter.get_evaluation_rows( + query=query, + query_params=query_params, + limit=1, + ) + ) + + assert len(rows) == 1, "Should retrieve parameterized query result" + row = rows[0] + + user_msg = row.messages[0] + assert "BigQuery example content" in user_msg.content + assert row.ground_truth == "test_label" + + print(f"Parameterized query test: '{user_msg.content}' -> '{row.ground_truth}'") + + @pytest.mark.skipif( + not os.getenv("GOOGLE_CLOUD_PROJECT"), reason="Google Cloud project required to query public datasets" + ) + def test_bigquery_public_dataset_google_books_ngrams(self): + """Test BigQuery adapter with a public dataset to test specific logic.""" + try: + from eval_protocol.adapters.bigquery import create_bigquery_adapter + except ImportError: + pytest.skip("BigQuery dependencies not installed") + + # Get user's project credentials (needed to run the query job) + project_id, credentials_path = self._get_bigquery_credentials() + + def google_books_transform(row: Dict[str, Any]) -> Dict[str, Any]: + """Transform Google Books ngrams data to evaluation format.""" + term = str(row.get("term", "")) + term_frequency = row.get("term_frequency", 0) + document_frequency = row.get("document_frequency", 0) + tokens = row.get("tokens", []) # This is a REPEATED field (array) + has_tag = row.get("has_tag", False) + years = row.get("years", []) # This is a REPEATED RECORD (array of objects) + + # Create an educational question about the term + system_prompt = ( + """You are a linguistics expert who helps explain word usage patterns from Google Books data.""" + ) + + # Create a question about the term's usage + if tokens and len(tokens) > 0: + tokens_str = ", ".join(str(token) for token in tokens[:3]) # Take first 3 tokens + question = f"What can you tell me about the 
term '{term}' and its linguistic tokens: {tokens_str}?" + else: + question = f"What can you tell me about the Chinese term '{term}' based on its usage patterns?" + + # Create ground truth based on frequency data + frequency_desc = ( + "high frequency" + if term_frequency > 1000 + else "moderate frequency" if term_frequency > 100 else "low frequency" + ) + document_desc = ( + f"appears in {document_frequency} documents" if document_frequency > 0 else "rare occurrence" + ) + + ground_truth = ( + f"The term '{term}' has {frequency_desc} usage ({term_frequency} occurrences) and {document_desc}." + ) + if has_tag: + ground_truth += " This term has special linguistic tags." + + return { + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": question}, + ], + "ground_truth": ground_truth, + "metadata": { + "dataset": "google_books_ngrams_chi_sim", + "term": term, + "term_frequency": term_frequency, + "document_frequency": document_frequency, + "num_tokens": len(tokens) if tokens else 0, + "has_tag": has_tag, + "num_year_records": len(years) if years else 0, + "tokens_sample": tokens[:3] if tokens else [], # Store first 3 tokens as sample + }, + } + + # Create adapter - use YOUR project to run the job, but query PUBLIC data + adapter = create_bigquery_adapter( + transform_fn=google_books_transform, + dataset_id=project_id, # YOUR project (to run the job) + credentials_path=credentials_path, + ) + + # Query the public Google Books ngrams dataset (full table reference in SQL) + query = """ + SELECT + term, + term_frequency, + document_frequency, + tokens, + has_tag, + years + FROM `bigquery-public-data.google_books_ngrams_2020.chi_sim_1` + WHERE term_frequency > 100 + AND document_frequency > 5 + AND LENGTH(term) >= 2 + ORDER BY term_frequency DESC + LIMIT 10 + """ + + # Execute query and get rows + rows = list( + adapter.get_evaluation_rows( + query=query, + limit=3, + model_name="gpt-4", + temperature=0.0, + ) + ) + + # Verify we got 
data + assert len(rows) > 0, "Should retrieve data from Google Books ngrams dataset" + print(f"Retrieved {len(rows)} evaluation rows from Google Books ngrams") + + # Verify each row is properly formatted + for i, row in enumerate(rows): + assert isinstance(row, EvaluationRow), f"Row {i} should be EvaluationRow" + assert isinstance(row.messages, list), f"Row {i} should have messages list" + assert len(row.messages) >= 2, f"Row {i} should have system + user messages" + + # Check message content + system_msg = row.messages[0] + user_msg = row.messages[1] + assert system_msg.role == "system", f"Row {i} first message should be system" + assert user_msg.role == "user", f"Row {i} second message should be user" + assert "linguistics expert" in system_msg.content, f"Row {i} should have linguistics system prompt" + assert "term" in user_msg.content, f"Row {i} should ask about the term" + + # Verify ground truth + assert row.ground_truth, f"Row {i} should have ground truth" + assert "frequency" in row.ground_truth, f"Row {i} should mention frequency" + + # Verify metadata + assert row.input_metadata, f"Row {i} should have metadata" + dataset_info = row.input_metadata.dataset_info + assert dataset_info["dataset"] == "google_books_ngrams_chi_sim", f"Row {i} should have correct dataset" + assert "term" in dataset_info, f"Row {i} should have term in metadata" + assert "term_frequency" in dataset_info, f"Row {i} should have frequency in metadata" + assert "num_tokens" in dataset_info, f"Row {i} should have token count in metadata" + + # Check repeated fields handling + term = dataset_info["term"] + term_freq = dataset_info["term_frequency"] + doc_freq = dataset_info["document_frequency"] + num_tokens = dataset_info["num_tokens"] + + print(f" Row {i}: Term='{term}', Frequency={term_freq}, Docs={doc_freq}, Tokens={num_tokens}") + + # Verify filtering worked (should have high frequency terms) + assert term_freq > 100, f"Row {i} should have term frequency > 100" + assert doc_freq > 5, 
f"Row {i} should have document frequency > 5" + + def test_adapters_integration(): """Test that adapters work with evaluation pipeline.""" print("Testing adapter integration with evaluation pipeline...") - + # This test doesn't require external credentials try: from eval_protocol.adapters.huggingface import create_huggingface_adapter from eval_protocol.rewards.accuracy import accuracy_reward except ImportError as e: pytest.skip(f"Dependencies not available: {e}") - + def simple_transform(row: Dict[str, Any]) -> Dict[str, Any]: """Simple transform for testing.""" return { - 'messages': [ - {'role': 'user', 'content': row['question']}, - {'role': 'assistant', 'content': 'Test response'}, # Simulated response + "messages": [ + {"role": "user", "content": row["question"]}, + {"role": "assistant", "content": "Test response"}, # Simulated response ], - 'ground_truth': row['answer'], - 'metadata': {'test': True} + "ground_truth": row["answer"], + "metadata": {"test": True}, } - + # Create adapter with GSM8K (small sample) adapter = create_huggingface_adapter( dataset_id="gsm8k", - config_name="main", + config_name="main", transform_fn=simple_transform, ) - + # Get one row rows = list(adapter.get_evaluation_rows(split="test", limit=1)) assert len(rows) == 1, "Should get exactly one row" - + row = rows[0] - + # Test evaluation result = accuracy_reward( messages=row.messages, ground_truth=row.ground_truth, ) - - assert hasattr(result, 'score'), "Should have evaluation score" + + assert hasattr(result, "score"), "Should have evaluation score" assert 0 <= result.score <= 1, "Score should be between 0 and 1" - + print(f"Integration test successful: Score={result.score}") if __name__ == "__main__": # Run tests manually for development import sys - + print("Running Langfuse E2E tests...") if all([os.getenv("LANGFUSE_PUBLIC_KEY"), os.getenv("LANGFUSE_SECRET_KEY")]): try: @@ -415,20 +725,20 @@ def simple_transform(row: Dict[str, Any]) -> Dict[str, Any]: print(" This is expected if 
Langfuse API has changed - the adapter needs updating") else: print("⚠️ Skipping Langfuse tests (credentials not available)") - + print("\nRunning HuggingFace E2E tests...") try: test_hf = TestHuggingFaceAdapterE2E() test_hf.test_gsm8k_adapter_real_data() print("✅ GSM8K adapter test passed!") - + # Skip MATH dataset test for now (dataset may not be available) try: test_hf.test_math_dataset_real_data() print("✅ MATH dataset test passed!") except Exception as e: print(f"⚠️ MATH dataset test failed (dataset may not be available): {e}") - + # Skip SQuAD test for now (focus on core functionality) try: test_hf.test_custom_dataset_transform() @@ -439,9 +749,15 @@ def simple_transform(row: Dict[str, Any]) -> Dict[str, Any]: except Exception as e: print(f"❌ HuggingFace tests failed: {e}") sys.exit(1) - - print("\nRunning integration test...") - test_adapters_integration() - print("✅ Integration test passed!") - - print("\n🎉 All E2E tests completed successfully!") \ No newline at end of file + + print("\nRunning BigQuery E2E test...") + try: + test_bq = TestBigQueryAdapterE2E() + # Only test the public Google Books ngrams dataset (no auth required) + test_bq.test_bigquery_public_dataset_google_books_ngrams() + print("✅ BigQuery Google Books ngrams test passed!") + + except Exception as e: + print(f"❌ BigQuery test failed: {e}") + + print("\n🎉 BigQuery E2E test completed successfully!") diff --git a/uv.lock b/uv.lock index d439a47a..0c27bafe 100644 --- a/uv.lock +++ b/uv.lock @@ -585,6 +585,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950, upload-time = "2024-10-06T17:22:23.299Z" }, ] +[[package]] +name = "cachetools" +version = "5.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload-time = "2025-02-20T21:01:19.524Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, +] + [[package]] name = "certifi" version = "2025.7.14" @@ -1158,9 +1167,17 @@ dependencies = [ [package.optional-dependencies] adapters = [ { name = "datasets" }, + { name = "google-auth" }, + { name = "google-auth-oauthlib" }, + { name = "google-cloud-bigquery" }, { name = "langfuse" }, { name = "transformers" }, ] +bigquery = [ + { name = "google-auth" }, + { name = "google-auth-oauthlib" }, + { name = "google-cloud-bigquery" }, +] box2d = [ { name = "gymnasium", extra = ["box2d"] }, { name = "pillow" }, @@ -1248,6 +1265,12 @@ requires-dist = [ { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" }, { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" }, { name = "fsspec" }, + { name = "google-auth", marker = "extra == 'adapters'", specifier = ">=2.0.0" }, + { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, + { name = "google-auth-oauthlib", marker = "extra == 'adapters'", specifier = ">=1.0.0" }, + { name = "google-auth-oauthlib", marker = "extra == 'bigquery'", specifier = ">=1.0.0" }, + { name = "google-cloud-bigquery", marker = "extra == 'adapters'", specifier = ">=3.0.0" }, + { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, { name = "gymnasium", specifier = ">=0.29.0" }, { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = 
"haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, @@ -1304,7 +1327,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "svgbench"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "bigquery", "adapters", "svgbench"] [package.metadata.requires-dev] dev = [ @@ -1612,6 +1635,133 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "google-api-core" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = "2025-06-12T20:52:20.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" }, +] + +[package.optional-dependencies] +grpc = [ + { name = "grpcio" }, + { name = "grpcio-status" }, +] + +[[package]] +name = "google-auth" +version = "2.40.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = 
"sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, +] + +[[package]] +name = "google-auth-oauthlib" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "requests-oauthlib" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = "sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload-time = "2025-04-22T16:40:29.172Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload-time = "2025-04-22T16:40:28.174Z" }, +] + +[[package]] +name = "google-cloud-bigquery" +version = "3.35.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core", extra = ["grpc"] }, + { name = "google-auth" }, + { name = "google-cloud-core" }, + { name = "google-resumable-media" }, + { name = "packaging" }, + { name = "python-dateutil" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/e4/9cf03fa81fefd1b9811a7cd6e398804ae0de3b6a4edef810e2acd45cabbc/google_cloud_bigquery-3.35.1.tar.gz", hash = "sha256:599f26cacf190acfe88000f6cc5f4bc9e6baac7899e4f406ca054f1906f71960", size = 496433, upload-time = "2025-07-24T15:09:04.108Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/63/50/96fe9bc5b83d3a421e91ed8edc2535de45957e9af398273e3ecb5c3a1094/google_cloud_bigquery-3.35.1-py3-none-any.whl", hash = "sha256:6739a6ba63c6d80735ca2b34b1df2090ff473b80c1a62354caa2debe6dbbd961", size = 256877, upload-time = "2025-07-24T15:09:02.443Z" }, +] + +[[package]] +name = "google-cloud-core" +version = "2.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/b8/2b53838d2acd6ec6168fd284a990c76695e84c65deee79c9f3a4276f6b4f/google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53", size = 35861, upload-time = "2025-03-10T21:05:38.948Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e", size = 29348, upload-time = "2025-03-10T21:05:37.785Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/69/b1b05cf415df0d86691d6a8b4b7e60ab3a6fb6efb783ee5cd3ed1382bfd3/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76", size = 30467, upload-time = "2025-03-26T14:31:11.92Z" }, + { url = 
"https://files.pythonhosted.org/packages/44/3d/92f8928ecd671bd5b071756596971c79d252d09b835cdca5a44177fa87aa/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d", size = 30311, upload-time = "2025-03-26T14:53:14.161Z" }, + { url = "https://files.pythonhosted.org/packages/33/42/c2d15a73df79d45ed6b430b9e801d0bd8e28ac139a9012d7d58af50a385d/google_crc32c-1.7.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c67ca0a1f5b56162951a9dae987988679a7db682d6f97ce0f6381ebf0fbea4c", size = 37889, upload-time = "2025-03-26T14:41:27.83Z" }, + { url = "https://files.pythonhosted.org/packages/57/ea/ac59c86a3c694afd117bb669bde32aaf17d0de4305d01d706495f09cbf19/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc5319db92daa516b653600794d5b9f9439a9a121f3e162f94b0e1891c7933cb", size = 33028, upload-time = "2025-03-26T14:41:29.141Z" }, + { url = "https://files.pythonhosted.org/packages/60/44/87e77e8476767a4a93f6cf271157c6d948eacec63688c093580af13b04be/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcdf5a64adb747610140572ed18d011896e3b9ae5195f2514b7ff678c80f1603", size = 38026, upload-time = "2025-03-26T14:41:29.921Z" }, + { url = "https://files.pythonhosted.org/packages/c8/bf/21ac7bb305cd7c1a6de9c52f71db0868e104a5b573a4977cd9d0ff830f82/google_crc32c-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:754561c6c66e89d55754106739e22fdaa93fafa8da7221b29c8b8e8270c6ec8a", size = 33476, upload-time = "2025-03-26T14:29:09.086Z" }, + { url = "https://files.pythonhosted.org/packages/f7/94/220139ea87822b6fdfdab4fb9ba81b3fff7ea2c82e2af34adc726085bffc/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6fbab4b935989e2c3610371963ba1b86afb09537fd0c633049be82afe153ac06", size = 30468, upload-time = "2025-03-26T14:32:52.215Z" }, + { url = 
"https://files.pythonhosted.org/packages/94/97/789b23bdeeb9d15dc2904660463ad539d0318286d7633fe2760c10ed0c1c/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ed66cbe1ed9cbaaad9392b5259b3eba4a9e565420d734e6238813c428c3336c9", size = 30313, upload-time = "2025-03-26T14:57:38.758Z" }, + { url = "https://files.pythonhosted.org/packages/81/b8/976a2b843610c211e7ccb3e248996a61e87dbb2c09b1499847e295080aec/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee6547b657621b6cbed3562ea7826c3e11cab01cd33b74e1f677690652883e77", size = 33048, upload-time = "2025-03-26T14:41:30.679Z" }, + { url = "https://files.pythonhosted.org/packages/c9/16/a3842c2cf591093b111d4a5e2bfb478ac6692d02f1b386d2a33283a19dc9/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d68e17bad8f7dd9a49181a1f5a8f4b251c6dbc8cc96fb79f1d321dfd57d66f53", size = 32669, upload-time = "2025-03-26T14:41:31.432Z" }, + { url = "https://files.pythonhosted.org/packages/04/17/ed9aba495916fcf5fe4ecb2267ceb851fc5f273c4e4625ae453350cfd564/google_crc32c-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:6335de12921f06e1f774d0dd1fbea6bf610abe0887a1638f64d694013138be5d", size = 33476, upload-time = "2025-03-26T14:29:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, + { url = "https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, + { url = 
"https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, + { url = "https://files.pythonhosted.org/packages/3b/a5/7279055cf004561894ed3a7bfdf5bf90a53f28fadd01af7cd166e88ddf16/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d86616faaea68101195c6bdc40c494e4d76f41e07a37ffdef270879c15fb65", size = 32794, upload-time = "2025-03-26T14:41:33.264Z" }, + { url = "https://files.pythonhosted.org/packages/0f/d6/77060dbd140c624e42ae3ece3df53b9d811000729a5c821b9fd671ceaac6/google_crc32c-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:b7491bdc0c7564fcf48c0179d2048ab2f7c7ba36b84ccd3a3e1c3f7a72d3bba6", size = 33477, upload-time = "2025-03-26T14:29:10.94Z" }, + { url = "https://files.pythonhosted.org/packages/8b/72/b8d785e9184ba6297a8620c8a37cf6e39b81a8ca01bb0796d7cbb28b3386/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:df8b38bdaf1629d62d51be8bdd04888f37c451564c2042d36e5812da9eff3c35", size = 30467, upload-time = "2025-03-26T14:36:06.909Z" }, + { url = "https://files.pythonhosted.org/packages/34/25/5f18076968212067c4e8ea95bf3b69669f9fc698476e5f5eb97d5b37999f/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:e42e20a83a29aa2709a0cf271c7f8aefaa23b7ab52e53b322585297bb94d4638", size = 30309, upload-time = "2025-03-26T15:06:15.318Z" }, + { url = "https://files.pythonhosted.org/packages/92/83/9228fe65bf70e93e419f38bdf6c5ca5083fc6d32886ee79b450ceefd1dbd/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905a385140bf492ac300026717af339790921f411c0dfd9aa5a9e69a08ed32eb", size = 33133, upload-time = "2025-03-26T14:41:34.388Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6", size = 32773, upload-time = "2025-03-26T14:41:35.19Z" }, + { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, + { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, + { url = "https://files.pythonhosted.org/packages/0b/43/31e57ce04530794917dfe25243860ec141de9fadf4aa9783dffe7dac7c39/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8e9afc74168b0b2232fb32dd202c93e46b7d5e4bf03e66ba5dc273bb3559589", size = 28242, upload-time = "2025-03-26T14:41:42.858Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/8b84cd4e0ad111e63e30eb89453f8dd308e3ad36f42305cf8c202461cdf0/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa8136cc14dd27f34a3221c0f16fd42d8a40e4778273e61a3c19aedaa44daf6b", size = 28049, upload-time = 
"2025-03-26T14:41:44.651Z" }, + { url = "https://files.pythonhosted.org/packages/16/1b/1693372bf423ada422f80fd88260dbfd140754adb15cbc4d7e9a68b1cb8e/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85fef7fae11494e747c9fd1359a527e5970fc9603c90764843caabd3a16a0a48", size = 28241, upload-time = "2025-03-26T14:41:45.898Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3c/2a19a60a473de48717b4efb19398c3f914795b64a96cf3fbe82588044f78/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efb97eb4369d52593ad6f75e7e10d053cf00c48983f7a973105bc70b0ac4d82", size = 28048, upload-time = "2025-03-26T14:41:46.696Z" }, +] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-crc32c" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099, upload-time = "2024-08-07T22:20:38.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251, upload-time = "2024-08-07T22:20:36.409Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -1723,6 +1873,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, ] +[[package]] +name = "grpcio-status" +version = "1.71.2" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/d1/b6e9877fedae3add1afdeae1f89d1927d296da9cf977eca0eb08fb8a460e/grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50", size = 13677, upload-time = "2025-06-28T04:24:05.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/58/317b0134129b556a93a3b0afe00ee675b5657f0155509e22fcb853bafe2d/grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3", size = 14424, upload-time = "2025-06-28T04:23:42.136Z" }, +] + [[package]] name = "grpclib" version = "0.4.8" @@ -3555,6 +3719,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265, upload-time = "2024-10-01T17:00:38.172Z" }, ] +[[package]] +name = "oauthlib" +version = "3.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, +] + [[package]] name = "omegaconf" version = "2.3.0" @@ -4203,6 +4376,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, +] + [[package]] name = "protobuf" version = "5.29.3" @@ -4293,6 +4478,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = 
"sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pycares" version = "4.9.0" @@ -4972,6 +5178,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, ] +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "oauthlib" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = 
"sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" }, +] + [[package]] name = "requests-toolbelt" version = "1.0.0" @@ -5316,6 +5535,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, ] +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, +] + [[package]] name = "ruff" version = "0.9.10" From f3fd112222d6b297cc486c18f921d39ae4b080b5 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Sat, 16 Aug 2025 16:53:48 -0700 Subject: [PATCH 23/26] fix the message parsing when there is no content only tool call (#88) --- eval_protocol/mcp/execution/base_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index 22bad57e..819b33dd 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -182,7 +182,7 @@ async def _generate_live_tool_calls( # This is crucial for proper tool call ID management in 
add_tool_response assistant_message_for_history = { "role": "assistant", - "content": response["choices"][0]["message"]["content"], + "content": response["choices"][0]["message"].get("content", ""), } usage_stats = CompletionUsage( prompt_tokens=response["usage"]["prompt_tokens"], From 766e47c3f9976a3978058999562414b5cf8c7b2e Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Sun, 17 Aug 2025 11:11:20 -0700 Subject: [PATCH 24/26] remove dataset summary / vite build (#89) --- .../{index-CbFeqRvW.js => index-BXWyy1QT.js} | 42 +++++++++---------- ...-CbFeqRvW.js.map => index-BXWyy1QT.js.map} | 2 +- vite-app/dist/index.html | 2 +- vite-app/src/components/Dashboard.tsx | 11 ----- 4 files changed, 23 insertions(+), 34 deletions(-) rename vite-app/dist/assets/{index-CbFeqRvW.js => index-BXWyy1QT.js} (89%) rename vite-app/dist/assets/{index-CbFeqRvW.js.map => index-BXWyy1QT.js.map} (68%) diff --git a/vite-app/dist/assets/index-CbFeqRvW.js b/vite-app/dist/assets/index-BXWyy1QT.js similarity index 89% rename from vite-app/dist/assets/index-CbFeqRvW.js rename to vite-app/dist/assets/index-BXWyy1QT.js index 37226d6f..82b53bca 100644 --- a/vite-app/dist/assets/index-CbFeqRvW.js +++ b/vite-app/dist/assets/index-BXWyy1QT.js @@ -6,7 +6,7 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
- */var ab;function l1(){if(ab)return ho;ab=1;var e=Symbol.for("react.transitional.element"),t=Symbol.for("react.fragment");function A(n,i,a){var o=null;if(a!==void 0&&(o=""+a),i.key!==void 0&&(o=""+i.key),"key"in i){a={};for(var u in i)u!=="key"&&(a[u]=i[u])}else a=i;return i=a.ref,{$$typeof:e,type:n,key:o,ref:i!==void 0?i:null,props:a}}return ho.Fragment=t,ho.jsx=A,ho.jsxs=A,ho}var ob;function u1(){return ob||(ob=1,_g.exports=l1()),_g.exports}var x=u1(),Qg={exports:{}},Ct={};/** + */var ab;function l1(){if(ab)return ho;ab=1;var e=Symbol.for("react.transitional.element"),t=Symbol.for("react.fragment");function A(n,i,a){var o=null;if(a!==void 0&&(o=""+a),i.key!==void 0&&(o=""+i.key),"key"in i){a={};for(var u in i)u!=="key"&&(a[u]=i[u])}else a=i;return i=a.ref,{$$typeof:e,type:n,key:o,ref:i!==void 0?i:null,props:a}}return ho.Fragment=t,ho.jsx=A,ho.jsxs=A,ho}var ob;function u1(){return ob||(ob=1,_g.exports=l1()),_g.exports}var U=u1(),Qg={exports:{}},Ct={};/** * @license React * react.production.js * @@ -14,7 +14,7 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
- */var lb;function c1(){if(lb)return Ct;lb=1;var e=Symbol.for("react.transitional.element"),t=Symbol.for("react.portal"),A=Symbol.for("react.fragment"),n=Symbol.for("react.strict_mode"),i=Symbol.for("react.profiler"),a=Symbol.for("react.consumer"),o=Symbol.for("react.context"),u=Symbol.for("react.forward_ref"),c=Symbol.for("react.suspense"),h=Symbol.for("react.memo"),g=Symbol.for("react.lazy"),B=Symbol.iterator;function m(H){return H===null||typeof H!="object"?null:(H=B&&H[B]||H["@@iterator"],typeof H=="function"?H:null)}var v={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},b=Object.assign,y={};function _(H,X,tt){this.props=H,this.context=X,this.refs=y,this.updater=tt||v}_.prototype.isReactComponent={},_.prototype.setState=function(H,X){if(typeof H!="object"&&typeof H!="function"&&H!=null)throw Error("takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,H,X,"setState")},_.prototype.forceUpdate=function(H){this.updater.enqueueForceUpdate(this,H,"forceUpdate")};function C(){}C.prototype=_.prototype;function U(H,X,tt){this.props=H,this.context=X,this.refs=y,this.updater=tt||v}var E=U.prototype=new C;E.constructor=U,b(E,_.prototype),E.isPureReactComponent=!0;var O=Array.isArray,F={H:null,A:null,T:null,S:null,V:null},T=Object.prototype.hasOwnProperty;function k(H,X,tt,q,Y,ot){return tt=ot.ref,{$$typeof:e,type:H,key:X,ref:tt!==void 0?tt:null,props:ot}}function P(H,X){return k(H.type,X,void 0,void 0,void 0,H.props)}function N(H){return typeof H=="object"&&H!==null&&H.$$typeof===e}function J(H){var X={"=":"=0",":":"=2"};return"$"+H.replace(/[=:]/g,function(tt){return X[tt]})}var et=/\/+/g;function nt(H,X){return typeof H=="object"&&H!==null&&H.key!=null?J(""+H.key):X.toString(36)}function ct(){}function bt(H){switch(H.status){case"fulfilled":return H.value;case"rejected":throw H.reason;default:switch(typeof 
H.status=="string"?H.then(ct,ct):(H.status="pending",H.then(function(X){H.status==="pending"&&(H.status="fulfilled",H.value=X)},function(X){H.status==="pending"&&(H.status="rejected",H.reason=X)})),H.status){case"fulfilled":return H.value;case"rejected":throw H.reason}}throw H}function ut(H,X,tt,q,Y){var ot=typeof H;(ot==="undefined"||ot==="boolean")&&(H=null);var st=!1;if(H===null)st=!0;else switch(ot){case"bigint":case"string":case"number":st=!0;break;case"object":switch(H.$$typeof){case e:case t:st=!0;break;case g:return st=H._init,ut(st(H._payload),X,tt,q,Y)}}if(st)return Y=Y(H),st=q===""?"."+nt(H,0):q,O(Y)?(tt="",st!=null&&(tt=st.replace(et,"$&/")+"/"),ut(Y,X,tt,"",function(FA){return FA})):Y!=null&&(N(Y)&&(Y=P(Y,tt+(Y.key==null||H&&H.key===Y.key?"":(""+Y.key).replace(et,"$&/")+"/")+st)),X.push(Y)),1;st=0;var be=q===""?".":q+":";if(O(H))for(var qt=0;qt>>1,H=I[ht];if(0>>1;hti(q,rt))Yi(ot,q)?(I[ht]=ot,I[Y]=rt,ht=Y):(I[ht]=q,I[tt]=rt,ht=tt);else if(Yi(ot,rt))I[ht]=ot,I[Y]=rt,ht=Y;else break t}}return W}function i(I,W){var rt=I.sortIndex-W.sortIndex;return rt!==0?rt:I.id-W.id}if(e.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var a=performance;e.unstable_now=function(){return a.now()}}else{var o=Date,u=o.now();e.unstable_now=function(){return o.now()-u}}var c=[],h=[],g=1,B=null,m=3,v=!1,b=!1,y=!1,_=!1,C=typeof setTimeout=="function"?setTimeout:null,U=typeof clearTimeout=="function"?clearTimeout:null,E=typeof setImmediate<"u"?setImmediate:null;function O(I){for(var W=A(h);W!==null;){if(W.callback===null)n(h);else if(W.startTime<=I)n(h),W.sortIndex=W.expirationTime,t(c,W);else break;W=A(h)}}function F(I){if(y=!1,O(I),!b)if(A(c)!==null)b=!0,T||(T=!0,nt());else{var W=A(h);W!==null&&ut(F,W.startTime-I)}}var T=!1,k=-1,P=5,N=-1;function J(){return _?!0:!(e.unstable_now()-NI&&J());){var ht=B.callback;if(typeof ht=="function"){B.callback=null,m=B.priorityLevel;var H=ht(B.expirationTime<=I);if(I=e.unstable_now(),typeof 
H=="function"){B.callback=H,O(I),W=!0;break e}B===A(c)&&n(c),O(I)}else n(c);B=A(c)}if(B!==null)W=!0;else{var X=A(h);X!==null&&ut(F,X.startTime-I),W=!1}}break t}finally{B=null,m=rt,v=!1}W=void 0}}finally{W?nt():T=!1}}}var nt;if(typeof E=="function")nt=function(){E(et)};else if(typeof MessageChannel<"u"){var ct=new MessageChannel,bt=ct.port2;ct.port1.onmessage=et,nt=function(){bt.postMessage(null)}}else nt=function(){C(et,0)};function ut(I,W){k=C(function(){I(e.unstable_now())},W)}e.unstable_IdlePriority=5,e.unstable_ImmediatePriority=1,e.unstable_LowPriority=4,e.unstable_NormalPriority=3,e.unstable_Profiling=null,e.unstable_UserBlockingPriority=2,e.unstable_cancelCallback=function(I){I.callback=null},e.unstable_forceFrameRate=function(I){0>I||125ht?(I.sortIndex=rt,t(h,I),A(c)===null&&I===A(h)&&(y?(U(k),k=-1):y=!0,ut(F,rt-ht))):(I.sortIndex=H,t(c,I),b||v||(b=!0,T||(T=!0,nt()))),I},e.unstable_shouldYield=J,e.unstable_wrapCallback=function(I){var W=m;return function(){var rt=m;m=W;try{return I.apply(this,arguments)}finally{m=rt}}}}(Fg)),Fg}var fb;function h1(){return fb||(fb=1,Ug.exports=f1()),Ug.exports}var Eg={exports:{}},Ve={};/** + */var cb;function f1(){return cb||(cb=1,function(e){function t(I,W){var rt=I.length;I.push(W);t:for(;0>>1,H=I[ht];if(0>>1;hti(q,rt))Yi(ot,q)?(I[ht]=ot,I[Y]=rt,ht=Y):(I[ht]=q,I[tt]=rt,ht=tt);else if(Yi(ot,rt))I[ht]=ot,I[Y]=rt,ht=Y;else break t}}return W}function i(I,W){var rt=I.sortIndex-W.sortIndex;return rt!==0?rt:I.id-W.id}if(e.unstable_now=void 0,typeof performance=="object"&&typeof performance.now=="function"){var a=performance;e.unstable_now=function(){return a.now()}}else{var o=Date,u=o.now();e.unstable_now=function(){return o.now()-u}}var c=[],h=[],g=1,B=null,m=3,v=!1,b=!1,y=!1,_=!1,C=typeof setTimeout=="function"?setTimeout:null,x=typeof clearTimeout=="function"?clearTimeout:null,E=typeof setImmediate<"u"?setImmediate:null;function O(I){for(var W=A(h);W!==null;){if(W.callback===null)n(h);else 
if(W.startTime<=I)n(h),W.sortIndex=W.expirationTime,t(c,W);else break;W=A(h)}}function F(I){if(y=!1,O(I),!b)if(A(c)!==null)b=!0,T||(T=!0,nt());else{var W=A(h);W!==null&&ut(F,W.startTime-I)}}var T=!1,k=-1,P=5,N=-1;function J(){return _?!0:!(e.unstable_now()-NI&&J());){var ht=B.callback;if(typeof ht=="function"){B.callback=null,m=B.priorityLevel;var H=ht(B.expirationTime<=I);if(I=e.unstable_now(),typeof H=="function"){B.callback=H,O(I),W=!0;break e}B===A(c)&&n(c),O(I)}else n(c);B=A(c)}if(B!==null)W=!0;else{var X=A(h);X!==null&&ut(F,X.startTime-I),W=!1}}break t}finally{B=null,m=rt,v=!1}W=void 0}}finally{W?nt():T=!1}}}var nt;if(typeof E=="function")nt=function(){E(et)};else if(typeof MessageChannel<"u"){var ct=new MessageChannel,bt=ct.port2;ct.port1.onmessage=et,nt=function(){bt.postMessage(null)}}else nt=function(){C(et,0)};function ut(I,W){k=C(function(){I(e.unstable_now())},W)}e.unstable_IdlePriority=5,e.unstable_ImmediatePriority=1,e.unstable_LowPriority=4,e.unstable_NormalPriority=3,e.unstable_Profiling=null,e.unstable_UserBlockingPriority=2,e.unstable_cancelCallback=function(I){I.callback=null},e.unstable_forceFrameRate=function(I){0>I||125ht?(I.sortIndex=rt,t(h,I),A(c)===null&&I===A(h)&&(y?(x(k),k=-1):y=!0,ut(F,rt-ht))):(I.sortIndex=H,t(c,I),b||v||(b=!0,T||(T=!0,nt()))),I},e.unstable_shouldYield=J,e.unstable_wrapCallback=function(I){var W=m;return function(){var rt=m;m=W;try{return I.apply(this,arguments)}finally{m=rt}}}}(Fg)),Fg}var fb;function h1(){return fb||(fb=1,xg.exports=f1()),xg.exports}var Eg={exports:{}},Ve={};/** * @license React * react-dom.production.js * @@ -38,15 +38,15 @@ * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
- */var gb;function g1(){if(gb)return go;gb=1;var e=h1(),t=Hf(),A=W_();function n(r){var s="https://react.dev/errors/"+r;if(1H||(r.current=ht[H],ht[H]=null,H--)}function q(r,s){H++,ht[H]=r.current,r.current=s}var Y=X(null),ot=X(null),st=X(null),be=X(null);function qt(r,s){switch(q(st,s),q(ot,r),q(Y,null),s.nodeType){case 9:case 11:r=(r=s.documentElement)&&(r=r.namespaceURI)?Mw(r):0;break;default:if(r=s.tagName,s=s.namespaceURI)s=Mw(s),r=Lw(s,r);else switch(r){case"svg":r=1;break;case"math":r=2;break;default:r=0}}tt(Y),q(Y,r)}function FA(){tt(Y),tt(ot),tt(st)}function da(r){r.memoizedState!==null&&q(be,r);var s=Y.current,l=Lw(s,r.type);s!==l&&(q(ot,r),q(Y,l))}function $r(r){ot.current===r&&(tt(Y),tt(ot)),be.current===r&&(tt(be),oo._currentValue=rt)}var ui=Object.prototype.hasOwnProperty,ci=e.unstable_scheduleCallback,ga=e.unstable_cancelCallback,dm=e.unstable_shouldYield,zF=e.unstable_requestPaint,an=e.unstable_now,VF=e.unstable_getCurrentPriorityLevel,gm=e.unstable_ImmediatePriority,pm=e.unstable_UserBlockingPriority,Ul=e.unstable_NormalPriority,PF=e.unstable_LowPriority,Bm=e.unstable_IdlePriority,jF=e.log,GF=e.unstable_setDisableYieldValue,pa=null,fA=null;function fi(r){if(typeof jF=="function"&&GF(r),fA&&typeof fA.setStrictMode=="function")try{fA.setStrictMode(pa,r)}catch{}}var hA=Math.clz32?Math.clz32:YF,XF=Math.log,ZF=Math.LN2;function YF(r){return r>>>=0,r===0?32:31-(XF(r)/ZF|0)|0}var Fl=256,El=4194304;function nr(r){var s=r&42;if(s!==0)return s;switch(r&-r){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return r&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return r&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 
1073741824:return 0;default:return r}}function Sl(r,s,l){var f=r.pendingLanes;if(f===0)return 0;var d=0,p=r.suspendedLanes,w=r.pingedLanes;r=r.warmLanes;var Q=f&134217727;return Q!==0?(f=Q&~p,f!==0?d=nr(f):(w&=Q,w!==0?d=nr(w):l||(l=Q&~r,l!==0&&(d=nr(l))))):(Q=f&~p,Q!==0?d=nr(Q):w!==0?d=nr(w):l||(l=f&~r,l!==0&&(d=nr(l)))),d===0?0:s!==0&&s!==d&&(s&p)===0&&(p=d&-d,l=s&-s,p>=l||p===32&&(l&4194048)!==0)?s:d}function Ba(r,s){return(r.pendingLanes&~(r.suspendedLanes&~r.pingedLanes)&s)===0}function WF(r,s){switch(r){case 1:case 2:case 4:case 8:case 64:return s+250;case 16:case 32:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return s+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function mm(){var r=Fl;return Fl<<=1,(Fl&4194048)===0&&(Fl=256),r}function vm(){var r=El;return El<<=1,(El&62914560)===0&&(El=4194304),r}function hh(r){for(var s=[],l=0;31>l;l++)s.push(r);return s}function ma(r,s){r.pendingLanes|=s,s!==268435456&&(r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0)}function $F(r,s,l,f,d,p){var w=r.pendingLanes;r.pendingLanes=l,r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0,r.expiredLanes&=l,r.entangledLanes&=l,r.errorRecoveryDisabledLanes&=l,r.shellSuspendCounter=0;var Q=r.entanglements,S=r.expirationTimes,K=r.hiddenUpdates;for(l=w&~l;0H||(r.current=ht[H],ht[H]=null,H--)}function q(r,s){H++,ht[H]=r.current,r.current=s}var Y=X(null),ot=X(null),st=X(null),be=X(null);function qt(r,s){switch(q(st,s),q(ot,r),q(Y,null),s.nodeType){case 9:case 11:r=(r=s.documentElement)&&(r=r.namespaceURI)?Mw(r):0;break;default:if(r=s.tagName,s=s.namespaceURI)s=Mw(s),r=Lw(s,r);else switch(r){case"svg":r=1;break;case"math":r=2;break;default:r=0}}tt(Y),q(Y,r)}function FA(){tt(Y),tt(ot),tt(st)}function da(r){r.memoizedState!==null&&q(be,r);var 
s=Y.current,l=Lw(s,r.type);s!==l&&(q(ot,r),q(Y,l))}function $r(r){ot.current===r&&(tt(Y),tt(ot)),be.current===r&&(tt(be),oo._currentValue=rt)}var ui=Object.prototype.hasOwnProperty,ci=e.unstable_scheduleCallback,ga=e.unstable_cancelCallback,dm=e.unstable_shouldYield,zF=e.unstable_requestPaint,an=e.unstable_now,VF=e.unstable_getCurrentPriorityLevel,gm=e.unstable_ImmediatePriority,pm=e.unstable_UserBlockingPriority,xl=e.unstable_NormalPriority,PF=e.unstable_LowPriority,Bm=e.unstable_IdlePriority,jF=e.log,GF=e.unstable_setDisableYieldValue,pa=null,fA=null;function fi(r){if(typeof jF=="function"&&GF(r),fA&&typeof fA.setStrictMode=="function")try{fA.setStrictMode(pa,r)}catch{}}var hA=Math.clz32?Math.clz32:YF,XF=Math.log,ZF=Math.LN2;function YF(r){return r>>>=0,r===0?32:31-(XF(r)/ZF|0)|0}var Fl=256,El=4194304;function nr(r){var s=r&42;if(s!==0)return s;switch(r&-r){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:return 64;case 128:return 128;case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return r&4194048;case 4194304:case 8388608:case 16777216:case 33554432:return r&62914560;case 67108864:return 67108864;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 0;default:return r}}function Sl(r,s,l){var f=r.pendingLanes;if(f===0)return 0;var d=0,p=r.suspendedLanes,w=r.pingedLanes;r=r.warmLanes;var Q=f&134217727;return Q!==0?(f=Q&~p,f!==0?d=nr(f):(w&=Q,w!==0?d=nr(w):l||(l=Q&~r,l!==0&&(d=nr(l))))):(Q=f&~p,Q!==0?d=nr(Q):w!==0?d=nr(w):l||(l=f&~r,l!==0&&(d=nr(l)))),d===0?0:s!==0&&s!==d&&(s&p)===0&&(p=d&-d,l=s&-s,p>=l||p===32&&(l&4194048)!==0)?s:d}function Ba(r,s){return(r.pendingLanes&~(r.suspendedLanes&~r.pingedLanes)&s)===0}function WF(r,s){switch(r){case 1:case 2:case 4:case 8:case 64:return s+250;case 16:case 32:case 128:case 256:case 512:case 
1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return s+5e3;case 4194304:case 8388608:case 16777216:case 33554432:return-1;case 67108864:case 134217728:case 268435456:case 536870912:case 1073741824:return-1;default:return-1}}function mm(){var r=Fl;return Fl<<=1,(Fl&4194048)===0&&(Fl=256),r}function vm(){var r=El;return El<<=1,(El&62914560)===0&&(El=4194304),r}function hh(r){for(var s=[],l=0;31>l;l++)s.push(r);return s}function ma(r,s){r.pendingLanes|=s,s!==268435456&&(r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0)}function $F(r,s,l,f,d,p){var w=r.pendingLanes;r.pendingLanes=l,r.suspendedLanes=0,r.pingedLanes=0,r.warmLanes=0,r.expiredLanes&=l,r.entangledLanes&=l,r.errorRecoveryDisabledLanes&=l,r.shellSuspendCounter=0;var Q=r.entanglements,S=r.expirationTimes,K=r.hiddenUpdates;for(l=w&~l;0)":-1d||S[f]!==K[d]){var j=` `+S[f].replace(" at new "," at ");return r.displayName&&j.includes("")&&(j=j.replace("",r.displayName)),j}while(1<=f&&0<=d);break}}}finally{vh=!1,Error.prepareStackTrace=l}return(l=r?r.displayName||r.name:"")?ns(l):""}function nE(r){switch(r.tag){case 26:case 27:case 5:return ns(r.type);case 16:return ns("Lazy");case 13:return ns("Suspense");case 19:return ns("SuspenseList");case 0:case 15:return wh(r.type,!1);case 11:return wh(r.type.render,!1);case 1:return wh(r.type,!0);case 31:return ns("Activity");default:return""}}function Em(r){try{var s="";do s+=nE(r),r=r.return;while(r);return s}catch(l){return` Error generating stack: `+l.message+` -`+l.stack}}function EA(r){switch(typeof r){case"bigint":case"boolean":case"number":case"string":case"undefined":return r;case"object":return r;default:return""}}function Sm(r){var s=r.type;return(r=r.nodeName)&&r.toLowerCase()==="input"&&(s==="checkbox"||s==="radio")}function iE(r){var s=Sm(r)?"checked":"value",l=Object.getOwnPropertyDescriptor(r.constructor.prototype,s),f=""+r[s];if(!r.hasOwnProperty(s)&&typeof l<"u"&&typeof 
l.get=="function"&&typeof l.set=="function"){var d=l.get,p=l.set;return Object.defineProperty(r,s,{configurable:!0,get:function(){return d.call(this)},set:function(w){f=""+w,p.call(this,w)}}),Object.defineProperty(r,s,{enumerable:l.enumerable}),{getValue:function(){return f},setValue:function(w){f=""+w},stopTracking:function(){r._valueTracker=null,delete r[s]}}}}function Tl(r){r._valueTracker||(r._valueTracker=iE(r))}function Hm(r){if(!r)return!1;var s=r._valueTracker;if(!s)return!0;var l=s.getValue(),f="";return r&&(f=Sm(r)?r.checked?"true":"false":r.value),r=f,r!==l?(s.setValue(r),!0):!1}function Dl(r){if(r=r||(typeof document<"u"?document:void 0),typeof r>"u")return null;try{return r.activeElement||r.body}catch{return r.body}}var rE=/[\n"\\]/g;function SA(r){return r.replace(rE,function(s){return"\\"+s.charCodeAt(0).toString(16)+" "})}function bh(r,s,l,f,d,p,w,Q){r.name="",w!=null&&typeof w!="function"&&typeof w!="symbol"&&typeof w!="boolean"?r.type=w:r.removeAttribute("type"),s!=null?w==="number"?(s===0&&r.value===""||r.value!=s)&&(r.value=""+EA(s)):r.value!==""+EA(s)&&(r.value=""+EA(s)):w!=="submit"&&w!=="reset"||r.removeAttribute("value"),s!=null?yh(r,w,EA(s)):l!=null?yh(r,w,EA(l)):f!=null&&r.removeAttribute("value"),d==null&&p!=null&&(r.defaultChecked=!!p),d!=null&&(r.checked=d&&typeof d!="function"&&typeof d!="symbol"),Q!=null&&typeof Q!="function"&&typeof Q!="symbol"&&typeof Q!="boolean"?r.name=""+EA(Q):r.removeAttribute("name")}function Om(r,s,l,f,d,p,w,Q){if(p!=null&&typeof p!="function"&&typeof p!="symbol"&&typeof p!="boolean"&&(r.type=p),s!=null||l!=null){if(!(p!=="submit"&&p!=="reset"||s!=null))return;l=l!=null?""+EA(l):"",s=s!=null?""+EA(s):l,Q||s===r.value||(r.value=s),r.defaultValue=s}f=f??d,f=typeof f!="function"&&typeof f!="symbol"&&!!f,r.checked=Q?r.checked:!!f,r.defaultChecked=!!f,w!=null&&typeof w!="function"&&typeof w!="symbol"&&typeof w!="boolean"&&(r.name=w)}function 
yh(r,s,l){s==="number"&&Dl(r.ownerDocument)===r||r.defaultValue===""+l||(r.defaultValue=""+l)}function is(r,s,l,f){if(r=r.options,s){s={};for(var d=0;d"u"||typeof window.document>"u"||typeof window.document.createElement>"u"),Uh=!1;if(Fn)try{var ya={};Object.defineProperty(ya,"passive",{get:function(){Uh=!0}}),window.addEventListener("test",ya,ya),window.removeEventListener("test",ya,ya)}catch{Uh=!1}var di=null,Fh=null,Ll=null;function Nm(){if(Ll)return Ll;var r,s=Fh,l=s.length,f,d="value"in di?di.value:di.textContent,p=d.length;for(r=0;r=Qa),jm=" ",Gm=!1;function Xm(r,s){switch(r){case"keyup":return TE.indexOf(s.keyCode)!==-1;case"keydown":return s.keyCode!==229;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function Zm(r){return r=r.detail,typeof r=="object"&&"data"in r?r.data:null}var os=!1;function ME(r,s){switch(r){case"compositionend":return Zm(s);case"keypress":return s.which!==32?null:(Gm=!0,jm);case"textInput":return r=s.data,r===jm&&Gm?null:r;default:return null}}function LE(r,s){if(os)return r==="compositionend"||!Th&&Xm(r,s)?(r=Nm(),Ll=Fh=di=null,os=!1,r):null;switch(r){case"paste":return null;case"keypress":if(!(s.ctrlKey||s.altKey||s.metaKey)||s.ctrlKey&&s.altKey){if(s.char&&1=s)return{node:l,offset:s-r};r=f}t:{for(;l;){if(l.nextSibling){l=l.nextSibling;break t}l=l.parentNode}l=void 0}l=A0(l)}}function i0(r,s){return r&&s?r===s?!0:r&&r.nodeType===3?!1:s&&s.nodeType===3?i0(r,s.parentNode):"contains"in r?r.contains(s):r.compareDocumentPosition?!!(r.compareDocumentPosition(s)&16):!1:!1}function r0(r){r=r!=null&&r.ownerDocument!=null&&r.ownerDocument.defaultView!=null?r.ownerDocument.defaultView:window;for(var s=Dl(r.document);s instanceof r.HTMLIFrameElement;){try{var l=typeof s.contentWindow.location.href=="string"}catch{l=!1}if(l)r=s.contentWindow;else break;s=Dl(r.document)}return s}function Lh(r){var s=r&&r.nodeName&&r.nodeName.toLowerCase();return 
s&&(s==="input"&&(r.type==="text"||r.type==="search"||r.type==="tel"||r.type==="url"||r.type==="password")||s==="textarea"||r.contentEditable==="true")}var PE=Fn&&"documentMode"in document&&11>=document.documentMode,ls=null,Ih=null,Ea=null,Rh=!1;function s0(r,s,l){var f=l.window===l?l.document:l.nodeType===9?l:l.ownerDocument;Rh||ls==null||ls!==Dl(f)||(f=ls,"selectionStart"in f&&Lh(f)?f={start:f.selectionStart,end:f.selectionEnd}:(f=(f.ownerDocument&&f.ownerDocument.defaultView||window).getSelection(),f={anchorNode:f.anchorNode,anchorOffset:f.anchorOffset,focusNode:f.focusNode,focusOffset:f.focusOffset}),Ea&&Fa(Ea,f)||(Ea=f,f=xu(Ih,"onSelect"),0>=w,d-=w,Sn=1<<32-hA(s)+d|l<p?p:8;var w=I.T,Q={};I.T=Q,Cd(r,!1,s,l);try{var S=d(),K=I.S;if(K!==null&&K(Q,S),S!==null&&typeof S=="object"&&typeof S.then=="function"){var j=qE(S,f);Pa(r,s,j,vA(r))}else Pa(r,s,f,vA(r))}catch($){Pa(r,s,{then:function(){},status:"rejected",reason:$},vA())}finally{W.p=p,I.T=w}}function iS(){}function bd(r,s,l,f){if(r.tag!==5)throw Error(n(476));var d=av(r).queue;sv(r,d,s,rt,l===null?iS:function(){return ov(r),l(f)})}function av(r){var s=r.memoizedState;if(s!==null)return s;s={memoizedState:rt,baseState:rt,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Dn,lastRenderedState:rt},next:null};var l={};return s.next={memoizedState:l,baseState:l,baseQueue:null,queue:{pending:null,lanes:0,dispatch:null,lastRenderedReducer:Dn,lastRenderedState:l},next:null},r.memoizedState=s,r=r.alternate,r!==null&&(r.memoizedState=s),s}function ov(r){var s=av(r).next.queue;Pa(r,s,{},vA())}function yd(){return ze(oo)}function lv(){return Ce().memoizedState}function uv(){return Ce().memoizedState}function rS(r){for(var s=r.return;s!==null;){switch(s.tag){case 24:case 3:var l=vA();r=Bi(l);var f=mi(s,r,l);f!==null&&(wA(f,s,l),Ra(f,s,l)),s={cache:Jh()},r.payload=s;return}s=s.return}}function sS(r,s,l){var 
f=vA();l={lane:f,revertLane:0,action:l,hasEagerState:!1,eagerState:null,next:null},su(r)?fv(s,l):(l=zh(r,s,l,f),l!==null&&(wA(l,r,f),hv(l,s,f)))}function cv(r,s,l){var f=vA();Pa(r,s,l,f)}function Pa(r,s,l,f){var d={lane:f,revertLane:0,action:l,hasEagerState:!1,eagerState:null,next:null};if(su(r))fv(s,d);else{var p=r.alternate;if(r.lanes===0&&(p===null||p.lanes===0)&&(p=s.lastRenderedReducer,p!==null))try{var w=s.lastRenderedState,Q=p(w,l);if(d.hasEagerState=!0,d.eagerState=Q,dA(Q,w))return Vl(r,s,d,0),Yt===null&&zl(),!1}catch{}finally{}if(l=zh(r,s,d,f),l!==null)return wA(l,r,f),hv(l,s,f),!0}return!1}function Cd(r,s,l,f){if(f={lane:2,revertLane:eg(),action:f,hasEagerState:!1,eagerState:null,next:null},su(r)){if(s)throw Error(n(479))}else s=zh(r,l,f,2),s!==null&&wA(s,r,2)}function su(r){var s=r.alternate;return r===_t||s!==null&&s===_t}function fv(r,s){vs=tu=!0;var l=r.pending;l===null?s.next=s:(s.next=l.next,l.next=s),r.pending=s}function hv(r,s,l){if((l&4194048)!==0){var f=s.lanes;f&=r.pendingLanes,l|=f,s.lanes=l,bm(r,l)}}var au={readContext:ze,use:Au,useCallback:pe,useContext:pe,useEffect:pe,useImperativeHandle:pe,useLayoutEffect:pe,useInsertionEffect:pe,useMemo:pe,useReducer:pe,useRef:pe,useState:pe,useDebugValue:pe,useDeferredValue:pe,useTransition:pe,useSyncExternalStore:pe,useId:pe,useHostTransitionStatus:pe,useFormState:pe,useActionState:pe,useOptimistic:pe,useMemoCache:pe,useCacheRefresh:pe},dv={readContext:ze,use:Au,useCallback:function(r,s){return AA().memoizedState=[r,s===void 0?null:s],r},useContext:ze,useEffect:$0,useImperativeHandle:function(r,s,l){l=l!=null?l.concat([r]):null,ru(4194308,4,ev.bind(null,s,r),l)},useLayoutEffect:function(r,s){return ru(4194308,4,r,s)},useInsertionEffect:function(r,s){ru(4,2,r,s)},useMemo:function(r,s){var l=AA();s=s===void 0?null:s;var f=r();if(pr){fi(!0);try{r()}finally{fi(!1)}}return l.memoizedState=[f,s],f},useReducer:function(r,s,l){var f=AA();if(l!==void 0){var d=l(s);if(pr){fi(!0);try{l(s)}finally{fi(!1)}}}else 
d=s;return f.memoizedState=f.baseState=d,r={pending:null,lanes:0,dispatch:null,lastRenderedReducer:r,lastRenderedState:d},f.queue=r,r=r.dispatch=sS.bind(null,_t,r),[f.memoizedState,r]},useRef:function(r){var s=AA();return r={current:r},s.memoizedState=r},useState:function(r){r=Bd(r);var s=r.queue,l=cv.bind(null,_t,s);return s.dispatch=l,[r.memoizedState,l]},useDebugValue:vd,useDeferredValue:function(r,s){var l=AA();return wd(l,r,s)},useTransition:function(){var r=Bd(!1);return r=sv.bind(null,_t,r.queue,!0,!1),AA().memoizedState=r,[!1,r]},useSyncExternalStore:function(r,s,l){var f=_t,d=AA();if(Lt){if(l===void 0)throw Error(n(407));l=l()}else{if(l=s(),Yt===null)throw Error(n(349));(Ht&124)!==0||M0(f,s,l)}d.memoizedState=l;var p={value:l,getSnapshot:s};return d.queue=p,$0(I0.bind(null,f,p,r),[r]),f.flags|=2048,bs(9,iu(),L0.bind(null,f,p,l,s),null),l},useId:function(){var r=AA(),s=Yt.identifierPrefix;if(Lt){var l=Hn,f=Sn;l=(f&~(1<<32-hA(f)-1)).toString(32)+l,s="«"+s+"R"+l,l=eu++,0mt?(He=pt,pt=null):He=pt.sibling;var Tt=z(L,pt,R[mt],Z);if(Tt===null){pt===null&&(pt=He);break}r&&pt&&Tt.alternate===null&&s(L,pt),D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt,pt=He}if(mt===R.length)return l(L,pt),Lt&&ur(L,mt),lt;if(pt===null){for(;mtmt?(He=pt,pt=null):He=pt.sibling;var Mi=z(L,pt,Tt.value,Z);if(Mi===null){pt===null&&(pt=He);break}r&&pt&&Mi.alternate===null&&s(L,pt),D=p(Mi,D,mt),Qt===null?lt=Mi:Qt.sibling=Mi,Qt=Mi,pt=He}if(Tt.done)return l(L,pt),Lt&&ur(L,mt),lt;if(pt===null){for(;!Tt.done;mt++,Tt=R.next())Tt=$(L,Tt.value,Z),Tt!==null&&(D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt);return Lt&&ur(L,mt),lt}for(pt=f(pt);!Tt.done;mt++,Tt=R.next())Tt=V(pt,L,mt,Tt.value,Z),Tt!==null&&(r&&Tt.alternate!==null&&pt.delete(Tt.key===null?mt:Tt.key),D=p(Tt,D,mt),Qt===null?lt=Tt:Qt.sibling=Tt,Qt=Tt);return r&&pt.forEach(function(o1){return s(L,o1)}),Lt&&ur(L,mt),lt}function Vt(L,D,R,Z){if(typeof R=="object"&&R!==null&&R.type===b&&R.key===null&&(R=R.props.children),typeof 
R=="object"&&R!==null){switch(R.$$typeof){case m:t:{for(var lt=R.key;D!==null;){if(D.key===lt){if(lt=R.type,lt===b){if(D.tag===7){l(L,D.sibling),Z=d(D,R.props.children),Z.return=L,L=Z;break t}}else if(D.elementType===lt||typeof lt=="object"&<!==null&<.$$typeof===P&&pv(lt)===D.type){l(L,D.sibling),Z=d(D,R.props),Ga(Z,R),Z.return=L,L=Z;break t}l(L,D);break}else s(L,D);D=D.sibling}R.type===b?(Z=or(R.props.children,L.mode,Z,R.key),Z.return=L,L=Z):(Z=jl(R.type,R.key,R.props,null,L.mode,Z),Ga(Z,R),Z.return=L,L=Z)}return w(L);case v:t:{for(lt=R.key;D!==null;){if(D.key===lt)if(D.tag===4&&D.stateNode.containerInfo===R.containerInfo&&D.stateNode.implementation===R.implementation){l(L,D.sibling),Z=d(D,R.children||[]),Z.return=L,L=Z;break t}else{l(L,D);break}else s(L,D);D=D.sibling}Z=jh(R,L.mode,Z),Z.return=L,L=Z}return w(L);case P:return lt=R._init,R=lt(R._payload),Vt(L,D,R,Z)}if(ut(R))return vt(L,D,R,Z);if(nt(R)){if(lt=nt(R),typeof lt!="function")throw Error(n(150));return R=lt.call(R),Bt(L,D,R,Z)}if(typeof R.then=="function")return Vt(L,D,ou(R),Z);if(R.$$typeof===E)return Vt(L,D,Yl(L,R),Z);lu(L,R)}return typeof R=="string"&&R!==""||typeof R=="number"||typeof R=="bigint"?(R=""+R,D!==null&&D.tag===6?(l(L,D.sibling),Z=d(D,R),Z.return=L,L=Z):(l(L,D),Z=Ph(R,L.mode,Z),Z.return=L,L=Z),w(L)):l(L,D)}return function(L,D,R,Z){try{ja=0;var lt=Vt(L,D,R,Z);return ys=null,lt}catch(pt){if(pt===La||pt===$l)throw pt;var Qt=gA(29,pt,null,L.mode);return Qt.lanes=Z,Qt.return=L,Qt}finally{}}}var Cs=Bv(!0),mv=Bv(!1),MA=X(null),ln=null;function wi(r){var s=r.alternate;q(Qe,Qe.current&1),q(MA,r),ln===null&&(s===null||ms.current!==null||s.memoizedState!==null)&&(ln=r)}function vv(r){if(r.tag===22){if(q(Qe,Qe.current),q(MA,r),ln===null){var s=r.alternate;s!==null&&s.memoizedState!==null&&(ln=r)}}else bi()}function bi(){q(Qe,Qe.current),q(MA,MA.current)}function Mn(r){tt(MA),ln===r&&(ln=null),tt(Qe)}var Qe=X(0);function uu(r){for(var s=r;s!==null;){if(s.tag===13){var 
l=s.memoizedState;if(l!==null&&(l=l.dehydrated,l===null||l.data==="$?"||hg(l)))return s}else if(s.tag===19&&s.memoizedProps.revealOrder!==void 0){if((s.flags&128)!==0)return s}else if(s.child!==null){s.child.return=s,s=s.child;continue}if(s===r)break;for(;s.sibling===null;){if(s.return===null||s.return===r)return null;s=s.return}s.sibling.return=s.return,s=s.sibling}return null}function _d(r,s,l,f){s=r.memoizedState,l=l(f,s),l=l==null?s:g({},s,l),r.memoizedState=l,r.lanes===0&&(r.updateQueue.baseState=l)}var Qd={enqueueSetState:function(r,s,l){r=r._reactInternals;var f=vA(),d=Bi(f);d.payload=s,l!=null&&(d.callback=l),s=mi(r,d,f),s!==null&&(wA(s,r,f),Ra(s,r,f))},enqueueReplaceState:function(r,s,l){r=r._reactInternals;var f=vA(),d=Bi(f);d.tag=1,d.payload=s,l!=null&&(d.callback=l),s=mi(r,d,f),s!==null&&(wA(s,r,f),Ra(s,r,f))},enqueueForceUpdate:function(r,s){r=r._reactInternals;var l=vA(),f=Bi(l);f.tag=2,s!=null&&(f.callback=s),s=mi(r,f,l),s!==null&&(wA(s,r,l),Ra(s,r,l))}};function wv(r,s,l,f,d,p,w){return r=r.stateNode,typeof r.shouldComponentUpdate=="function"?r.shouldComponentUpdate(f,p,w):s.prototype&&s.prototype.isPureReactComponent?!Fa(l,f)||!Fa(d,p):!0}function bv(r,s,l,f){r=s.state,typeof s.componentWillReceiveProps=="function"&&s.componentWillReceiveProps(l,f),typeof s.UNSAFE_componentWillReceiveProps=="function"&&s.UNSAFE_componentWillReceiveProps(l,f),s.state!==r&&Qd.enqueueReplaceState(s,s.state,null)}function Br(r,s){var l=s;if("ref"in s){l={};for(var f in s)f!=="ref"&&(l[f]=s[f])}if(r=r.defaultProps){l===s&&(l=g({},l));for(var d in r)l[d]===void 0&&(l[d]=r[d])}return l}var cu=typeof reportError=="function"?reportError:function(r){if(typeof window=="object"&&typeof window.ErrorEvent=="function"){var s=new window.ErrorEvent("error",{bubbles:!0,cancelable:!0,message:typeof r=="object"&&r!==null&&typeof r.message=="string"?String(r.message):String(r),error:r});if(!window.dispatchEvent(s))return}else if(typeof process=="object"&&typeof 
process.emit=="function"){process.emit("uncaughtException",r);return}console.error(r)};function yv(r){cu(r)}function Cv(r){console.error(r)}function _v(r){cu(r)}function fu(r,s){try{var l=r.onUncaughtError;l(s.value,{componentStack:s.stack})}catch(f){setTimeout(function(){throw f})}}function Qv(r,s,l){try{var f=r.onCaughtError;f(l.value,{componentStack:l.stack,errorBoundary:s.tag===1?s.stateNode:null})}catch(d){setTimeout(function(){throw d})}}function xd(r,s,l){return l=Bi(l),l.tag=3,l.payload={element:null},l.callback=function(){fu(r,s)},l}function xv(r){return r=Bi(r),r.tag=3,r}function Uv(r,s,l,f){var d=l.type.getDerivedStateFromError;if(typeof d=="function"){var p=f.value;r.payload=function(){return d(p)},r.callback=function(){Qv(s,l,f)}}var w=l.stateNode;w!==null&&typeof w.componentDidCatch=="function"&&(r.callback=function(){Qv(s,l,f),typeof d!="function"&&(Ui===null?Ui=new Set([this]):Ui.add(this));var Q=f.stack;this.componentDidCatch(f.value,{componentStack:Q!==null?Q:""})})}function oS(r,s,l,f,d){if(l.flags|=32768,f!==null&&typeof f=="object"&&typeof f.then=="function"){if(s=l.alternate,s!==null&&Ta(s,l,d,!0),l=MA.current,l!==null){switch(l.tag){case 13:return ln===null?Wd():l.alternate===null&&he===0&&(he=3),l.flags&=-257,l.flags|=65536,l.lanes=d,f===ed?l.flags|=16384:(s=l.updateQueue,s===null?l.updateQueue=new Set([f]):s.add(f),Jd(r,f,d)),!1;case 22:return l.flags|=65536,f===ed?l.flags|=16384:(s=l.updateQueue,s===null?(s={transitions:null,markerInstances:null,retryQueue:new Set([f])},l.updateQueue=s):(l=s.retryQueue,l===null?s.retryQueue=new Set([f]):l.add(f)),Jd(r,f,d)),!1}throw Error(n(435,l.tag))}return Jd(r,f,d),Wd(),!1}if(Lt)return s=MA.current,s!==null?((s.flags&65536)===0&&(s.flags|=256),s.flags|=65536,s.lanes=d,f!==Zh&&(r=Error(n(422),{cause:f}),Oa(HA(r,l)))):(f!==Zh&&(s=Error(n(423),{cause:f}),Oa(HA(s,l))),r=r.current.alternate,r.flags|=65536,d&=-d,r.lanes|=d,f=HA(f,l),d=xd(r.stateNode,f,d),id(r,d),he!==4&&(he=2)),!1;var 
p=Error(n(520),{cause:f});if(p=HA(p,l),qa===null?qa=[p]:qa.push(p),he!==4&&(he=2),s===null)return!0;f=HA(f,l),l=s;do{switch(l.tag){case 3:return l.flags|=65536,r=d&-d,l.lanes|=r,r=xd(l.stateNode,f,r),id(l,r),!1;case 1:if(s=l.type,p=l.stateNode,(l.flags&128)===0&&(typeof s.getDerivedStateFromError=="function"||p!==null&&typeof p.componentDidCatch=="function"&&(Ui===null||!Ui.has(p))))return l.flags|=65536,d&=-d,l.lanes|=d,d=xv(d),Uv(d,r,l,f),id(l,d),!1}l=l.return}while(l!==null);return!1}var Fv=Error(n(461)),Ee=!1;function Me(r,s,l,f){s.child=r===null?mv(s,null,l,f):Cs(s,r.child,l,f)}function Ev(r,s,l,f,d){l=l.render;var p=s.ref;if("ref"in f){var w={};for(var Q in f)Q!=="ref"&&(w[Q]=f[Q])}else w=f;return dr(s),f=ld(r,s,l,w,p,d),Q=ud(),r!==null&&!Ee?(cd(r,s,d),Ln(r,s,d)):(Lt&&Q&&Gh(s),s.flags|=1,Me(r,s,f,d),s.child)}function Sv(r,s,l,f,d){if(r===null){var p=l.type;return typeof p=="function"&&!Vh(p)&&p.defaultProps===void 0&&l.compare===null?(s.tag=15,s.type=p,Hv(r,s,p,f,d)):(r=jl(l.type,null,f,s,s.mode,d),r.ref=s.ref,r.return=s,s.child=r)}if(p=r.child,!Dd(r,d)){var w=p.memoizedProps;if(l=l.compare,l=l!==null?l:Fa,l(w,f)&&r.ref===s.ref)return Ln(r,s,d)}return s.flags|=1,r=En(p,f),r.ref=s.ref,r.return=s,s.child=r}function Hv(r,s,l,f,d){if(r!==null){var p=r.memoizedProps;if(Fa(p,f)&&r.ref===s.ref)if(Ee=!1,s.pendingProps=f=p,Dd(r,d))(r.flags&131072)!==0&&(Ee=!0);else return s.lanes=r.lanes,Ln(r,s,d)}return Ud(r,s,l,f,d)}function Ov(r,s,l){var f=s.pendingProps,d=f.children,p=r!==null?r.memoizedState:null;if(f.mode==="hidden"){if((s.flags&128)!==0){if(f=p!==null?p.baseLanes|l:l,r!==null){for(d=s.child=r.child,p=0;d!==null;)p=p|d.lanes|d.childLanes,d=d.sibling;s.childLanes=p&~f}else s.childLanes=0,s.child=null;return Tv(r,s,f,l)}if((l&536870912)!==0)s.memoizedState={baseLanes:0,cachePool:null},r!==null&&Wl(s,p!==null?p.cachePool:null),p!==null?H0(s,p):sd(),vv(s);else return s.lanes=s.childLanes=536870912,Tv(r,s,p!==null?p.baseLanes|l:l,l)}else 
p!==null?(Wl(s,p.cachePool),H0(s,p),bi(),s.memoizedState=null):(r!==null&&Wl(s,null),sd(),bi());return Me(r,s,d,l),s.child}function Tv(r,s,l,f){var d=td();return d=d===null?null:{parent:_e._currentValue,pool:d},s.memoizedState={baseLanes:l,cachePool:d},r!==null&&Wl(s,null),sd(),vv(s),r!==null&&Ta(r,s,f,!0),null}function hu(r,s){var l=s.ref;if(l===null)r!==null&&r.ref!==null&&(s.flags|=4194816);else{if(typeof l!="function"&&typeof l!="object")throw Error(n(284));(r===null||r.ref!==l)&&(s.flags|=4194816)}}function Ud(r,s,l,f,d){return dr(s),l=ld(r,s,l,f,void 0,d),f=ud(),r!==null&&!Ee?(cd(r,s,d),Ln(r,s,d)):(Lt&&f&&Gh(s),s.flags|=1,Me(r,s,l,d),s.child)}function Dv(r,s,l,f,d,p){return dr(s),s.updateQueue=null,l=T0(s,f,l,d),O0(r),f=ud(),r!==null&&!Ee?(cd(r,s,p),Ln(r,s,p)):(Lt&&f&&Gh(s),s.flags|=1,Me(r,s,l,p),s.child)}function Mv(r,s,l,f,d){if(dr(s),s.stateNode===null){var p=hs,w=l.contextType;typeof w=="object"&&w!==null&&(p=ze(w)),p=new l(f,p),s.memoizedState=p.state!==null&&p.state!==void 0?p.state:null,p.updater=Qd,s.stateNode=p,p._reactInternals=s,p=s.stateNode,p.props=f,p.state=s.memoizedState,p.refs={},Ad(s),w=l.contextType,p.context=typeof w=="object"&&w!==null?ze(w):hs,p.state=s.memoizedState,w=l.getDerivedStateFromProps,typeof w=="function"&&(_d(s,l,w,f),p.state=s.memoizedState),typeof l.getDerivedStateFromProps=="function"||typeof p.getSnapshotBeforeUpdate=="function"||typeof p.UNSAFE_componentWillMount!="function"&&typeof p.componentWillMount!="function"||(w=p.state,typeof p.componentWillMount=="function"&&p.componentWillMount(),typeof p.UNSAFE_componentWillMount=="function"&&p.UNSAFE_componentWillMount(),w!==p.state&&Qd.enqueueReplaceState(p,p.state,null),Ka(s,f,p,d),Na(),p.state=s.memoizedState),typeof p.componentDidMount=="function"&&(s.flags|=4194308),f=!0}else if(r===null){p=s.stateNode;var Q=s.memoizedProps,S=Br(l,Q);p.props=S;var K=p.context,j=l.contextType;w=hs,typeof j=="object"&&j!==null&&(w=ze(j));var $=l.getDerivedStateFromProps;j=typeof 
$=="function"||typeof p.getSnapshotBeforeUpdate=="function",Q=s.pendingProps!==Q,j||typeof p.UNSAFE_componentWillReceiveProps!="function"&&typeof p.componentWillReceiveProps!="function"||(Q||K!==w)&&bv(s,p,f,w),pi=!1;var z=s.memoizedState;p.state=z,Ka(s,f,p,d),Na(),K=s.memoizedState,Q||z!==K||pi?(typeof $=="function"&&(_d(s,l,$,f),K=s.memoizedState),(S=pi||wv(s,l,S,f,z,K,w))?(j||typeof p.UNSAFE_componentWillMount!="function"&&typeof p.componentWillMount!="function"||(typeof p.componentWillMount=="function"&&p.componentWillMount(),typeof p.UNSAFE_componentWillMount=="function"&&p.UNSAFE_componentWillMount()),typeof p.componentDidMount=="function"&&(s.flags|=4194308)):(typeof p.componentDidMount=="function"&&(s.flags|=4194308),s.memoizedProps=f,s.memoizedState=K),p.props=f,p.state=K,p.context=w,f=S):(typeof p.componentDidMount=="function"&&(s.flags|=4194308),f=!1)}else{p=s.stateNode,nd(r,s),w=s.memoizedProps,j=Br(l,w),p.props=j,$=s.pendingProps,z=p.context,K=l.contextType,S=hs,typeof K=="object"&&K!==null&&(S=ze(K)),Q=l.getDerivedStateFromProps,(K=typeof Q=="function"||typeof p.getSnapshotBeforeUpdate=="function")||typeof p.UNSAFE_componentWillReceiveProps!="function"&&typeof p.componentWillReceiveProps!="function"||(w!==$||z!==S)&&bv(s,p,f,S),pi=!1,z=s.memoizedState,p.state=z,Ka(s,f,p,d),Na();var V=s.memoizedState;w!==$||z!==V||pi||r!==null&&r.dependencies!==null&&Zl(r.dependencies)?(typeof Q=="function"&&(_d(s,l,Q,f),V=s.memoizedState),(j=pi||wv(s,l,j,f,z,V,S)||r!==null&&r.dependencies!==null&&Zl(r.dependencies))?(K||typeof p.UNSAFE_componentWillUpdate!="function"&&typeof p.componentWillUpdate!="function"||(typeof p.componentWillUpdate=="function"&&p.componentWillUpdate(f,V,S),typeof p.UNSAFE_componentWillUpdate=="function"&&p.UNSAFE_componentWillUpdate(f,V,S)),typeof p.componentDidUpdate=="function"&&(s.flags|=4),typeof p.getSnapshotBeforeUpdate=="function"&&(s.flags|=1024)):(typeof 
p.componentDidUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=4),typeof p.getSnapshotBeforeUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=1024),s.memoizedProps=f,s.memoizedState=V),p.props=f,p.state=V,p.context=S,f=j):(typeof p.componentDidUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=4),typeof p.getSnapshotBeforeUpdate!="function"||w===r.memoizedProps&&z===r.memoizedState||(s.flags|=1024),f=!1)}return p=f,hu(r,s),f=(s.flags&128)!==0,p||f?(p=s.stateNode,l=f&&typeof l.getDerivedStateFromError!="function"?null:p.render(),s.flags|=1,r!==null&&f?(s.child=Cs(s,r.child,null,d),s.child=Cs(s,null,l,d)):Me(r,s,l,d),s.memoizedState=p.state,r=s.child):r=Ln(r,s,d),r}function Lv(r,s,l,f){return Ha(),s.flags|=256,Me(r,s,l,f),s.child}var Fd={dehydrated:null,treeContext:null,retryLane:0,hydrationErrors:null};function Ed(r){return{baseLanes:r,cachePool:C0()}}function Sd(r,s,l){return r=r!==null?r.childLanes&~l:0,s&&(r|=LA),r}function Iv(r,s,l){var f=s.pendingProps,d=!1,p=(s.flags&128)!==0,w;if((w=p)||(w=r!==null&&r.memoizedState===null?!1:(Qe.current&2)!==0),w&&(d=!0,s.flags&=-129),w=(s.flags&32)!==0,s.flags&=-33,r===null){if(Lt){if(d?wi(s):bi(),Lt){var Q=fe,S;if(S=Q){t:{for(S=Q,Q=on;S.nodeType!==8;){if(!Q){Q=null;break t}if(S=ZA(S.nextSibling),S===null){Q=null;break t}}Q=S}Q!==null?(s.memoizedState={dehydrated:Q,treeContext:lr!==null?{id:Sn,overflow:Hn}:null,retryLane:536870912,hydrationErrors:null},S=gA(18,null,null,0),S.stateNode=Q,S.return=s,s.child=S,We=s,fe=null,S=!0):S=!1}S||fr(s)}if(Q=s.memoizedState,Q!==null&&(Q=Q.dehydrated,Q!==null))return hg(Q)?s.lanes=32:s.lanes=536870912,null;Mn(s)}return 
Q=f.children,f=f.fallback,d?(bi(),d=s.mode,Q=du({mode:"hidden",children:Q},d),f=or(f,d,l,null),Q.return=s,f.return=s,Q.sibling=f,s.child=Q,d=s.child,d.memoizedState=Ed(l),d.childLanes=Sd(r,w,l),s.memoizedState=Fd,f):(wi(s),Hd(s,Q))}if(S=r.memoizedState,S!==null&&(Q=S.dehydrated,Q!==null)){if(p)s.flags&256?(wi(s),s.flags&=-257,s=Od(r,s,l)):s.memoizedState!==null?(bi(),s.child=r.child,s.flags|=128,s=null):(bi(),d=f.fallback,Q=s.mode,f=du({mode:"visible",children:f.children},Q),d=or(d,Q,l,null),d.flags|=2,f.return=s,d.return=s,f.sibling=d,s.child=f,Cs(s,r.child,null,l),f=s.child,f.memoizedState=Ed(l),f.childLanes=Sd(r,w,l),s.memoizedState=Fd,s=d);else if(wi(s),hg(Q)){if(w=Q.nextSibling&&Q.nextSibling.dataset,w)var K=w.dgst;w=K,f=Error(n(419)),f.stack="",f.digest=w,Oa({value:f,source:null,stack:null}),s=Od(r,s,l)}else if(Ee||Ta(r,s,l,!1),w=(l&r.childLanes)!==0,Ee||w){if(w=Yt,w!==null&&(f=l&-l,f=(f&42)!==0?1:dh(f),f=(f&(w.suspendedLanes|l))!==0?0:f,f!==0&&f!==S.retryLane))throw S.retryLane=f,fs(r,f),wA(w,r,f),Fv;Q.data==="$?"||Wd(),s=Od(r,s,l)}else Q.data==="$?"?(s.flags|=192,s.child=r.child,s=null):(r=S.treeContext,fe=ZA(Q.nextSibling),We=s,Lt=!0,cr=null,on=!1,r!==null&&(TA[DA++]=Sn,TA[DA++]=Hn,TA[DA++]=lr,Sn=r.id,Hn=r.overflow,lr=s),s=Hd(s,f.children),s.flags|=4096);return s}return d?(bi(),d=f.fallback,Q=s.mode,S=r.child,K=S.sibling,f=En(S,{mode:"hidden",children:f.children}),f.subtreeFlags=S.subtreeFlags&65011712,K!==null?d=En(K,d):(d=or(d,Q,l,null),d.flags|=2),d.return=s,f.return=s,f.sibling=d,s.child=f,f=d,d=s.child,Q=r.child.memoizedState,Q===null?Q=Ed(l):(S=Q.cachePool,S!==null?(K=_e._currentValue,S=S.parent!==K?{parent:K,pool:K}:S):S=C0(),Q={baseLanes:Q.baseLanes|l,cachePool:S}),d.memoizedState=Q,d.childLanes=Sd(r,w,l),s.memoizedState=Fd,f):(wi(s),l=r.child,r=l.sibling,l=En(l,{mode:"visible",children:f.children}),l.return=s,l.sibling=null,r!==null&&(w=s.deletions,w===null?(s.deletions=[r],s.flags|=16):w.push(r)),s.child=l,s.memoizedState=null,l)}function 
Hd(r,s){return s=du({mode:"visible",children:s},r.mode),s.return=r,r.child=s}function du(r,s){return r=gA(22,r,null,s),r.lanes=0,r.stateNode={_visibility:1,_pendingMarkers:null,_retryCache:null,_transitions:null},r}function Od(r,s,l){return Cs(s,r.child,null,l),r=Hd(s,s.pendingProps.children),r.flags|=2,s.memoizedState=null,r}function Rv(r,s,l){r.lanes|=s;var f=r.alternate;f!==null&&(f.lanes|=s),Wh(r.return,s,l)}function Td(r,s,l,f,d){var p=r.memoizedState;p===null?r.memoizedState={isBackwards:s,rendering:null,renderingStartTime:0,last:f,tail:l,tailMode:d}:(p.isBackwards=s,p.rendering=null,p.renderingStartTime=0,p.last=f,p.tail=l,p.tailMode=d)}function Nv(r,s,l){var f=s.pendingProps,d=f.revealOrder,p=f.tail;if(Me(r,s,f.children,l),f=Qe.current,(f&2)!==0)f=f&1|2,s.flags|=128;else{if(r!==null&&(r.flags&128)!==0)t:for(r=s.child;r!==null;){if(r.tag===13)r.memoizedState!==null&&Rv(r,l,s);else if(r.tag===19)Rv(r,l,s);else if(r.child!==null){r.child.return=r,r=r.child;continue}if(r===s)break t;for(;r.sibling===null;){if(r.return===null||r.return===s)break t;r=r.return}r.sibling.return=r.return,r=r.sibling}f&=1}switch(q(Qe,f),d){case"forwards":for(l=s.child,d=null;l!==null;)r=l.alternate,r!==null&&uu(r)===null&&(d=l),l=l.sibling;l=d,l===null?(d=s.child,s.child=null):(d=l.sibling,l.sibling=null),Td(s,!1,d,l,p);break;case"backwards":for(l=null,d=s.child,s.child=null;d!==null;){if(r=d.alternate,r!==null&&uu(r)===null){s.child=d;break}r=d.sibling,d.sibling=l,l=d,d=r}Td(s,!0,l,null,p);break;case"together":Td(s,!1,null,null,void 0);break;default:s.memoizedState=null}return s.child}function Ln(r,s,l){if(r!==null&&(s.dependencies=r.dependencies),xi|=s.lanes,(l&s.childLanes)===0)if(r!==null){if(Ta(r,s,l,!1),(l&s.childLanes)===0)return null}else return null;if(r!==null&&s.child!==r.child)throw Error(n(153));if(s.child!==null){for(r=s.child,l=En(r,r.pendingProps),s.child=l,l.return=s;r.sibling!==null;)r=r.sibling,l=l.sibling=En(r,r.pendingProps),l.return=s;l.sibling=null}return 
s.child}function Dd(r,s){return(r.lanes&s)!==0?!0:(r=r.dependencies,!!(r!==null&&Zl(r)))}function lS(r,s,l){switch(s.tag){case 3:qt(s,s.stateNode.containerInfo),gi(s,_e,r.memoizedState.cache),Ha();break;case 27:case 5:da(s);break;case 4:qt(s,s.stateNode.containerInfo);break;case 10:gi(s,s.type,s.memoizedProps.value);break;case 13:var f=s.memoizedState;if(f!==null)return f.dehydrated!==null?(wi(s),s.flags|=128,null):(l&s.child.childLanes)!==0?Iv(r,s,l):(wi(s),r=Ln(r,s,l),r!==null?r.sibling:null);wi(s);break;case 19:var d=(r.flags&128)!==0;if(f=(l&s.childLanes)!==0,f||(Ta(r,s,l,!1),f=(l&s.childLanes)!==0),d){if(f)return Nv(r,s,l);s.flags|=128}if(d=s.memoizedState,d!==null&&(d.rendering=null,d.tail=null,d.lastEffect=null),q(Qe,Qe.current),f)break;return null;case 22:case 23:return s.lanes=0,Ov(r,s,l);case 24:gi(s,_e,r.memoizedState.cache)}return Ln(r,s,l)}function Kv(r,s,l){if(r!==null)if(r.memoizedProps!==s.pendingProps)Ee=!0;else{if(!Dd(r,l)&&(s.flags&128)===0)return Ee=!1,lS(r,s,l);Ee=(r.flags&131072)!==0}else Ee=!1,Lt&&(s.flags&1048576)!==0&&p0(s,Xl,s.index);switch(s.lanes=0,s.tag){case 16:t:{r=s.pendingProps;var f=s.elementType,d=f._init;if(f=d(f._payload),s.type=f,typeof f=="function")Vh(f)?(r=Br(f,r),s.tag=1,s=Mv(null,s,f,r,l)):(s.tag=0,s=Ud(null,s,f,r,l));else{if(f!=null){if(d=f.$$typeof,d===O){s.tag=11,s=Ev(null,s,f,r,l);break t}else if(d===k){s.tag=14,s=Sv(null,s,f,r,l);break t}}throw s=bt(f)||f,Error(n(306,s,""))}}return s;case 0:return Ud(r,s,s.type,s.pendingProps,l);case 1:return f=s.type,d=Br(f,s.pendingProps),Mv(r,s,f,d,l);case 3:t:{if(qt(s,s.stateNode.containerInfo),r===null)throw Error(n(387));f=s.pendingProps;var p=s.memoizedState;d=p.element,nd(r,s),Ka(s,f,null,l);var w=s.memoizedState;if(f=w.cache,gi(s,_e,f),f!==p.cache&&$h(s,[_e],l,!0),Na(),f=w.element,p.isDehydrated)if(p={element:f,isDehydrated:!1,cache:w.cache},s.updateQueue.baseState=p,s.memoizedState=p,s.flags&256){s=Lv(r,s,f,l);break t}else 
if(f!==d){d=HA(Error(n(424)),s),Oa(d),s=Lv(r,s,f,l);break t}else{switch(r=s.stateNode.containerInfo,r.nodeType){case 9:r=r.body;break;default:r=r.nodeName==="HTML"?r.ownerDocument.body:r}for(fe=ZA(r.firstChild),We=s,Lt=!0,cr=null,on=!0,l=mv(s,null,f,l),s.child=l;l;)l.flags=l.flags&-3|4096,l=l.sibling}else{if(Ha(),f===d){s=Ln(r,s,l);break t}Me(r,s,f,l)}s=s.child}return s;case 26:return hu(r,s),r===null?(l=Pw(s.type,null,s.pendingProps,null))?s.memoizedState=l:Lt||(l=s.type,r=s.pendingProps,f=Fu(st.current).createElement(l),f[ke]=s,f[tA]=r,Ie(f,l,r),Fe(f),s.stateNode=f):s.memoizedState=Pw(s.type,r.memoizedProps,s.pendingProps,r.memoizedState),null;case 27:return da(s),r===null&&Lt&&(f=s.stateNode=kw(s.type,s.pendingProps,st.current),We=s,on=!0,d=fe,Si(s.type)?(dg=d,fe=ZA(f.firstChild)):fe=d),Me(r,s,s.pendingProps.children,l),hu(r,s),r===null&&(s.flags|=4194304),s.child;case 5:return r===null&&Lt&&((d=f=fe)&&(f=IS(f,s.type,s.pendingProps,on),f!==null?(s.stateNode=f,We=s,fe=ZA(f.firstChild),on=!1,d=!0):d=!1),d||fr(s)),da(s),d=s.type,p=s.pendingProps,w=r!==null?r.memoizedProps:null,f=p.children,ug(d,p)?f=null:w!==null&&ug(d,w)&&(s.flags|=32),s.memoizedState!==null&&(d=ld(r,s,eS,null,null,l),oo._currentValue=d),hu(r,s),Me(r,s,f,l),s.child;case 6:return r===null&&Lt&&((r=l=fe)&&(l=RS(l,s.pendingProps,on),l!==null?(s.stateNode=l,We=s,fe=null,r=!0):r=!1),r||fr(s)),null;case 13:return Iv(r,s,l);case 4:return qt(s,s.stateNode.containerInfo),f=s.pendingProps,r===null?s.child=Cs(s,null,f,l):Me(r,s,f,l),s.child;case 11:return Ev(r,s,s.type,s.pendingProps,l);case 7:return Me(r,s,s.pendingProps,l),s.child;case 8:return Me(r,s,s.pendingProps.children,l),s.child;case 12:return Me(r,s,s.pendingProps.children,l),s.child;case 10:return f=s.pendingProps,gi(s,s.type,f.value),Me(r,s,f.children,l),s.child;case 9:return d=s.type._context,f=s.pendingProps.children,dr(s),d=ze(d),f=f(d),s.flags|=1,Me(r,s,f,l),s.child;case 14:return Sv(r,s,s.type,s.pendingProps,l);case 15:return 
Hv(r,s,s.type,s.pendingProps,l);case 19:return Nv(r,s,l);case 31:return f=s.pendingProps,l=s.mode,f={mode:f.mode,children:f.children},r===null?(l=du(f,l),l.ref=s.ref,s.child=l,l.return=s,s=l):(l=En(r.child,f),l.ref=s.ref,s.child=l,l.return=s,s=l),s;case 22:return Ov(r,s,l);case 24:return dr(s),f=ze(_e),r===null?(d=td(),d===null&&(d=Yt,p=Jh(),d.pooledCache=p,p.refCount++,p!==null&&(d.pooledCacheLanes|=l),d=p),s.memoizedState={parent:f,cache:d},Ad(s),gi(s,_e,d)):((r.lanes&l)!==0&&(nd(r,s),Ka(s,null,null,l),Na()),d=r.memoizedState,p=s.memoizedState,d.parent!==f?(d={parent:f,cache:f},s.memoizedState=d,s.lanes===0&&(s.memoizedState=s.updateQueue.baseState=d),gi(s,_e,f)):(f=p.cache,gi(s,_e,f),f!==d.cache&&$h(s,[_e],l,!0))),Me(r,s,s.pendingProps.children,l),s.child;case 29:throw s.pendingProps}throw Error(n(156,s.tag))}function In(r){r.flags|=4}function kv(r,s){if(s.type!=="stylesheet"||(s.state.loading&4)!==0)r.flags&=-16777217;else if(r.flags|=16777216,!Yw(s)){if(s=MA.current,s!==null&&((Ht&4194048)===Ht?ln!==null:(Ht&62914560)!==Ht&&(Ht&536870912)===0||s!==ln))throw Ia=ed,_0;r.flags|=8192}}function gu(r,s){s!==null&&(r.flags|=4),r.flags&16384&&(s=r.tag!==22?vm():536870912,r.lanes|=s,Us|=s)}function Xa(r,s){if(!Lt)switch(r.tailMode){case"hidden":s=r.tail;for(var l=null;s!==null;)s.alternate!==null&&(l=s),s=s.sibling;l===null?r.tail=null:l.sibling=null;break;case"collapsed":l=r.tail;for(var f=null;l!==null;)l.alternate!==null&&(f=l),l=l.sibling;f===null?s||r.tail===null?r.tail=null:r.tail.sibling=null:f.sibling=null}}function le(r){var s=r.alternate!==null&&r.alternate.child===r.child,l=0,f=0;if(s)for(var d=r.child;d!==null;)l|=d.lanes|d.childLanes,f|=d.subtreeFlags&65011712,f|=d.flags&65011712,d.return=r,d=d.sibling;else for(d=r.child;d!==null;)l|=d.lanes|d.childLanes,f|=d.subtreeFlags,f|=d.flags,d.return=r,d=d.sibling;return r.subtreeFlags|=f,r.childLanes=l,s}function uS(r,s,l){var f=s.pendingProps;switch(Xh(s),s.tag){case 31:case 16:case 15:case 0:case 11:case 7:case 
8:case 12:case 9:case 14:return le(s),null;case 1:return le(s),null;case 3:return l=s.stateNode,f=null,r!==null&&(f=r.memoizedState.cache),s.memoizedState.cache!==f&&(s.flags|=2048),Tn(_e),FA(),l.pendingContext&&(l.context=l.pendingContext,l.pendingContext=null),(r===null||r.child===null)&&(Sa(s)?In(s):r===null||r.memoizedState.isDehydrated&&(s.flags&256)===0||(s.flags|=1024,v0())),le(s),null;case 26:return l=s.memoizedState,r===null?(In(s),l!==null?(le(s),kv(s,l)):(le(s),s.flags&=-16777217)):l?l!==r.memoizedState?(In(s),le(s),kv(s,l)):(le(s),s.flags&=-16777217):(r.memoizedProps!==f&&In(s),le(s),s.flags&=-16777217),null;case 27:$r(s),l=st.current;var d=s.type;if(r!==null&&s.stateNode!=null)r.memoizedProps!==f&&In(s);else{if(!f){if(s.stateNode===null)throw Error(n(166));return le(s),null}r=Y.current,Sa(s)?B0(s):(r=kw(d,f,l),s.stateNode=r,In(s))}return le(s),null;case 5:if($r(s),l=s.type,r!==null&&s.stateNode!=null)r.memoizedProps!==f&&In(s);else{if(!f){if(s.stateNode===null)throw Error(n(166));return le(s),null}if(r=Y.current,Sa(s))B0(s);else{switch(d=Fu(st.current),r){case 1:r=d.createElementNS("http://www.w3.org/2000/svg",l);break;case 2:r=d.createElementNS("http://www.w3.org/1998/Math/MathML",l);break;default:switch(l){case"svg":r=d.createElementNS("http://www.w3.org/2000/svg",l);break;case"math":r=d.createElementNS("http://www.w3.org/1998/Math/MathML",l);break;case"script":r=d.createElement("div"),r.innerHTML=" + diff --git a/vite-app/src/components/Dashboard.tsx b/vite-app/src/components/Dashboard.tsx index 7bedf67d..ace28316 100644 --- a/vite-app/src/components/Dashboard.tsx +++ b/vite-app/src/components/Dashboard.tsx @@ -87,17 +87,6 @@ const Dashboard = observer(({ onRefresh }: DashboardProps) => { return (
- {/* Summary */} -
-

- Dataset Summary -

-
- Total Rows:{" "} - {state.totalCount} -
-
- {/* Content Area */} {state.isLoading ? ( From 37dc41738d97b3c711423c67589d34f17b63fbf2 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Sun, 17 Aug 2025 17:02:46 -0700 Subject: [PATCH 25/26] chore: squash branch changes on top of main; adopt Ruff+Pyright; update CI; exclude vite dist; restore deleted files from main (bigquery adapter + vite src/readme) (#74) --- .flake8 | 3 - .github/workflows/ci.yml | 11 +- .pre-commit-config.yaml | 40 +- LICENSE | 2 +- development/normalize_sandbox_fusion.py | 19 +- .../notes/pytest_integration_proposal.md | 6 +- development/utils/subprocess_manager.py | 2 +- eval_protocol/adapters/CONTRIBUTING.md | 82 +- eval_protocol/adapters/huggingface.py | 2 +- eval_protocol/agent/orchestrator.py | 12 +- .../agent/resources/bfcl_sim_api_resource.py | 3 +- eval_protocol/agent/task_manager.py | 4 +- eval_protocol/benchmarks/__init__.py | 2 - eval_protocol/benchmarks/registry.py | 5 +- eval_protocol/benchmarks/run.py | 10 +- eval_protocol/benchmarks/suites/__init__.py | 2 - eval_protocol/benchmarks/suites/aime25.py | 2 +- eval_protocol/cli_commands/agent_eval_cmd.py | 2 +- eval_protocol/cli_commands/deploy.py | 11 +- eval_protocol/cli_commands/deploy_mcp.py | 11 +- eval_protocol/cli_commands/logs.py | 2 +- eval_protocol/cli_commands/preview.py | 6 +- eval_protocol/config.py | 6 +- eval_protocol/dataset_logger/__init__.py | 1 - eval_protocol/evaluation.py | 18 +- eval_protocol/execution/pipeline.py | 8 +- eval_protocol/gcp_tools.py | 6 +- eval_protocol/generation/clients.py | 4 +- eval_protocol/generic_server.py | 2 +- eval_protocol/integrations/trl.py | 2 +- eval_protocol/mcp/execution/base_policy.py | 3 +- eval_protocol/mcp/execution/manager.py | 3 +- eval_protocol/mcp/execution/policy.py | 2 +- eval_protocol/mcp/mcpgym.py | 3 +- eval_protocol/mcp/session/manager.py | 2 +- eval_protocol/mcp/simulation_server.py | 2 +- .../mcp_agent/intermediary_server.py | 3 +- 
.../orchestration/local_docker_client.py | 3 +- eval_protocol/mcp_agent/session.py | 6 +- eval_protocol/platform_api.py | 2 +- .../pytest/default_dataset_adapter.py | 2 +- .../default_mcp_gym_rollout_processor.py | 6 +- eval_protocol/pytest/evaluation_test.py | 18 +- eval_protocol/pytest/utils.py | 10 +- eval_protocol/resources.py | 4 +- eval_protocol/rewards/accuracy_length.py | 2 +- eval_protocol/rewards/apps_coding_reward.py | 2 +- eval_protocol/rewards/apps_execution_utils.py | 2 +- eval_protocol/rewards/apps_testing_util.py | 2 +- eval_protocol/rewards/bfcl_reward.py | 2 +- eval_protocol/rewards/code_execution.py | 1 - eval_protocol/rewards/cpp_code.py | 2 +- eval_protocol/rewards/deepcoder_reward.py | 4 +- eval_protocol/rewards/language_consistency.py | 2 +- eval_protocol/rewards/tag_count.py | 4 +- eval_protocol/stats/__init__.py | 2 - eval_protocol/stats/confidence_intervals.py | 2 - eval_protocol/utils/logs_server.py | 1 - examples/adapters/README.md | 36 - .../adapters/gsm8k_replacement_example.py | 118 +- examples/adapters/huggingface_example.py | 334 +-- examples/adapters/langfuse_example.py | 122 +- examples/aime2025_chat_completion/README.md | 3 - examples/aime2025_chat_completion/__init__.py | 3 - examples/aime2025_chat_completion/main.py | 6 +- examples/blackjack_mcp/blackjack_adapter.py | 6 +- examples/blackjack_mcp/blackjack_mcp.py | 2 +- .../tests/test_record_and_replay_e2e.py | 21 +- .../cliff_walking_adapter.py | 2 +- .../tests/test_cliff_walking_e2e.py | 15 +- .../frozen_lake_mcp/frozen_lake_adapter.py | 4 +- examples/frozen_lake_mcp/rollout_example.py | 8 +- examples/frozen_lake_mcp/test_seed_logging.py | 2 +- .../frozen_lake_mcp/test_termination_fix.py | 2 +- .../frozen_lake_mcp/test_validation_logic.py | 2 +- .../tests/test_frozen_lake_e2e.py | 15 +- .../generate_sample_images.py | 4 +- .../simple_trajectory_test.py | 10 +- .../test_lunar_lander_conda.py | 8 +- .../tests/test_lunar_lander_e2e.py | 19 +- 
.../mcp_agent_filesystem_rl/test_example.py | 7 +- examples/rollout_control_plane_demo.py | 9 +- .../airline_environment.py | 1 + .../mock_environment/mock_environment.py | 1 + .../retail_environment/retail_environment.py | 1 + examples/tau2_mcp/tests/test_tau2_e2e.py | 43 +- .../local_testing/test_north_star.py | 4 +- .../mcp_server/simulation_server.py | 5 +- .../trl_integration/working_grpo_example.py | 1 - local_evals/model_comparison_eval.ipynb | 2080 ++++++++--------- mypy.ini | 37 - pyproject.toml | 83 +- scripts/create_sample_gsm8k_jsonl.py | 2 +- tests/cli_commands/test_deploy_cmd.py | 7 +- tests/cli_commands/test_preview_cmd.py | 1 - tests/conftest.py | 1 + .../test_eval_protocol_simple.py | 3 +- .../test_minimal_structure.py | 2 +- tests/execution/test_pipeline.py | 2 +- .../orchestration/test_local_docker_client.py | 3 +- .../mcp_agent/test_rl_filesystem_scenario.py | 2 +- tests/pytest/data/basic_coding_dataset.jsonl | 2 +- tests/pytest/data/lunar_lander_dataset.jsonl | 2 +- .../helper/word_count_to_evaluation_row.py | 5 +- tests/pytest/test_livesvgbench.py | 34 +- tests/pytest/test_svgbench.py | 8 +- tests/test_adapters_e2e.py | 4 +- tests/test_agent_resources.py | 12 +- tests/test_batch_evaluation.py | 16 +- tests/test_cli_agent.py | 6 +- tests/test_cli_args.py | 1 - tests/test_code_execution.py | 8 +- tests/test_data_driven_task_manager.py | 5 - tests/test_deepeval_integration.py | 24 +- tests/test_deploy_integration.py | 1 - tests/test_e2b_integration.py | 2 +- tests/test_e2b_js_integration.py | 2 +- tests/test_eval_protocol_import.py | 66 +- tests/test_examples_end_to_end.py | 1 - tests/test_function_calling.py | 12 +- tests/test_gcp_tools.py | 2 - tests/test_generic_server.py | 3 +- tests/test_math.py | 7 +- tests/test_models_rl.py | 5 +- tests/test_packaging.py | 1 - tests/test_parallel_rollouts.py | 4 +- tests/test_platform_api.py | 1 - tests/test_readiness.py | 2 - tests/test_retry_mechanism.py | 22 +- tests/test_reward_protocol_import.py | 66 
+- tests/test_rl_processing.py | 5 +- .../test_rollout_control_plane_integration.py | 10 +- tests/test_typed_interface_rl.py | 1 - tests/test_url_handling.py | 1 + uv.lock | 405 +--- vendor/tau2/__init__.py | 1 - vendor/tau2/agent/README.md | 2 +- vendor/tau2/agent/base.py | 4 +- vendor/tau2/agent/llm_agent.py | 18 +- vendor/tau2/cli.py | 4 +- .../user_simulator/simulation_guidelines.md | 4 +- .../simulation_guidelines_tools.md | 4 +- vendor/tau2/data_model/__init__.py | 1 - vendor/tau2/data_model/message.py | 58 +- vendor/tau2/data_model/simulation.py | 58 +- vendor/tau2/data_model/tasks.py | 95 +- vendor/tau2/domains/airline/data_model.py | 88 +- vendor/tau2/domains/airline/tools.py | 79 +- vendor/tau2/domains/mock/data_model.py | 8 +- vendor/tau2/domains/mock/environment.py | 4 +- vendor/tau2/domains/mock/tools.py | 4 +- vendor/tau2/domains/retail/data_model.py | 96 +- vendor/tau2/domains/retail/tools.py | 33 +- vendor/tau2/domains/telecom/data_model.py | 104 +- vendor/tau2/domains/telecom/environment.py | 12 +- vendor/tau2/domains/telecom/tasks/const.py | 2 +- .../domains/telecom/tasks/create_tasks.py | 6 +- .../tau2/domains/telecom/tasks/mms_issues.py | 26 +- .../telecom/tasks/mobile_data_issues.py | 8 +- vendor/tau2/domains/telecom/tasks/utils.py | 4 +- vendor/tau2/domains/telecom/tools.py | 47 +- .../tau2/domains/telecom/user_data_model.py | 74 +- vendor/tau2/domains/telecom/user_tools.py | 102 +- vendor/tau2/environment/environment.py | 6 +- vendor/tau2/environment/server.py | 18 +- vendor/tau2/environment/tool.py | 12 +- vendor/tau2/environment/toolkit.py | 16 +- .../tau2/environment/utils/interface_agent.py | 16 +- vendor/tau2/evaluator/__init__.py | 1 - vendor/tau2/evaluator/evaluator.py | 4 +- vendor/tau2/evaluator/evaluator_action.py | 5 +- .../tau2/evaluator/evaluator_communicate.py | 8 +- vendor/tau2/evaluator/evaluator_env.py | 24 +- vendor/tau2/metrics/agent_metrics.py | 8 +- vendor/tau2/metrics/break_down_metrics.py | 26 +- 
.../tau2/orchestrator/environment_manager.py | 20 +- vendor/tau2/orchestrator/orchestrator.py | 108 +- vendor/tau2/orchestrator/utils.py | 4 +- vendor/tau2/registry.py | 50 +- vendor/tau2/run.py | 56 +- vendor/tau2/scripts/show_domain_doc.py | 4 +- vendor/tau2/scripts/start_servers.py | 8 +- vendor/tau2/scripts/view_simulations.py | 70 +- vendor/tau2/user/base.py | 12 +- vendor/tau2/user/user_simulator.py | 6 +- vendor/tau2/utils/display.py | 58 +- vendor/tau2/utils/llm_utils.py | 4 +- vendor/tau2/utils/pydantic_utils.py | 4 +- vendor/tau2/utils/utils.py | 10 +- versioneer.py | 4 +- 190 files changed, 2263 insertions(+), 3411 deletions(-) delete mode 100644 .flake8 delete mode 100644 mypy.ini diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 06945f46..00000000 --- a/.flake8 +++ /dev/null @@ -1,3 +0,0 @@ -[flake8] -max-line-length = 119 -ignore = E203, W503 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 08aaf406..8c1b0691 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,11 +42,14 @@ jobs: - name: Install tau2 for testing run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main - - name: Lint with flake8 - run: uv run flake8 eval_protocol tests examples scripts --count --exit-zero --max-complexity=10 --max-line-length=88 --statistics + - name: Ruff format (check) + run: uv run ruff format --check . - - name: Type check with mypy - run: uv run mypy eval_protocol + - name: Ruff lint + run: uv run ruff check . 
+ + - name: Type check with pyright + run: uv run pyright test-core: name: Core Tests (Python ${{ matrix.python-version }}) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43c0f8c1..441a2de7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,43 +1,29 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks +exclude: | + (^vite-app/|\.snap$) repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace + exclude: "(^vite-app/|\\.snap$)" - id: end-of-file-fixer + exclude: "(^vite-app/|\\.snap$)" - id: check-yaml - id: check-added-large-files - id: check-merge-conflict - id: check-toml - id: detect-private-key -- repo: https://github.com/psf/black - rev: 25.1.0 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.12.8 hooks: - - id: black - args: [--line-length=119] + - id: ruff-format + - id: ruff + args: ["--fix"] -- repo: https://github.com/pycqa/isort - rev: 6.0.1 +- repo: https://github.com/RobertCraigie/pyright-python + rev: v1.1.403 hooks: - - id: isort - name: isort (python) - args: ["--profile", "black", "--line-length", "119", "--filter-files"] - -- repo: https://github.com/pycqa/flake8 - rev: 7.3.0 - hooks: - - id: flake8 - args: [--max-line-length=119, --max-complexity=100, "--ignore=E402,F401,F541,W503,E203,F811,E226,F841,E704,E713,E712,E231,E731,E501"] - # additional_dependencies: [flake8-docstrings, flake8-import-order] # Optional: add flake8 plugins - -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.17.0 - hooks: - - id: mypy - args: [--ignore-missing-imports, --install-types, --non-interactive] - additional_dependencies: - - types-requests - - types-setuptools - # Add other types-* packages your project uses + - id: pyright diff --git a/LICENSE b/LICENSE index e926381a..4bff8e12 100644 --- a/LICENSE +++ b/LICENSE @@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND 
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. \ No newline at end of file +SOFTWARE. diff --git a/development/normalize_sandbox_fusion.py b/development/normalize_sandbox_fusion.py index fa191283..988f740e 100644 --- a/development/normalize_sandbox_fusion.py +++ b/development/normalize_sandbox_fusion.py @@ -56,7 +56,7 @@ try: repobench_p_tokenizer = AutoTokenizer.from_pretrained("gpt2") except OSError: - print("Warning: Could not load gpt2 tokenizer for Repobench-P. " "Falling back to basic split for token counting.") + print("Warning: Could not load gpt2 tokenizer for Repobench-P. Falling back to basic split for token counting.") repobench_p_tokenizer = None @@ -108,8 +108,7 @@ def format_aider_prompt(problem_json: dict) -> str: """Format the prompt for Aider benchmark style problems.""" question = problem_json.get("content", "") return ( - f"{question}\n\nPlease generate the code in the following format:\n" - "```python\n# Your code response here\n```" + f"{question}\n\nPlease generate the code in the following format:\n```python\n# Your code response here\n```" ) @@ -327,7 +326,7 @@ def normalize_problem_to_openai_format( try: labels = json.loads(labels_data) except json.JSONDecodeError: - print(f"Warning: Skipping ID {problem_id_str} in {filename} " "- malformed JSON in labels.") + print(f"Warning: Skipping ID {problem_id_str} in {filename} - malformed JSON in labels.") return None elif isinstance(labels_data, dict): labels = labels_data @@ -426,10 +425,10 @@ def normalize_problem_to_openai_format( ) return None if not final_user_content.strip() or not final_assistant_content.strip(): - print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "empty processed content.") + print(f"Warning: Skipping ID {problem_id_str} in 
{filename} - empty processed content.") return None if final_assistant_content.strip() == "import sys; sys.exit(0)": - print(f"Warning: Skipping ID {problem_id_str} in {filename} - " "placeholder solution.") + print(f"Warning: Skipping ID {problem_id_str} in {filename} - placeholder solution.") return None return { @@ -439,7 +438,7 @@ def normalize_problem_to_openai_format( ] } except Exception as e: - print(f"Warning: Skipping ID {problem_id_str} in {filename} - " f"error ({type(e).__name__}: {e}).") + print(f"Warning: Skipping ID {problem_id_str} in {filename} - error ({type(e).__name__}: {e}).") import traceback traceback.print_exc() @@ -474,7 +473,7 @@ def main(): file_error_count += 1 continue - print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: " f"{filename}...") + print(f"Processing file {filename_idx + 1}/{len(ALL_SOURCE_JSONL_FILES)}: {filename}...") lines_in_file = 0 processed_in_file = 0 skipped_in_file = 0 @@ -488,7 +487,7 @@ def main(): try: problem_data = json.loads(stripped_line) except json.JSONDecodeError: - print(f"Warning: Malformed JSON on line {line_number} " f"in {filepath}. Skipping line.") + print(f"Warning: Malformed JSON on line {line_number} in {filepath}. Skipping line.") skipped_in_file += 1 continue @@ -507,7 +506,7 @@ def main(): processed_count += processed_in_file skipped_count += skipped_in_file except Exception as e: - print(f"Error processing file {filepath}: {type(e).__name__}: {e}. " "Skipping rest of file.") + print(f"Error processing file {filepath}: {type(e).__name__}: {e}. 
Skipping rest of file.") import traceback traceback.print_exc() diff --git a/development/notes/pytest_integration_proposal.md b/development/notes/pytest_integration_proposal.md index c9496587..784cc215 100644 --- a/development/notes/pytest_integration_proposal.md +++ b/development/notes/pytest_integration_proposal.md @@ -115,7 +115,7 @@ def tau2_rollout_processor(row: EvaluationRow, model: str, input_params: Dict, * # from the dataset and provide a simulated tool response. # 4. Call the model again with the tool response. # 5. Construct a final EvaluationRow with the full transcript. - + # The logic is encapsulated here, away from the test definition. processed_row = ep.default_rollout_processor(row, model, input_params)[0] # Simplified for example return [processed_row] @@ -186,11 +186,11 @@ def best_of_n_processor(row: EvaluationRow, model: str, input_params: Dict, **kw # Then, apply a reward function to score each candidate. scored_rows = ep.evaluate(candidate_rows, score_politeness) - + # Finally, select the best row. # This logic could be encapsulated in a helper, e.g., ep.select_best(). best_row = select_best_by_group(scored_rows, score_key='politeness') - + return [best_row] @evaluation_test( diff --git a/development/utils/subprocess_manager.py b/development/utils/subprocess_manager.py index 5af6c768..1d568c87 100644 --- a/development/utils/subprocess_manager.py +++ b/development/utils/subprocess_manager.py @@ -139,7 +139,7 @@ def start_ngrok_and_get_url( # Or by setting NGROK_AUTHTOKEN environment variable. # Forcing it via command line is also an option but less common for persistent setup. print( - f"Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken ') or via NGROK_AUTHTOKEN env var." + "Note: Ngrok authtoken should be pre-configured by the user (e.g., 'ngrok config add-authtoken ') or via NGROK_AUTHTOKEN env var." 
) # Example if passing via env for the subprocess: # ngrok_env = os.environ.copy() diff --git a/eval_protocol/adapters/CONTRIBUTING.md b/eval_protocol/adapters/CONTRIBUTING.md index 18f31378..e47e06e9 100644 --- a/eval_protocol/adapters/CONTRIBUTING.md +++ b/eval_protocol/adapters/CONTRIBUTING.md @@ -37,36 +37,36 @@ except ImportError: class YourCustomAdapter: """Adapter for integrating with Your Custom Data Source. - + This adapter loads data from Your Custom Data Source and converts it to EvaluationRow format for use in evaluation pipelines. - + Examples: Basic usage: >>> adapter = YourCustomAdapter(api_key="your_key") >>> rows = list(adapter.get_evaluation_rows(limit=10)) """ - + def __init__(self, **config): """Initialize the adapter with configuration.""" if not DEPENDENCY_AVAILABLE: raise ImportError("your_external_library not installed") - + # Initialize your client/connection here self.client = your_external_library.Client(**config) - + def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]: """Main method to fetch and convert data to EvaluationRow format. - + Args: **kwargs: Adapter-specific parameters - + Yields: EvaluationRow: Converted evaluation rows """ # Implement your data fetching logic raw_data = self.client.fetch_data(**kwargs) - + for item in raw_data: try: eval_row = self._convert_to_evaluation_row(item) @@ -75,51 +75,51 @@ class YourCustomAdapter: except Exception as e: logger.warning(f"Failed to convert item: {e}") continue - + def _convert_to_evaluation_row(self, raw_item: Any) -> Optional[EvaluationRow]: """Convert a raw data item to EvaluationRow format. 
- + Args: raw_item: Raw data item from your source - + Returns: EvaluationRow or None if conversion fails """ # Extract messages from your data format messages = self._extract_messages(raw_item) - + # Extract metadata input_metadata = self._create_input_metadata(raw_item) - + # Extract ground truth if available ground_truth = self._extract_ground_truth(raw_item) - + # Extract tools if available (for tool calling scenarios) tools = self._extract_tools(raw_item) - + return EvaluationRow( messages=messages, tools=tools, input_metadata=input_metadata, ground_truth=ground_truth, ) - + def _extract_messages(self, raw_item: Any) -> List[Message]: """Extract conversation messages from raw data.""" # Implement message extraction logic # Convert your data format to List[Message] pass - + def _create_input_metadata(self, raw_item: Any) -> InputMetadata: """Create InputMetadata from raw data.""" # Implement metadata extraction pass - + def _extract_ground_truth(self, raw_item: Any) -> Optional[str]: """Extract ground truth if available.""" # Implement ground truth extraction pass - + def _extract_tools(self, raw_item: Any) -> Optional[List[Dict[str, Any]]]: """Extract tool definitions if available.""" # Implement tool extraction for tool calling scenarios @@ -149,7 +149,7 @@ message = Message( content="I'll help you with that calculation.", tool_calls=[{ "id": "call_123", - "type": "function", + "type": "function", "function": { "name": "calculate", "arguments": '{"x": 5, "y": 3}' @@ -185,7 +185,7 @@ input_metadata = InputMetadata( }, session_data={ "user_id": "user123", - "session_id": "session456", + "session_id": "session456", "timestamp": "2024-01-01T00:00:00Z", } ) @@ -259,7 +259,7 @@ def get_evaluation_rows(self, **kwargs) -> Iterator[EvaluationRow]: except Exception as e: logger.error(f"Failed to fetch data: {e}") return - + for item in data: try: row = self._convert_to_evaluation_row(item) @@ -298,36 +298,36 @@ from eval_protocol.models import EvaluationRow class 
TestYourCustomAdapter: """Test suite for YourCustomAdapter.""" - + def test_initialization(self): """Test adapter initialization.""" adapter = YourCustomAdapter(api_key="test_key") assert adapter.client is not None - + def test_get_evaluation_rows(self): """Test conversion to EvaluationRow format.""" adapter = YourCustomAdapter(api_key="test_key") - + # Mock the external API response with patch.object(adapter.client, 'fetch_data') as mock_fetch: mock_fetch.return_value = [ # Mock data in your format {"id": "1", "question": "Test?", "answer": "Yes"} ] - + rows = list(adapter.get_evaluation_rows(limit=1)) - + assert len(rows) == 1 assert isinstance(rows[0], EvaluationRow) assert len(rows[0].messages) > 0 - + def test_error_handling(self): """Test error handling.""" adapter = YourCustomAdapter(api_key="test_key") - + with patch.object(adapter.client, 'fetch_data') as mock_fetch: mock_fetch.side_effect = Exception("API Error") - + rows = list(adapter.get_evaluation_rows()) assert len(rows) == 0 # Should handle error gracefully ``` @@ -341,18 +341,18 @@ For simple chat data: ```python def _extract_messages(self, conversation: Dict) -> List[Message]: messages = [] - + # Add system prompt if available if conversation.get('system_prompt'): messages.append(Message(role="system", content=conversation['system_prompt'])) - + # Add conversation turns for turn in conversation['turns']: messages.append(Message( role=turn['role'], content=turn['content'] )) - + return messages ``` @@ -363,27 +363,27 @@ For tool calling scenarios: ```python def _extract_messages(self, trace: Dict) -> List[Message]: messages = [] - + for step in trace['steps']: if step['type'] == 'user_message': messages.append(Message(role="user", content=step['content'])) - + elif step['type'] == 'assistant_message': message = Message(role="assistant", content=step.get('content')) - + # Add tool calls if present if step.get('tool_calls'): message.tool_calls = step['tool_calls'] - + messages.append(message) - + 
elif step['type'] == 'tool_response': messages.append(Message( role="tool", content=step['content'], tool_call_id=step['tool_call_id'] )) - + return messages ``` @@ -515,10 +515,10 @@ Here are some potential adapters that would be valuable: - **OpenAI Evals**: Load data from OpenAI's evals repository - **LLM Evaluation Datasets**: MMLU, HellaSwag, etc. -- **Chat Platforms**: Discord, Slack conversation exports +- **Chat Platforms**: Discord, Slack conversation exports - **Monitoring Tools**: Other observability platforms - **Custom APIs**: Company-specific data sources - **File Formats**: Parquet, Excel, database exports - **Research Datasets**: Academic benchmarks and competitions -We welcome contributions for any of these or other creative integrations! \ No newline at end of file +We welcome contributions for any of these or other creative integrations! diff --git a/eval_protocol/adapters/huggingface.py b/eval_protocol/adapters/huggingface.py index 2825dafa..7f8b6902 100644 --- a/eval_protocol/adapters/huggingface.py +++ b/eval_protocol/adapters/huggingface.py @@ -413,7 +413,7 @@ def create_math_adapter( HuggingFaceAdapter configured for MATH dataset """ default_system_prompt = ( - "You are an expert mathematician. Solve this advanced math problem " "step by step, showing detailed work." + "You are an expert mathematician. Solve this advanced math problem step by step, showing detailed work." 
) system_content = system_prompt or default_system_prompt diff --git a/eval_protocol/agent/orchestrator.py b/eval_protocol/agent/orchestrator.py index 2f737e2c..61be1091 100644 --- a/eval_protocol/agent/orchestrator.py +++ b/eval_protocol/agent/orchestrator.py @@ -416,9 +416,9 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) - episode_resource: Optional[ForkableResource] = None evaluation_result: Optional[Dict[str, Any]] = None - all_user_turns_successful_function_calls: List[List[Dict[str, Any]]] = ( - [] - ) # Track successful calls for reward fn, list of lists (per user turn) + all_user_turns_successful_function_calls: List[ + List[Dict[str, Any]] + ] = [] # Track successful calls for reward fn, list of lists (per user turn) conversation_messages: List[Dict[str, Any]] = [] # Use dicts for API compatibility # --- Agent Model Setup --- @@ -845,16 +845,16 @@ async def execute_task_poc(self, sample_data: Optional[Dict[str, Any]] = None) - eval_args["ground_truth"] = ground_truth_for_reward # Call the reward function - self.logger.info(f"=== CALLING REWARD FUNCTION DEBUG ===") + self.logger.info("=== CALLING REWARD FUNCTION DEBUG ===") self.logger.info(f"Reward function type: {type(self.reward_function)}") self.logger.info(f"Eval args keys: {list(eval_args.keys())}") self.logger.info(f"Task achieved: {eval_args.get('task_achieved', 'NOT_SET')}") self.logger.info(f"Messages count: {len(eval_args.get('messages', []))}") evaluation_result = self.reward_function(**eval_args) - self.logger.info(f"=== REWARD FUNCTION RESULT ===") + self.logger.info("=== REWARD FUNCTION RESULT ===") self.logger.info(f"Reward function result: {evaluation_result}") self.logger.info(f"Result type: {type(evaluation_result)}") - self.logger.info(f"=== END REWARD FUNCTION DEBUG ===") + self.logger.info("=== END REWARD FUNCTION DEBUG ===") # Return both the evaluation result and the inputs for trajectory capture return { diff --git 
a/eval_protocol/agent/resources/bfcl_sim_api_resource.py b/eval_protocol/agent/resources/bfcl_sim_api_resource.py index 638c915e..8bee80a7 100644 --- a/eval_protocol/agent/resources/bfcl_sim_api_resource.py +++ b/eval_protocol/agent/resources/bfcl_sim_api_resource.py @@ -4,8 +4,7 @@ from pathlib import Path # Import BFCL File and Directory for isinstance checks from local implementation -from .bfcl_envs.gorilla_file_system import Directory as BFCLDirectory -from .bfcl_envs.gorilla_file_system import File as BFCLFile +from .bfcl_envs.gorilla_file_system import Directory as BFCLDirectory, File as BFCLFile BFCL_TYPES_AVAILABLE = True import gc diff --git a/eval_protocol/agent/task_manager.py b/eval_protocol/agent/task_manager.py index 7e6ee6e3..5cff2f9d 100644 --- a/eval_protocol/agent/task_manager.py +++ b/eval_protocol/agent/task_manager.py @@ -918,7 +918,7 @@ def _save_detailed_results( output_path = Path(output_file) try: - self.logger.info(f"=== TRAJECTORY SAVE DEBUG START ===") + self.logger.info("=== TRAJECTORY SAVE DEBUG START ===") self.logger.info(f"Saving trajectory data to: {output_path}") self.logger.info(f"Chosen directory: {chosen_dir}") self.logger.info(f"Individual results count: {len(aggregated_result.get('individual_results', []))}") @@ -992,7 +992,7 @@ def _save_detailed_results( self.logger.info(f"Successfully saved trajectory data to: {output_path}") self.logger.info(f"Trajectory file size: {output_path.stat().st_size} bytes") - self.logger.info(f"=== TRAJECTORY SAVE DEBUG END ===") + self.logger.info("=== TRAJECTORY SAVE DEBUG END ===") return str(output_path) except Exception as e: diff --git a/eval_protocol/benchmarks/__init__.py b/eval_protocol/benchmarks/__init__.py index 18a872c7..e248fe9b 100644 --- a/eval_protocol/benchmarks/__init__.py +++ b/eval_protocol/benchmarks/__init__.py @@ -5,5 +5,3 @@ "get_benchmark_runner", "list_benchmarks", ] - - diff --git a/eval_protocol/benchmarks/registry.py b/eval_protocol/benchmarks/registry.py index 
31840fd1..ce3c698e 100644 --- a/eval_protocol/benchmarks/registry.py +++ b/eval_protocol/benchmarks/registry.py @@ -199,6 +199,7 @@ def _composite_runner( _get_benchmark_runner = get_benchmark_runner import pathlib as _pathlib import time as _time + _json = json child_summaries: List[Dict[str, Any]] = [] @@ -302,9 +303,7 @@ def _composite_runner( f"EP Summary | suite={name} model={model} agg={combined['agg_score']:.3f} rows={total_rows}" ) else: - print( - f"EP Summary | suite={name} model={model} agg=None rows={total_rows}" - ) + print(f"EP Summary | suite={name} model={model} agg=None rows={total_rows}") except Exception: pass diff --git a/eval_protocol/benchmarks/run.py b/eval_protocol/benchmarks/run.py index 9195666f..a5afe900 100644 --- a/eval_protocol/benchmarks/run.py +++ b/eval_protocol/benchmarks/run.py @@ -14,10 +14,10 @@ from __future__ import annotations import argparse +import pkgutil +from importlib import import_module from typing import Any -from importlib import import_module -import pkgutil import eval_protocol.benchmarks.suites as suites_pkg from eval_protocol.benchmarks.registry import get_benchmark_runner, list_benchmarks @@ -49,7 +49,9 @@ def main() -> int: args = _parse_args() # Auto-import all suite modules so their @export_benchmark decorators register # Import all suite modules so their @export_benchmark decorators register - import sys, traceback + import sys + import traceback + for modinfo in pkgutil.iter_modules(suites_pkg.__path__): mod_name = f"{suites_pkg.__name__}.{modinfo.name}" try: @@ -96,5 +98,3 @@ def main() -> int: if __name__ == "__main__": raise SystemExit(main()) - - diff --git a/eval_protocol/benchmarks/suites/__init__.py b/eval_protocol/benchmarks/suites/__init__.py index 04746ad6..d0effd69 100644 --- a/eval_protocol/benchmarks/suites/__init__.py +++ b/eval_protocol/benchmarks/suites/__init__.py @@ -1,3 +1 @@ # Suite modules are auto-imported by eval_protocol.benchmarks.run to register benchmarks. 
- - diff --git a/eval_protocol/benchmarks/suites/aime25.py b/eval_protocol/benchmarks/suites/aime25.py index 92d7bedc..755795df 100644 --- a/eval_protocol/benchmarks/suites/aime25.py +++ b/eval_protocol/benchmarks/suites/aime25.py @@ -8,7 +8,7 @@ from eval_protocol.pytest.evaluation_test import evaluation_test SYSTEM_PROMPT = ( - "You are a helpful math assistant. Please reason step by step, and put your " "final answer within \\boxed{...}." + "You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}." ) diff --git a/eval_protocol/cli_commands/agent_eval_cmd.py b/eval_protocol/cli_commands/agent_eval_cmd.py index f49ab0ea..08767d50 100644 --- a/eval_protocol/cli_commands/agent_eval_cmd.py +++ b/eval_protocol/cli_commands/agent_eval_cmd.py @@ -115,7 +115,7 @@ async def main_flow(): logger.info(f" - Standard deviation: {result.get('std_dev', 0.0):.4f}") logger.info(f" - Score range: {result['min_score']:.4f} - {result['max_score']:.4f}") if "aggregated_metrics" in result: - logger.info(f" - Aggregated metrics:") + logger.info(" - Aggregated metrics:") for metric_name, metric_data in result["aggregated_metrics"].items(): logger.info( f" * {metric_name}: avg={metric_data['avg_score']:.4f}, range={metric_data['min_score']:.4f}-{metric_data['max_score']:.4f}" diff --git a/eval_protocol/cli_commands/deploy.py b/eval_protocol/cli_commands/deploy.py index 7536969a..7250e291 100644 --- a/eval_protocol/cli_commands/deploy.py +++ b/eval_protocol/cli_commands/deploy.py @@ -16,8 +16,8 @@ # TODO: Consider moving subprocess_manager functions to a more central location if used by core CLI try: - from development.utils.subprocess_manager import start_ngrok_and_get_url # Added ngrok function from development.utils.subprocess_manager import ( + start_ngrok_and_get_url, # Added ngrok function start_process, start_serveo_and_get_url, stop_process, @@ -59,9 +59,12 @@ def start_ngrok_and_get_url(local_port, log_path): from 
eval_protocol.auth import get_fireworks_account_id -from eval_protocol.config import GCPCloudRunConfig, RewardKitConfig -from eval_protocol.config import _config_file_path as global_loaded_config_path -from eval_protocol.config import get_config +from eval_protocol.config import ( + GCPCloudRunConfig, + RewardKitConfig, + _config_file_path as global_loaded_config_path, + get_config, +) from eval_protocol.evaluation import create_evaluation from eval_protocol.gcp_tools import ( build_and_push_docker_image, diff --git a/eval_protocol/cli_commands/deploy_mcp.py b/eval_protocol/cli_commands/deploy_mcp.py index f71796d6..34cb6a6f 100644 --- a/eval_protocol/cli_commands/deploy_mcp.py +++ b/eval_protocol/cli_commands/deploy_mcp.py @@ -9,9 +9,12 @@ from pathlib import Path from typing import Dict, Optional -from eval_protocol.config import GCPCloudRunConfig, RewardKitConfig -from eval_protocol.config import _config_file_path as global_loaded_config_path -from eval_protocol.config import get_config +from eval_protocol.config import ( + GCPCloudRunConfig, + RewardKitConfig, + _config_file_path as global_loaded_config_path, + get_config, +) from eval_protocol.gcp_tools import ( build_and_push_docker_image, deploy_to_cloud_run, @@ -235,7 +238,7 @@ def _deploy_mcp_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml): print("Failed to deploy to Cloud Run or retrieve service URL. 
Aborting.") return None - print(f"🚀 Successfully deployed MCP server to Cloud Run!") + print("🚀 Successfully deployed MCP server to Cloud Run!") print(f"📍 Service URL: {cloud_run_service_url}") print(f"🔗 MCP Connection URL: {cloud_run_service_url}") print(f"📋 Service Name: {args.id}") diff --git a/eval_protocol/cli_commands/logs.py b/eval_protocol/cli_commands/logs.py index 07c211be..92b1be58 100644 --- a/eval_protocol/cli_commands/logs.py +++ b/eval_protocol/cli_commands/logs.py @@ -12,7 +12,7 @@ def logs_command(args): """Serve logs with file watching and real-time updates""" port = args.port - print(f"🚀 Starting Eval Protocol Logs Server") + print("🚀 Starting Eval Protocol Logs Server") print(f"🌐 URL: http://localhost:{port}") print(f"🔌 WebSocket: ws://localhost:{port}/ws") print(f"👀 Watching paths: {['current directory']}") diff --git a/eval_protocol/cli_commands/preview.py b/eval_protocol/cli_commands/preview.py index 0df8e95a..ef438496 100644 --- a/eval_protocol/cli_commands/preview.py +++ b/eval_protocol/cli_commands/preview.py @@ -110,7 +110,7 @@ def preview_command(args): processed_messages.append(msg_item) else: print( - f"Warning: Sample {i+1} has unexpected message item type: {type(msg_item)}. Skipping this message item." + f"Warning: Sample {i + 1} has unexpected message item type: {type(msg_item)}. Skipping this message item." 
) try: @@ -120,13 +120,13 @@ def preview_command(args): kwargs=sample_kwargs, ) except Exception as e: # Pydantic validation for EvaluationRequest - print(f"\n--- Sample {i+1} ---") + print(f"\n--- Sample {i + 1} ---") print(f" Error creating request payload for sample: {e}") print(f" Sample data: {sample_data}") print("--- End Sample ---") continue # Skip to next sample - print(f"\n--- Sample {i+1} ---") + print(f"\n--- Sample {i + 1} ---") try: response = requests.post( diff --git a/eval_protocol/config.py b/eval_protocol/config.py index a1fee7d6..7d75ab01 100644 --- a/eval_protocol/config.py +++ b/eval_protocol/config.py @@ -31,9 +31,9 @@ class RewardKitConfig(BaseModel): default_deployment_target: Optional[Literal["gcp-cloud-run", "aws-lambda", "fireworks", "local"]] = "fireworks" gcp_cloud_run: Optional[GCPCloudRunConfig] = GCPCloudRunConfig() aws_lambda: Optional[AWSLambdaConfig] = AWSLambdaConfig() - evaluator_endpoint_keys: Optional[Dict[str, str]] = ( - {} # Stores generated API keys for self-hosted evaluator endpoints - ) + evaluator_endpoint_keys: Optional[ + Dict[str, str] + ] = {} # Stores generated API keys for self-hosted evaluator endpoints # --- Global variable to hold the loaded configuration --- diff --git a/eval_protocol/dataset_logger/__init__.py b/eval_protocol/dataset_logger/__init__.py index 4d04ce7d..c087b6cd 100644 --- a/eval_protocol/dataset_logger/__init__.py +++ b/eval_protocol/dataset_logger/__init__.py @@ -22,7 +22,6 @@ def read(self, rollout_id=None): # Lazy property that creates the logger only when accessed class _LazyLogger(DatasetLogger): - def log(self, row): return _get_default_logger().log(row) diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py index ee71a37e..fe58bb8a 100644 --- a/eval_protocol/evaluation.py +++ b/eval_protocol/evaluation.py @@ -309,7 +309,7 @@ def load_multi_metrics_folder(self, folder_path): files = self._load_python_files_from_folder(folder_path) self.code_files = files - 
logger.info(f"Loaded {len(files)} Python files from {folder_path} " f"for multi-metrics evaluation") + logger.info(f"Loaded {len(files)} Python files from {folder_path} for multi-metrics evaluation") return files def load_samples_from_jsonl(self, sample_file, max_samples=5): @@ -327,7 +327,7 @@ def load_samples_from_jsonl(self, sample_file, max_samples=5): sample = json.loads(line) samples.append(sample) except json.JSONDecodeError: - logger.warning(f"Invalid JSON on line {i+1}, skipping") + logger.warning(f"Invalid JSON on line {i + 1}, skipping") logger.info(f"Loaded {len(samples)} samples from {sample_file}") return samples @@ -444,7 +444,9 @@ def _get_combined_requirements(self) -> str: # Fallback for multi_metrics if requirements were loaded differently (hypothetical) # This attribute doesn't exist yet, placeholder for future enhancement if needed. if self._loaded_multi_metric_requirements_str: # type: ignore - requirements_list = [r.strip() for r in self._loaded_multi_metric_requirements_str.splitlines() if r.strip()] # type: ignore + requirements_list = [ + r.strip() for r in self._loaded_multi_metric_requirements_str.splitlines() if r.strip() + ] # type: ignore for req_item in requirements_list: all_requirements_set.add(req_item) @@ -458,7 +460,7 @@ def _simulated_preview(self, samples): for i, sample in enumerate(samples): try: if "messages" not in sample: - raise ValueError(f"Sample {i+1} is missing 'messages' field") + raise ValueError(f"Sample {i + 1} is missing 'messages' field") _ = sample.get("messages", []) _ = sample.get("ground_truth", []) _ = sample.get("tools", []) @@ -486,7 +488,7 @@ def _simulated_preview(self, samples): per_metric_evals=per_metric_evals, ) except Exception as e: - logger.error(f"Error processing sample {i+1}: {str(e)}") + logger.error(f"Error processing sample {i + 1}: {str(e)}") preview_result.add_result( sample_index=i, success=False, @@ -873,7 +875,7 @@ def preview_folder_evaluation( # This function might become redundant 
or need t if has_main_py and not multi_metrics: py_files = list(Path(evaluator_folder).glob("*.py")) if len(py_files) > 1: - logger.info(f"Auto-detecting multi-metrics mode based on folder structure for preview_folder_evaluation") + logger.info("Auto-detecting multi-metrics mode based on folder structure for preview_folder_evaluation") detected_multi_metrics = True # Call the unified preview_evaluation @@ -947,7 +949,7 @@ def create_evaluation( ) elif ts_mode_config: # ts_mode_config already handled in Evaluator.__init__ for self.code_files - logger.info(f"Configuring evaluator with direct Python code snippet (ts_mode).") + logger.info("Configuring evaluator with direct Python code snippet (ts_mode).") elif multi_metrics: # Folder-based multi_metrics if not folder: raise ValueError("`folder` must be specified for folder-based multi_metrics mode.") @@ -1008,7 +1010,7 @@ def deploy_folder_evaluation( # This function might become redundant or need to if has_main_py and not multi_metrics: # If user says not multi_metrics, but main.py is at root py_files = list(Path(evaluator_folder_abs).glob("*.py")) if len(py_files) > 1: # Heuristic: if multiple .py files at root with main.py, likely multi-metric - logger.info(f"Auto-detecting multi-metrics mode for deploy_folder_evaluation.") + logger.info("Auto-detecting multi-metrics mode for deploy_folder_evaluation.") detected_multi_metrics = True if detected_multi_metrics: diff --git a/eval_protocol/execution/pipeline.py b/eval_protocol/execution/pipeline.py index 7e7f7c29..e644ba32 100644 --- a/eval_protocol/execution/pipeline.py +++ b/eval_protocol/execution/pipeline.py @@ -78,7 +78,7 @@ def __init__(self, pipeline_cfg: DictConfig): if self.cfg.get("agent") and self.cfg.agent.get("type") == "mcp_agent": if not self.cfg.agent.get("intermediary_server_url"): raise ValueError("agent.intermediary_server_url must be configured for mcp_agent type.") - logger.info(f"Pipeline configured for mcp_agent. 
IntermediaryMCPClient will be initialized in run().") + logger.info("Pipeline configured for mcp_agent. IntermediaryMCPClient will be initialized in run().") async def _discover_tools_for_sample(self, sample_id: str, mcp_backend_ref: str) -> List[Dict[str, Any]]: """Discover available tools from MCP backend for a sample.""" @@ -348,7 +348,7 @@ async def _execute_mcp_agent_rollout( ) except Exception as e_tool_exec: logger.error( - f"Sample {sample_id}, Turn {turn_num+1}: Error executing/parsing tool '{tool_name}': {e_tool_exec}", + f"Sample {sample_id}, Turn {turn_num + 1}: Error executing/parsing tool '{tool_name}': {e_tool_exec}", exc_info=True, ) error_payload = {"error": str(e_tool_exec)} @@ -381,11 +381,11 @@ async def _execute_mcp_agent_rollout( assistant_msg_for_history["content"] = final_llm_text_response current_messages_for_rollout.append(assistant_msg_for_history) final_assistant_output_for_log = final_llm_text_response - logger.info(f"Sample {sample_id}, Turn {turn_num+1}: LLM responded with text. Ending rollout.") + logger.info(f"Sample {sample_id}, Turn {turn_num + 1}: LLM responded with text. Ending rollout.") break else: logger.warning( - f"Sample {sample_id}, Turn {turn_num+1}: LLM provided no content or tool calls. Ending rollout." + f"Sample {sample_id}, Turn {turn_num + 1}: LLM provided no content or tool calls. Ending rollout." ) final_llm_text_response = "LLM provided no actionable response in this turn." assistant_msg_for_history["content"] = final_llm_text_response diff --git a/eval_protocol/gcp_tools.py b/eval_protocol/gcp_tools.py index b19089ca..b6131157 100644 --- a/eval_protocol/gcp_tools.py +++ b/eval_protocol/gcp_tools.py @@ -453,7 +453,7 @@ def ensure_gcp_secret( dry_run=True, ) - print(f"\n2. Simulating deploy to Cloud Run (dry_run=True)") + print("\n2. 
Simulating deploy to Cloud Run (dry_run=True)") deploy_to_cloud_run( service_name="my-reward-service", image_name_tag=ar_img_name, # Use AR image name @@ -465,7 +465,7 @@ def ensure_gcp_secret( dry_run=True, ) - print(f"\n3. Simulating ensure_artifact_registry_repo_exists (dry_run=True)") + print("\n3. Simulating ensure_artifact_registry_repo_exists (dry_run=True)") ensure_artifact_registry_repo_exists( project_id="my-test-project", region="us-central1", @@ -473,7 +473,7 @@ def ensure_gcp_secret( dry_run=True, ) - print(f"\n4. Simulating ensure_gcp_secret (dry_run=True)") + print("\n4. Simulating ensure_gcp_secret (dry_run=True)") ensure_gcp_secret( project_id="my-test-project", secret_id="my-test-api-key-secret", diff --git a/eval_protocol/generation/clients.py b/eval_protocol/generation/clients.py index 45be6ab0..873f587e 100644 --- a/eval_protocol/generation/clients.py +++ b/eval_protocol/generation/clients.py @@ -226,7 +226,7 @@ async def generate( # ... (rest of the error handling as before) ... elif response.status == 429: # Rate limit retry_after = int(response.headers.get("Retry-After", "5")) - logger.warning(f"Rate limited. Retrying after {retry_after}s (attempt {attempt+1}).") + logger.warning(f"Rate limited. Retrying after {retry_after}s (attempt {attempt + 1}).") await asyncio.sleep(retry_after) elif response.status in [401, 403]: # Auth errors error_text = await response.text() @@ -234,7 +234,7 @@ async def generate( return GenerationResult() # Empty result on auth error elif response.status >= 500: # Server errors logger.warning( - f"Fireworks API Server Error ({response.status}). Retrying (attempt {attempt+1})." + f"Fireworks API Server Error ({response.status}). Retrying (attempt {attempt + 1})." 
) await asyncio.sleep(2**attempt) else: # Other client errors diff --git a/eval_protocol/generic_server.py b/eval_protocol/generic_server.py index 7efd444b..04e111e2 100644 --- a/eval_protocol/generic_server.py +++ b/eval_protocol/generic_server.py @@ -153,7 +153,7 @@ def load_reward_function(import_string: str): try: load_reward_function(args.import_string) except Exception: - print(f"Failed to load reward function. Exiting.") + print("Failed to load reward function. Exiting.") exit(1) if not _LOADED_REWARD_FUNCTION: diff --git a/eval_protocol/integrations/trl.py b/eval_protocol/integrations/trl.py index 62f89a34..9873c593 100644 --- a/eval_protocol/integrations/trl.py +++ b/eval_protocol/integrations/trl.py @@ -180,7 +180,7 @@ def trl_reward_pipeline( if scores: logger.debug( f"Batch rewards calculated by TRL adapter. Count: {len(scores)}, " - f"Min: {min(scores)}, Max: {max(scores)}, Avg: {sum(scores)/len(scores):.2f}" + f"Min: {min(scores)}, Max: {max(scores)}, Avg: {sum(scores) / len(scores):.2f}" ) return scores diff --git a/eval_protocol/mcp/execution/base_policy.py b/eval_protocol/mcp/execution/base_policy.py index 819b33dd..bdced48a 100644 --- a/eval_protocol/mcp/execution/base_policy.py +++ b/eval_protocol/mcp/execution/base_policy.py @@ -109,7 +109,6 @@ def add_tool_response( # Add control plane metadata if provided if reward != 0.0 or terminated or info: - tool_message["metadata"] = { "reward": reward, "terminated": terminated, @@ -182,7 +181,7 @@ async def _generate_live_tool_calls( # This is crucial for proper tool call ID management in add_tool_response assistant_message_for_history = { "role": "assistant", - "content": response["choices"][0]["message"].get("content", ""), + "content": response["choices"][0]["message"]["content"], } usage_stats = CompletionUsage( prompt_tokens=response["usage"]["prompt_tokens"], diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index b0359d79..1a36afef 100644 --- 
a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -80,7 +80,7 @@ def execute_rollouts( elif playback_mode: logger.info(f"🎬 Playback mode: Using recorded data from {playback_file}") else: - logger.info(f"🚀 Live mode: No recording/playback") + logger.info("🚀 Live mode: No recording/playback") # Initialize OpenAI format logging for terminated trajectories only openai_logger = None @@ -285,7 +285,6 @@ async def _execute_rollout( # Execute each tool call sequentially for tool_call in tool_calls: - # Execute tool call for this environment observation, reward, env_end, info = await envs.step(rollout_idx, tool_call) diff --git a/eval_protocol/mcp/execution/policy.py b/eval_protocol/mcp/execution/policy.py index f529a21d..c7f284f7 100644 --- a/eval_protocol/mcp/execution/policy.py +++ b/eval_protocol/mcp/execution/policy.py @@ -72,7 +72,7 @@ def __init__( self._setup_litellm_caching(use_caching, cache_type, redis_url) logger.info(f"✅ Initialized LiteLLM policy: {self.model_id}") else: - logger.info(f"🎬 Playback mode: Skipping LiteLLM initialization for performance") + logger.info("🎬 Playback mode: Skipping LiteLLM initialization for performance") def _setup_litellm_caching( self, use_caching: bool, cache_type: Literal["memory", "redis", "dual", "s3", "disk"], redis_url: Optional[str] diff --git a/eval_protocol/mcp/mcpgym.py b/eval_protocol/mcp/mcpgym.py index cf942a0f..fb8d8caa 100644 --- a/eval_protocol/mcp/mcpgym.py +++ b/eval_protocol/mcp/mcpgym.py @@ -141,7 +141,7 @@ def _get_session_id(self, ctx: Context) -> str: Creates stable session IDs based on client info (seed + config + client details) for consistent session management across reconnections. 
""" - print(f"🔍 _get_session_id: Starting session ID extraction") + print("🔍 _get_session_id: Starting session ID extraction") print(f"🔍 _get_session_id: ctx type: {type(ctx)}") print(f"🔍 _get_session_id: hasattr(ctx, 'session'): {hasattr(ctx, 'session')}") @@ -208,7 +208,6 @@ def _get_or_create_session(self, ctx: Context) -> Dict[str, Any]: return self.sessions[session_id] def _register_session_reset_endpoint(self): - @self.mcp.custom_route("/control/reset_session", methods=["POST"]) async def reset_session_endpoint(request: Request) -> JSONResponse: session_id = request.headers.get("mcp-session-id") diff --git a/eval_protocol/mcp/session/manager.py b/eval_protocol/mcp/session/manager.py index 71c23af0..a7ae679a 100644 --- a/eval_protocol/mcp/session/manager.py +++ b/eval_protocol/mcp/session/manager.py @@ -226,4 +226,4 @@ async def close(self): print(f"🧹 Closing {self.n} MCP sessions...") tasks = [self.connection_manager.close_session(session) for session in self.sessions] await asyncio.gather(*tasks) - print(f"✅ All MCP sessions closed.") + print("✅ All MCP sessions closed.") diff --git a/eval_protocol/mcp/simulation_server.py b/eval_protocol/mcp/simulation_server.py index f18bb0cf..7dfc11eb 100644 --- a/eval_protocol/mcp/simulation_server.py +++ b/eval_protocol/mcp/simulation_server.py @@ -398,7 +398,7 @@ def run(self, port: int = 8000, host: str = "127.0.0.1", **kwargs): host: Host to bind to **kwargs: Additional arguments for uvicorn """ - print(f"📡 Starting simulation server with StreamableHTTPSessionManager") + print("📡 Starting simulation server with StreamableHTTPSessionManager") print(f"🎮 Domain tools: {list(self._domain_tools.keys())}") print(f"📦 Domain resources: {list(self._domain_resources.keys())}") if self.production_server_app: diff --git a/eval_protocol/mcp_agent/intermediary_server.py b/eval_protocol/mcp_agent/intermediary_server.py index 7aa3f399..368f0232 100644 --- a/eval_protocol/mcp_agent/intermediary_server.py +++ 
b/eval_protocol/mcp_agent/intermediary_server.py @@ -23,8 +23,7 @@ logger = logging.getLogger(__name__) # logger.setLevel(logging.DEBUG) # Removed: Let level be set by main config -from mcp.server.fastmcp.server import Context as FastMCPContext -from mcp.server.fastmcp.server import FastMCP +from mcp.server.fastmcp.server import Context as FastMCPContext, FastMCP # RequestContext is not directly used by handlers anymore, mcp_ctx is. diff --git a/eval_protocol/mcp_agent/orchestration/local_docker_client.py b/eval_protocol/mcp_agent/orchestration/local_docker_client.py index 56a7c6ba..58c098dd 100644 --- a/eval_protocol/mcp_agent/orchestration/local_docker_client.py +++ b/eval_protocol/mcp_agent/orchestration/local_docker_client.py @@ -198,7 +198,7 @@ async def _perform_startup_check(self, url: str, check: Dict[str, Any]) -> bool: res.raise_for_status() return True except Exception as e: - logger.warning(f"Startup check fail {attempt+1}/5: {e}") + logger.warning(f"Startup check fail {attempt + 1}/5: {e}") if attempt < 4: await asyncio.sleep(2) return False @@ -233,7 +233,6 @@ async def provision_instances( and (template_details or backend_config.template_data_path_host) and backend_config.container_template_data_path ): - host_path_for_commit = template_details or backend_config.template_data_path_host if not host_path_for_commit or not backend_config.container_template_data_path: raise ValueError( diff --git a/eval_protocol/mcp_agent/session.py b/eval_protocol/mcp_agent/session.py index d126dc50..a4e91550 100644 --- a/eval_protocol/mcp_agent/session.py +++ b/eval_protocol/mcp_agent/session.py @@ -12,8 +12,10 @@ # Option 1: Try mcp.server.transport # from mcp.server.transport import ReadStream, WriteStream # Option 2: If not found, use typing.Any as a fallback for type hints -from typing import Any as ReadStream # Fallback if specific types are not found -from typing import Any as WriteStream +from typing import ( + Any as ReadStream, # Fallback if specific types are 
not found + Any as WriteStream, +) from mcp.server.session import ServerSession # Correct base class diff --git a/eval_protocol/platform_api.py b/eval_protocol/platform_api.py index efea26a7..c5c4d62e 100644 --- a/eval_protocol/platform_api.py +++ b/eval_protocol/platform_api.py @@ -290,7 +290,7 @@ def delete_fireworks_secret( test_api_key = get_fireworks_api_key() # Not passed directly, functions will resolve test_api_base = get_fireworks_api_base() - logger.info(f"Attempting to use the following configuration for testing Fireworks secrets API:") + logger.info("Attempting to use the following configuration for testing Fireworks secrets API:") logger.info(f" Resolved FIREWORKS_ACCOUNT_ID: {test_account_id}") logger.info(f" Resolved FIREWORKS_API_BASE: {test_api_base}") logger.info( diff --git a/eval_protocol/pytest/default_dataset_adapter.py b/eval_protocol/pytest/default_dataset_adapter.py index 87377cff..7c4a7d73 100644 --- a/eval_protocol/pytest/default_dataset_adapter.py +++ b/eval_protocol/pytest/default_dataset_adapter.py @@ -7,4 +7,4 @@ def default_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: """ Default dataset adapter that simply returns the rows as is. """ - return [EvaluationRow(**row) for row in rows] \ No newline at end of file + return [EvaluationRow(**row) for row in rows] diff --git a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py index b7376e9c..e9bbc1e4 100644 --- a/eval_protocol/pytest/default_mcp_gym_rollout_processor.py +++ b/eval_protocol/pytest/default_mcp_gym_rollout_processor.py @@ -86,12 +86,12 @@ def start(self) -> None: try: with open(self._log_file_path, "r") as f: log_content = f.read() - print(f"❌ Server failed to start!") + print("❌ Server failed to start!") print(f"📋 Server log ({self._log_file_path}):") print("=" * 50) print(log_content) print("=" * 50) - raise RuntimeError(f"Server failed to start or become ready. 
Check log above for details.") + raise RuntimeError("Server failed to start or become ready. Check log above for details.") except Exception as e: stdout, stderr = self.process.communicate() raise RuntimeError(f"Server failed to start or become ready. stderr: {stderr}, log error: {e}") @@ -108,7 +108,7 @@ def _wait_for_server_ready(self, timeout: int = 15) -> bool: while time.time() - start_time < timeout: # Check if process is still running if self.process.poll() is not None: - print(f"Server process exited early") + print("Server process exited early") return False try: diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 38f66d54..c5717113 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -590,13 +590,13 @@ async def _execute_with_semaphore(row): # Check threshold after logging if threshold is not None and not passed: - assert ( - agg_score >= threshold.success - ), f"Aggregated score {agg_score:.3f} below threshold {threshold.success}" + assert agg_score >= threshold.success, ( + f"Aggregated score {agg_score:.3f} below threshold {threshold.success}" + ) if threshold.standard_deviation is not None: - assert ( - score_std <= threshold.standard_deviation - ), f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}" + assert score_std <= threshold.standard_deviation, ( + f"Standard deviation {score_std:.3f} above threshold {threshold.standard_deviation}" + ) except AssertionError: _log_eval_error("finished", data if "data" in locals() else None, passed=False) @@ -944,9 +944,9 @@ def run_evaluation_test_direct( pass if passed_threshold is not None and not passed: - assert ( - agg_score >= passed_threshold.success - ), f"Aggregated score {agg_score:.3f} below threshold {passed_threshold}" + assert agg_score >= passed_threshold.success, ( + f"Aggregated score {agg_score:.3f} below threshold {passed_threshold}" + ) return {"summary": 
summary_obj, "results": all_results} except Exception: diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 186f7c7c..617b9e85 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -216,7 +216,9 @@ def generate_parameter_combinations( else: messages = [None] # type: ignore - kwargs: List[Optional[EvaluationInputParam]] = evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] # type: ignore + kwargs: List[Optional[EvaluationInputParam]] = ( + evaluation_test_kwargs if evaluation_test_kwargs is not None else [None] + ) # type: ignore # Generate all combinations for ds in datasets: @@ -255,9 +257,9 @@ async def retry_handler(failed_row: EvaluationRow): current_attempts = retry_counts.get(rollout_id, 0) if current_attempts >= max_retry: - assert ( - failed_row.rollout_status and failed_row.rollout_status.status == "error" - ), f"Rollout {failed_row.execution_metadata.rollout_id} did not fail with error status" + assert failed_row.rollout_status and failed_row.rollout_status.status == "error", ( + f"Rollout {failed_row.execution_metadata.rollout_id} did not fail with error status" + ) failed_permanently.append(failed_row) await queue.put(failed_row) # put failed row on queue return diff --git a/eval_protocol/resources.py b/eval_protocol/resources.py index 3ca63aaf..d34c1a05 100644 --- a/eval_protocol/resources.py +++ b/eval_protocol/resources.py @@ -57,7 +57,7 @@ def setup(self) -> None: return try: - logger.debug(f"Setting up LLM deployment for model: " f"{self.llm_instance.model}") + logger.debug(f"Setting up LLM deployment for model: {self.llm_instance.model}") # For on-demand deployments, call apply() if hasattr(self.llm_instance, "deployment_type") and self.llm_instance.deployment_type == "on-demand": @@ -68,7 +68,7 @@ def setup(self) -> None: self._client = self.llm_instance self._is_setup = True - logger.info(f"LLM resource setup completed for model: " f"{self.llm_instance.model}") + 
logger.info(f"LLM resource setup completed for model: {self.llm_instance.model}") except Exception as e: logger.error(f"Failed to setup LLM resource: {e}") diff --git a/eval_protocol/rewards/accuracy_length.py b/eval_protocol/rewards/accuracy_length.py index 310353f5..b8e64eb2 100644 --- a/eval_protocol/rewards/accuracy_length.py +++ b/eval_protocol/rewards/accuracy_length.py @@ -132,7 +132,7 @@ def cosine_scaled_accuracy_length_reward( # Prepare detailed reason reward_type = "reward" if accuracy_success else "penalty" length_reason = ( - f"Length-based {reward_type}: {token_count}/{max_length} tokens, " f"cosine factor: {cosine_factor:.2f}" + f"Length-based {reward_type}: {token_count}/{max_length} tokens, cosine factor: {cosine_factor:.2f}" ) combined_reason = ( diff --git a/eval_protocol/rewards/apps_coding_reward.py b/eval_protocol/rewards/apps_coding_reward.py index 3089f56e..6cbd63b9 100644 --- a/eval_protocol/rewards/apps_coding_reward.py +++ b/eval_protocol/rewards/apps_coding_reward.py @@ -244,7 +244,7 @@ def evaluate_apps_solution(messages: List[Message], ground_truth: Optional[str], # as our system prompt now asks for a main() that handles IO. # The generated code itself should be a runnable script. 
del in_outs_for_check["fn_name"] - logger.info(f"Removed 'fn_name' from in_outs for check_correctness to use standard_input path.") + logger.info("Removed 'fn_name' from in_outs for check_correctness to use standard_input path.") final_code_to_execute = code_solution # The model's full response (after extraction) diff --git a/eval_protocol/rewards/apps_execution_utils.py b/eval_protocol/rewards/apps_execution_utils.py index c40baffe..74e3fbfa 100644 --- a/eval_protocol/rewards/apps_execution_utils.py +++ b/eval_protocol/rewards/apps_execution_utils.py @@ -47,7 +47,7 @@ def _temp_run( # Temporarily disable stdout/stderr redirection to see debug prints from run_test # sys.stdout = open(os.devnull, "w") # sys.stderr = open(os.devnull, "w") - print(f"[_temp_run] Executing run_test for sample. Debug prints from run_test should be visible.") + print("[_temp_run] Executing run_test for sample. Debug prints from run_test should be visible.") try: res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout) diff --git a/eval_protocol/rewards/apps_testing_util.py b/eval_protocol/rewards/apps_testing_util.py index 0c349797..84f13f8a 100644 --- a/eval_protocol/rewards/apps_testing_util.py +++ b/eval_protocol/rewards/apps_testing_util.py @@ -109,7 +109,7 @@ def _load_module_from_string(module_name, code_string): try: exec(code_string, module.__dict__) # sys.modules[module_name] = module # Optional: if other parts of the code expect it in sys.modules - except Exception as e: + except Exception: raise return module diff --git a/eval_protocol/rewards/bfcl_reward.py b/eval_protocol/rewards/bfcl_reward.py index 44053ad8..ccf1cda4 100644 --- a/eval_protocol/rewards/bfcl_reward.py +++ b/eval_protocol/rewards/bfcl_reward.py @@ -286,7 +286,7 @@ def bfcl_reward( final_score = 0.0 reason = "State or function calls did not perfectly match ground truth." if state_match_score < 0.5: - reason += f" State match failed." + reason += " State match failed." 
if state_diffs: reason += f" Differences: {json.dumps(state_diffs)}" if func_match_score < 0.5: # Check against 0.5 as perfect score for this component diff --git a/eval_protocol/rewards/code_execution.py b/eval_protocol/rewards/code_execution.py index 6bfbe8d5..52db7db2 100644 --- a/eval_protocol/rewards/code_execution.py +++ b/eval_protocol/rewards/code_execution.py @@ -276,7 +276,6 @@ def _execute_code_in_process(execute_func: Callable, args: Tuple, timeout: int = Returns: Dictionary with execution results """ - import multiprocessing manager = multiprocessing.Manager() result_dict = manager.dict() diff --git a/eval_protocol/rewards/cpp_code.py b/eval_protocol/rewards/cpp_code.py index 9511f0e9..cb324273 100644 --- a/eval_protocol/rewards/cpp_code.py +++ b/eval_protocol/rewards/cpp_code.py @@ -484,7 +484,7 @@ async def run_cpp_test_cases( for i, test_case in enumerate(test_cases): test_input = test_case.get("input", "") expected_output = test_case.get("expected_output", "") - test_name = test_case.get("name", f"Test {i+1}") + test_name = test_case.get("name", f"Test {i + 1}") execution_result = await execute_cpp_code( code=code, diff --git a/eval_protocol/rewards/deepcoder_reward.py b/eval_protocol/rewards/deepcoder_reward.py index d0674dd7..ebdc44bb 100644 --- a/eval_protocol/rewards/deepcoder_reward.py +++ b/eval_protocol/rewards/deepcoder_reward.py @@ -9,9 +9,9 @@ from ..models import EvaluateResult, Message, MetricResult from ..reward_function import reward_function -from .code_execution import _HAS_E2B # Import _HAS_E2B to check E2B availability -from .code_execution import _run_test_cases # Import the main test case runner from .code_execution import ( + _HAS_E2B, # Import _HAS_E2B to check E2B availability + _run_test_cases, # Import the main test case runner compare_outputs, execute_code_with_e2b, execute_javascript_code, diff --git a/eval_protocol/rewards/language_consistency.py b/eval_protocol/rewards/language_consistency.py index 174f137e..bfdd2052 
100644 --- a/eval_protocol/rewards/language_consistency.py +++ b/eval_protocol/rewards/language_consistency.py @@ -646,7 +646,7 @@ def language_consistency_reward( if total_counted == 0: return EvaluateResult( score=0.0, - reason=f"No language markers found in model response to evaluate.", + reason="No language markers found in model response to evaluate.", metrics={ "language_consistency": MetricResult( score=0.0, diff --git a/eval_protocol/rewards/tag_count.py b/eval_protocol/rewards/tag_count.py index 1d0e04e7..83acef6f 100644 --- a/eval_protocol/rewards/tag_count.py +++ b/eval_protocol/rewards/tag_count.py @@ -126,9 +126,9 @@ def _get_tag_reason(tag: str, opening_count: int, closing_count: int, require_ba return f"Found {opening_count} balanced '{tag}' tag(s)" else: if require_balanced: - return f"Unbalanced tags: {opening_count} opening vs " f"{closing_count} closing '{tag}' tags" + return f"Unbalanced tags: {opening_count} opening vs {closing_count} closing '{tag}' tags" else: - return f"Found '{tag}' tags (unbalanced: {opening_count} opening, " f"{closing_count} closing)" + return f"Found '{tag}' tags (unbalanced: {opening_count} opening, {closing_count} closing)" def _get_overall_reason( diff --git a/eval_protocol/stats/__init__.py b/eval_protocol/stats/__init__.py index c327d2ed..6cf5a888 100644 --- a/eval_protocol/stats/__init__.py +++ b/eval_protocol/stats/__init__.py @@ -1,5 +1,3 @@ """Statistical utilities for evaluation reporting (confidence intervals, etc.).""" from .confidence_intervals import compute_fixed_set_mu_ci # re-export - - diff --git a/eval_protocol/stats/confidence_intervals.py b/eval_protocol/stats/confidence_intervals.py index bf78934c..70a4dd2d 100644 --- a/eval_protocol/stats/confidence_intervals.py +++ b/eval_protocol/stats/confidence_intervals.py @@ -112,5 +112,3 @@ def compute_fixed_set_mu_ci( ci_high = min(1.0, mu_hat + margin) return float(mu_hat), float(ci_low), float(ci_high) - - diff --git a/eval_protocol/utils/logs_server.py 
b/eval_protocol/utils/logs_server.py index 46630cdf..e5e6e4a3 100644 --- a/eval_protocol/utils/logs_server.py +++ b/eval_protocol/utils/logs_server.py @@ -195,7 +195,6 @@ def _should_update_status(self, row: "EvaluationRow") -> bool: """Check if a row's status should be updated to 'stopped'.""" # Check if the row has running status and a PID if row.eval_metadata and row.eval_metadata.status == "running" and row.pid is not None: - # Check if the process is still running try: process = psutil.Process(row.pid) diff --git a/examples/adapters/README.md b/examples/adapters/README.md index 591bbbb2..4b8501ef 100644 --- a/examples/adapters/README.md +++ b/examples/adapters/README.md @@ -43,34 +43,6 @@ Loads datasets from HuggingFace Hub and converts them to EvaluationRow format. pip install 'eval-protocol[huggingface]' ``` -### 3. BigQuery Adapter (`bigquery_example.py`) - -Queries data from Google BigQuery tables and converts them to EvaluationRow format. - -**Features:** -- Execute custom SQL queries against BigQuery datasets -- Support for parameterized queries and batch processing -- Built-in convenience adapters for conversation and Q&A data -- Rich metadata preservation including query information -- Integration with Google Cloud authentication -- Schema introspection and dataset exploration - -**Prerequisites:** -```bash -pip install 'eval-protocol[bigquery]' -``` - -**Environment Variables:** -```bash -export GOOGLE_CLOUD_PROJECT="your-project-id" -export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json" # optional -``` - -**Alternative Authentication:** -```bash -gcloud auth application-default login -``` - ## Running the Examples ### Basic Usage @@ -82,9 +54,6 @@ python examples/adapters/langfuse_example.py # Run HuggingFace example python examples/adapters/huggingface_example.py -# Run BigQuery example -python examples/adapters/bigquery_example.py - # Run GSM8K replacement example python examples/adapters/gsm8k_replacement_example.py ``` @@ -97,11 
+66,6 @@ export LANGFUSE_PUBLIC_KEY="pk_..." export LANGFUSE_SECRET_KEY="sk_..." python examples/adapters/langfuse_example.py -# Set up Google Cloud credentials for BigQuery -export GOOGLE_CLOUD_PROJECT="your-project-id" -export GOOGLE_APPLICATION_CREDENTIALS="/path/to/service-account.json" # optional -python examples/adapters/bigquery_example.py - # HuggingFace works without credentials for public datasets python examples/adapters/huggingface_example.py ``` diff --git a/examples/adapters/gsm8k_replacement_example.py b/examples/adapters/gsm8k_replacement_example.py index a86de261..3c18775e 100644 --- a/examples/adapters/gsm8k_replacement_example.py +++ b/examples/adapters/gsm8k_replacement_example.py @@ -1,8 +1,8 @@ """ GSM8K Replacement Example -This example shows how to replace the static GSM8K JSONL file -(development/gsm8k_sample.jsonl) with the dynamic HuggingFace adapter +This example shows how to replace the static GSM8K JSONL file +(development/gsm8k_sample.jsonl) with the dynamic HuggingFace adapter to get fresh data from the GSM8K dataset. 
""" @@ -18,17 +18,17 @@ def load_original_gsm8k_sample() -> List[dict]: """Load the original GSM8K sample file for comparison.""" sample_file = Path("development/gsm8k_sample.jsonl") - + if not sample_file.exists(): print(f"⚠️ Original sample file not found: {sample_file}") return [] - + data = [] - with open(sample_file, 'r') as f: + with open(sample_file, "r") as f: for line in f: if line.strip(): data.append(json.loads(line)) - + return data @@ -36,52 +36,52 @@ def demonstrate_old_vs_new_approach(): """Compare the old static file approach with the new adapter approach.""" print("📊 Comparing Old vs New Approach") print("=" * 50) - + # OLD APPROACH: Static JSONL file print("🗂️ OLD APPROACH: Static JSONL File") print("-" * 35) - + original_data = load_original_gsm8k_sample() print(f"Loaded {len(original_data)} items from static file") - + if original_data: sample = original_data[0] print(f"Sample item fields: {list(sample.keys())}") print(f"Sample question: {sample.get('user_query', '')[:100]}...") print(f"Sample ground truth: {sample.get('ground_truth_for_eval', '')[:100]}...") - - print("\n" + "="*50 + "\n") - + + print("\n" + "=" * 50 + "\n") + # NEW APPROACH: HuggingFace Adapter print("🤗 NEW APPROACH: HuggingFace Adapter") print("-" * 38) - + try: # Create adapter adapter = create_gsm8k_adapter( system_prompt="You are a helpful assistant that solves math problems step by step." 
) - + print("✅ GSM8K adapter created successfully") - + # Get the same number of items as the original file num_items = len(original_data) if original_data else 6 rows = list(adapter.get_evaluation_rows(limit=num_items)) - + print(f"Retrieved {len(rows)} evaluation rows from HuggingFace") - + if rows: sample_row = rows[0] - print(f"Sample EvaluationRow fields: messages, tools, input_metadata, ground_truth") - + print("Sample EvaluationRow fields: messages, tools, input_metadata, ground_truth") + # Show the question from messages user_msg = next((msg for msg in sample_row.messages if msg.role == "user"), None) if user_msg: print(f"Sample question: {user_msg.content[:100]}...") - + if sample_row.ground_truth: print(f"Sample ground truth: {sample_row.ground_truth[:100]}...") - + except ImportError as e: print(f"❌ Error: {e}") print("Install HuggingFace dependencies: pip install 'eval-protocol[huggingface]'") @@ -89,9 +89,9 @@ def demonstrate_old_vs_new_approach(): except Exception as e: print(f"❌ Error with adapter: {e}") return - - print("\n" + "="*50 + "\n") - + + print("\n" + "=" * 50 + "\n") + # COMPARISON print("🔍 Key Differences") print("-" * 20) @@ -101,7 +101,7 @@ def demonstrate_old_vs_new_approach(): print(" ❌ Manual data preparation required") print(" ❌ Limited to pre-selected subset") print(" ❌ Requires manual format conversion") - + print("\nNEW APPROACH:") print(" ✅ Access to full GSM8K dataset (8,792 test problems)") print(" ✅ Automatic format conversion to EvaluationRow") @@ -115,10 +115,11 @@ def show_migration_example(): """Show how to migrate existing code from JSONL to adapter.""" print("\n🔄 Code Migration Example") print("=" * 30) - + print("OLD CODE:") print("-" * 10) - print(""" + print( + """ # Old way with static JSONL file input_dataset = ["development/gsm8k_sample.jsonl"] @@ -134,11 +135,13 @@ def show_migration_example(): ] ground_truth = item["ground_truth_for_eval"] # ... 
more manual processing -""") - +""" + ) + print("\nNEW CODE:") print("-" * 10) - print(""" + print( + """ # New way with HuggingFace adapter from eval_protocol.adapters.huggingface import create_gsm8k_adapter @@ -149,7 +152,7 @@ def show_migration_example(): # Get evaluation rows (already in correct format) evaluation_rows = list(adapter.get_evaluation_rows( - split="test", # or "train" + split="test", # or "train" limit=100, # Can get much more data than static file model_name="gpt-4", temperature=0.0, @@ -175,8 +178,9 @@ def custom_gsm8k_transform(row): config_name="main", transform_fn=custom_gsm8k_transform ) -""") - +""" + ) + print("\n✅ Benefits of Migration:") print(" - More data available (6 → 8,792 problems)") print(" - Automatic format handling") @@ -189,30 +193,30 @@ def practical_migration_demo(): """Show a practical example of using the adapter in evaluation.""" print("\n🧪 Practical Evaluation Example") print("=" * 35) - + try: # Create adapter adapter = create_gsm8k_adapter() - + # Get a few problems for evaluation print("Loading GSM8K problems...") rows = list(adapter.get_evaluation_rows(limit=3)) print(f"✅ Loaded {len(rows)} problems from GSM8K test set") - + # Simulate evaluation workflow for i, row in enumerate(rows): - print(f"\n📝 Problem {i+1}:") - + print(f"\n📝 Problem {i + 1}:") + # Show the problem user_msg = next((msg for msg in row.messages if msg.role == "user"), None) if user_msg: print(f" Question: {user_msg.content[:150]}...") - + # In a real scenario, you'd generate a response with your LLM # For this demo, we'll add a dummy response dummy_response = "Let me solve this step by step. After working through the math, the answer is 42." 
row.messages.append(Message(role="assistant", content=dummy_response)) - + # Evaluate with math reward function if row.ground_truth: try: @@ -222,7 +226,7 @@ def practical_migration_demo(): ) print(f" 📊 Math evaluation score: {result.score:.2f}") print(f" 💭 Evaluation reason: {result.reason[:100]}...") - + # Show metadata if row.input_metadata: print(f" 🏷️ Row ID: {row.input_metadata.row_id}") @@ -230,12 +234,12 @@ def practical_migration_demo(): dataset_info = row.input_metadata.dataset_info print(f" 📚 Dataset: {dataset_info.get('dataset_name', 'N/A')}") print(f" 📍 Row index: {dataset_info.get('row_index', 'N/A')}") - + except Exception as e: print(f" ❌ Evaluation error: {e}") - + print(f"\n✅ Successfully processed {len(rows)} problems using the new adapter approach!") - + except Exception as e: print(f"❌ Error in practical demo: {e}") @@ -244,9 +248,9 @@ def performance_comparison(): """Compare performance characteristics of both approaches.""" print("\n⚡ Performance Considerations") print("=" * 35) - + import time - + # Time the old approach (if file exists) original_data = load_original_gsm8k_sample() if original_data: @@ -259,7 +263,7 @@ def performance_comparison(): print("📁 Static file not available for timing") old_time = 0 processed_old = 0 - + # Time the new approach try: start_time = time.time() @@ -267,9 +271,9 @@ def performance_comparison(): rows = list(adapter.get_evaluation_rows(split="test", limit=max(6, processed_old))) new_time = time.time() - start_time processed_new = len(rows) - + print(f"🤗 HuggingFace adapter: {processed_new} items in {new_time:.4f}s") - + if old_time > 0: if new_time > old_time: factor = new_time / old_time @@ -277,11 +281,11 @@ def performance_comparison(): else: factor = old_time / new_time print(f" 📊 Adapter is {factor:.1f}x faster!") - - print(f"\n💡 Trade-offs:") + + print("\n💡 Trade-offs:") print(f" Static file: Fast ({old_time:.4f}s) but limited data ({processed_old} items)") print(f" Adapter: Slower ({new_time:.4f}s) 
but access to full dataset ({processed_new}+ items)") - + except Exception as e: print(f"❌ Error timing adapter: {e}") @@ -293,16 +297,16 @@ def main(): print("This example shows how to replace the static GSM8K JSONL file") print("with the dynamic HuggingFace adapter for better data access.") print() - + # Run all demonstrations demonstrate_old_vs_new_approach() show_migration_example() practical_migration_demo() performance_comparison() - - print("\n" + "="*50) + + print("\n" + "=" * 50) print("🎯 MIGRATION SUMMARY") - print("="*50) + print("=" * 50) print("1. ✅ Replace static JSONL with HuggingFace adapter") print("2. ✅ Get access to full GSM8K dataset (8,792 test problems)") print("3. ✅ Automatic conversion to EvaluationRow format") @@ -318,4 +322,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/adapters/huggingface_example.py b/examples/adapters/huggingface_example.py index 2d79eae3..c4f1d907 100644 --- a/examples/adapters/huggingface_example.py +++ b/examples/adapters/huggingface_example.py @@ -9,10 +9,10 @@ from typing import List from eval_protocol.adapters.huggingface import ( - create_huggingface_adapter, + HuggingFaceAdapter, create_gsm8k_adapter, + create_huggingface_adapter, create_math_adapter, - HuggingFaceAdapter, ) from eval_protocol.models import EvaluationRow @@ -21,44 +21,47 @@ def gsm8k_example(): """Example using the GSM8K dataset.""" print("📚 Example 1: GSM8K Dataset") print("-" * 30) - + try: # Create GSM8K adapter using the convenience method adapter = create_gsm8k_adapter( - split="test", - system_prompt="You are a helpful assistant that solves math problems step by step." + split="test", system_prompt="You are a helpful assistant that solves math problems step by step." 
) - + print("✅ GSM8K adapter created successfully") print(f"📊 Dataset info: {adapter.get_dataset_info()}") - + # Get a few evaluation rows - rows = list(adapter.get_evaluation_rows( - limit=3, - model_name="gpt-4", - temperature=0.0, - )) - + rows = list( + adapter.get_evaluation_rows( + limit=3, + model_name="gpt-4", + temperature=0.0, + ) + ) + print(f"\nRetrieved {len(rows)} evaluation rows from GSM8K test set:") - + for i, row in enumerate(rows): - print(f"\n Row {i+1}:") + print(f"\n Row {i + 1}:") print(f" - ID: {row.input_metadata.row_id if row.input_metadata else 'N/A'}") print(f" - Messages: {len(row.messages)}") - + # Show the math problem user_message = next((msg for msg in row.messages if msg.role == "user"), None) if user_message: - problem = user_message.content[:200] + "..." if len(user_message.content) > 200 else user_message.content + problem = ( + user_message.content[:200] + "..." if len(user_message.content) > 200 else user_message.content + ) print(f" - Problem: {problem}") - + # Show ground truth answer if row.ground_truth: answer_preview = row.ground_truth[:100] + "..." if len(row.ground_truth) > 100 else row.ground_truth print(f" - Ground truth: {answer_preview}") - + print() - + except ImportError as e: print(f"❌ Error: {e}") print("Install HuggingFace dependencies: pip install 'eval-protocol[huggingface]'") @@ -70,42 +73,44 @@ def math_dataset_example(): """Example using the MATH competition dataset.""" print("🧮 Example 2: MATH Competition Dataset") print("-" * 40) - + try: # Create MATH dataset adapter - adapter = create_math_adapter( - system_prompt="You are an expert mathematician. Solve this step by step." - ) - + adapter = create_math_adapter(system_prompt="You are an expert mathematician. 
Solve this step by step.") + print("✅ MATH dataset adapter created successfully") print(f"📊 Dataset info: {adapter.get_dataset_info()}") - + # Get a few examples - rows = list(adapter.get_evaluation_rows( - limit=2, - model_name="gpt-4", - temperature=0.1, - )) - + rows = list( + adapter.get_evaluation_rows( + limit=2, + model_name="gpt-4", + temperature=0.1, + ) + ) + print(f"\nRetrieved {len(rows)} evaluation rows from MATH test set:") - + for i, row in enumerate(rows): - print(f"\n Row {i+1}:") - + print(f"\n Row {i + 1}:") + # Show the problem user_message = next((msg for msg in row.messages if msg.role == "user"), None) if user_message: - problem = user_message.content[:150] + "..." if len(user_message.content) > 150 else user_message.content + problem = ( + user_message.content[:150] + "..." if len(user_message.content) > 150 else user_message.content + ) print(f" - Problem: {problem}") - + # Show metadata if row.input_metadata and row.input_metadata.dataset_info: dataset_info = row.input_metadata.dataset_info - if 'original_type' in dataset_info: + if "original_type" in dataset_info: print(f" - Problem type: {dataset_info['original_type']}") - if 'original_level' in dataset_info: + if "original_level" in dataset_info: print(f" - Level: {dataset_info['original_level']}") - + except Exception as e: print(f"❌ Error with MATH dataset: {e}") @@ -114,66 +119,70 @@ def custom_dataset_example(): """Example using a custom dataset with transformation function.""" print("🔧 Example 3: Custom Dataset with Transform Function") print("-" * 55) - + try: # Define transformation function for SQuAD dataset def squad_transform(row): """Transform SQuAD row to evaluation format.""" - context = row['context'] - question = row['question'] - answers = row['answers'] - + context = row["context"] + question = row["question"] + answers = row["answers"] + # Get first answer text - answer_text = answers['text'][0] if answers['text'] else "No answer provided" - + answer_text = 
answers["text"][0] if answers["text"] else "No answer provided" + return { - 'messages': [ - {'role': 'system', 'content': 'Answer the question based on the given context.'}, - {'role': 'user', 'content': f"Context: {context}\\n\\nQuestion: {question}"}, + "messages": [ + {"role": "system", "content": "Answer the question based on the given context."}, + {"role": "user", "content": f"Context: {context}\\n\\nQuestion: {question}"}, ], - 'ground_truth': answer_text, - 'metadata': { - 'dataset': 'squad', - 'context_length': len(context), - 'question_length': len(question), - 'num_possible_answers': len(answers['text']), - } + "ground_truth": answer_text, + "metadata": { + "dataset": "squad", + "context_length": len(context), + "question_length": len(question), + "num_possible_answers": len(answers["text"]), + }, } - + # Create adapter with transformation function adapter = create_huggingface_adapter( dataset_id="squad", transform_fn=squad_transform, ) - + print("✅ Custom dataset adapter created successfully") - + # Get dataset info info = adapter.get_dataset_info() print(f"📊 Dataset info: {info}") - + # Get a few examples - rows = list(adapter.get_evaluation_rows( - split="validation", # SQuAD has train/validation splits - limit=2, - model_name="gpt-3.5-turbo", - )) - + rows = list( + adapter.get_evaluation_rows( + split="validation", # SQuAD has train/validation splits + limit=2, + model_name="gpt-3.5-turbo", + ) + ) + print(f"\nRetrieved {len(rows)} evaluation rows:") - + for i, row in enumerate(rows): - print(f"\n Row {i+1}:") + print(f"\n Row {i + 1}:") print(f" - Messages: {len(row.messages)}") - + # Show question user_message = next((msg for msg in row.messages if msg.role == "user"), None) if user_message: - question = user_message.content[:100] + "..." if len(user_message.content) > 100 else user_message.content + question = ( + user_message.content[:100] + "..." 
if len(user_message.content) > 100 else user_message.content + ) print(f" - Question: {question}") - + # SQuAD answers are complex, so just show if we have ground truth print(f" - Has ground truth: {'Yes' if row.ground_truth else 'No'}") - + except Exception as e: print(f"❌ Error with custom dataset: {e}") @@ -182,93 +191,85 @@ def local_file_example(): """Example loading a local dataset file.""" print("📁 Example 4: Local Dataset File") print("-" * 35) - + # Create a sample JSONL file for demonstration sample_file = "/tmp/sample_qa.jsonl" sample_data = [ - { - "id": "q1", - "question": "What is the capital of France?", - "answer": "Paris", - "category": "geography" - }, - { - "id": "q2", - "question": "What is 2 + 2?", - "answer": "4", - "category": "math" - }, + {"id": "q1", "question": "What is the capital of France?", "answer": "Paris", "category": "geography"}, + {"id": "q2", "question": "What is 2 + 2?", "answer": "4", "category": "math"}, { "id": "q3", "question": "Who wrote Romeo and Juliet?", "answer": "William Shakespeare", - "category": "literature" - } + "category": "literature", + }, ] - + try: import json - + # Write sample data - with open(sample_file, 'w') as f: + with open(sample_file, "w") as f: for item in sample_data: - f.write(json.dumps(item) + '\n') - + f.write(json.dumps(item) + "\n") + print(f"📝 Created sample file: {sample_file}") - + # Define transformation function for local data def local_qa_transform(row): - """Transform local Q&A data to evaluation format.""" + """Transform local Q&A data to evaluation format.""" return { - 'messages': [ - {'role': 'system', 'content': 'You are a knowledgeable assistant.'}, - {'role': 'user', 'content': row['question']}, + "messages": [ + {"role": "system", "content": "You are a knowledgeable assistant."}, + {"role": "user", "content": row["question"]}, ], - 'ground_truth': row['answer'], - 'metadata': { - 'id': row.get('id'), - 'category': row.get('category'), - 'dataset': 'local_qa_sample', - } + 
"ground_truth": row["answer"], + "metadata": { + "id": row.get("id"), + "category": row.get("category"), + "dataset": "local_qa_sample", + }, } - + # Load with adapter adapter = HuggingFaceAdapter.from_local( path=sample_file, transform_fn=local_qa_transform, ) - + print("✅ Local file adapter created successfully") - + # Get all rows - rows = list(adapter.get_evaluation_rows( - model_name="gpt-3.5-turbo", - temperature=0.0, - )) - + rows = list( + adapter.get_evaluation_rows( + model_name="gpt-3.5-turbo", + temperature=0.0, + ) + ) + print(f"\nLoaded {len(rows)} rows from local file:") - + for i, row in enumerate(rows): - print(f"\n Row {i+1}:") - + print(f"\n Row {i + 1}:") + # Show question and answer user_msg = next((msg for msg in row.messages if msg.role == "user"), None) if user_msg: print(f" - Question: {user_msg.content}") - + if row.ground_truth: print(f" - Answer: {row.ground_truth}") - + # Show original metadata if row.input_metadata and row.input_metadata.dataset_info: - original_data = {k: v for k, v in row.input_metadata.dataset_info.items() if k.startswith('original_')} + original_data = {k: v for k, v in row.input_metadata.dataset_info.items() if k.startswith("original_")} if original_data: print(f" - Original data: {original_data}") - + # Clean up os.remove(sample_file) - print(f"\n🧹 Cleaned up sample file") - + print("\n🧹 Cleaned up sample file") + except Exception as e: print(f"❌ Error with local file: {e}") @@ -277,35 +278,34 @@ def evaluation_integration_example(): """Show how to integrate with evaluation functions.""" print("\n🧪 Example 5: Integration with Evaluation") print("-" * 45) - + try: # Import evaluation functions - from eval_protocol.rewards.math import math_reward from eval_protocol.rewards.accuracy import accuracy_reward - + from eval_protocol.rewards.math import math_reward + # Create GSM8K adapter adapter = create_gsm8k_adapter(split="test") - + # Get a few rows for evaluation rows = list(adapter.get_evaluation_rows(limit=2)) - + 
print(f"Running evaluation on {len(rows)} GSM8K problems:") - + for i, row in enumerate(rows): - print(f"\n Problem {i+1}:") - + print(f"\n Problem {i + 1}:") + # Show the problem user_msg = next((msg for msg in row.messages if msg.role == "user"), None) if user_msg: print(f" Question: {user_msg.content[:100]}...") - + # For this example, we'll simulate an assistant response # In practice, this would come from your LLM - row.messages.append({ - "role": "assistant", - "content": "Let me solve this step by step... The answer is 42." - }) - + row.messages.append( + {"role": "assistant", "content": "Let me solve this step by step... The answer is 42."} + ) + # Evaluate with math reward if row.ground_truth: try: @@ -315,17 +315,17 @@ def evaluation_integration_example(): ) print(f" Math score: {math_result.score:.2f}") print(f" Reason: {math_result.reason[:100]}...") - + # Also try accuracy reward acc_result = accuracy_reward( messages=row.messages, ground_truth=row.ground_truth, ) print(f" Accuracy score: {acc_result.score:.2f}") - + except Exception as e: print(f" ❌ Evaluation error: {e}") - + except ImportError: print("Evaluation functions not available") except Exception as e: @@ -336,24 +336,26 @@ def batch_processing_example(): """Show how to process datasets in batches.""" print("\n📦 Example 6: Batch Processing") print("-" * 35) - + try: adapter = create_gsm8k_adapter(split="test") - + batch_size = 5 total_processed = 0 - + print(f"Processing GSM8K test set in batches of {batch_size}:") - + # Process in batches for batch_start in range(0, 20, batch_size): # Process first 20 items - batch_rows = list(adapter.get_evaluation_rows( - limit=batch_size, - offset=batch_start, - )) - - print(f" Batch {batch_start//batch_size + 1}: {len(batch_rows)} rows") - + batch_rows = list( + adapter.get_evaluation_rows( + limit=batch_size, + offset=batch_start, + ) + ) + + print(f" Batch {batch_start // batch_size + 1}: {len(batch_rows)} rows") + # Process each row in the batch for 
row in batch_rows: # Here you would typically: @@ -361,9 +363,9 @@ def batch_processing_example(): # 2. Evaluate the response # 3. Store results total_processed += 1 - + print(f"✅ Processed {total_processed} rows total") - + except Exception as e: print(f"❌ Error in batch processing: {e}") @@ -372,40 +374,40 @@ def main(): """Run all examples.""" print("🤗 HuggingFace Dataset Adapter Examples") print("=" * 50) - + # Run examples gsm8k_example() - print("\n" + "="*50 + "\n") - + print("\n" + "=" * 50 + "\n") + math_dataset_example() - print("\n" + "="*50 + "\n") - + print("\n" + "=" * 50 + "\n") + custom_dataset_example() - print("\n" + "="*50 + "\n") - + print("\n" + "=" * 50 + "\n") + local_file_example() - print("\n" + "="*50 + "\n") - + print("\n" + "=" * 50 + "\n") + evaluation_integration_example() - print("\n" + "="*50 + "\n") - + print("\n" + "=" * 50 + "\n") + batch_processing_example() if __name__ == "__main__": try: main() - + print("\n✅ All examples completed!") print("\nNext steps:") print("1. Choose the dataset that fits your needs") - print("2. Customize the system prompts for your use case") + print("2. Customize the system prompts for your use case") print("3. Integrate with your evaluation pipeline") print("4. Scale up to process full datasets") print("5. 
Use the EvaluationRow data for training or evaluation") - + except ImportError as e: print(f"❌ Missing dependencies: {e}") print("Install with: pip install 'eval-protocol[huggingface]'") except Exception as e: - print(f"❌ Error running examples: {e}") \ No newline at end of file + print(f"❌ Error running examples: {e}") diff --git a/examples/adapters/langfuse_example.py b/examples/adapters/langfuse_example.py index 78937c80..39fe31f4 100644 --- a/examples/adapters/langfuse_example.py +++ b/examples/adapters/langfuse_example.py @@ -15,16 +15,16 @@ def main(): """Example usage of the Langfuse adapter.""" - + # Configuration - you can set these as environment variables public_key = os.getenv("LANGFUSE_PUBLIC_KEY", "your_public_key_here") - secret_key = os.getenv("LANGFUSE_SECRET_KEY", "your_secret_key_here") + secret_key = os.getenv("LANGFUSE_SECRET_KEY", "your_secret_key_here") host = os.getenv("LANGFUSE_HOST", "https://langfuse-web-prod-zfdbl7ykrq-uc.a.run.app") project_id = os.getenv("LANGFUSE_PROJECT_ID", "cmdj5yxhk0006s6022cyi0prv") - + print(f"Connecting to Langfuse at: {host}") print(f"Project ID: {project_id}\n") - + # Create the adapter try: adapter = create_langfuse_adapter( @@ -41,91 +41,99 @@ def main(): except Exception as e: print(f"❌ Failed to create adapter: {e}") return - + # Example 1: Get recent evaluation rows print("\n📊 Example 1: Get recent evaluation rows") try: - rows = list(adapter.get_evaluation_rows( - limit=5, - from_timestamp=datetime.now() - timedelta(days=7), - include_tool_calls=True, - )) - + rows = list( + adapter.get_evaluation_rows( + limit=5, + from_timestamp=datetime.now() - timedelta(days=7), + include_tool_calls=True, + ) + ) + print(f"Retrieved {len(rows)} evaluation rows") for i, row in enumerate(rows): - print(f" Row {i+1}:") + print(f" Row {i + 1}:") print(f" - ID: {row.input_metadata.row_id if row.input_metadata else 'N/A'}") print(f" - Messages: {len(row.messages)}") print(f" - Has tools: {'Yes' if row.tools else 'No'}") 
print(f" - Ground truth: {'Yes' if row.ground_truth else 'No'}") - + # Show first message content (truncated) if row.messages: content = row.messages[0].content or "" preview = content[:100] + "..." if len(content) > 100 else content print(f" - First message: {preview}") print() - + except Exception as e: print(f"❌ Error retrieving rows: {e}") - + # Example 2: Filter by specific criteria print("\n🔍 Example 2: Filter by specific criteria") try: - rows = list(adapter.get_evaluation_rows( - limit=3, - tags=["production"], # Filter by tags if available - include_tool_calls=True, - )) - + rows = list( + adapter.get_evaluation_rows( + limit=3, + tags=["production"], # Filter by tags if available + include_tool_calls=True, + ) + ) + print(f"Retrieved {len(rows)} rows with 'production' tag") - + except Exception as e: print(f"❌ Error with filtered query: {e}") - + # Example 3: Get specific traces by ID print("\n🎯 Example 3: Get specific traces by ID") try: # Replace with actual trace IDs from your Langfuse deployment trace_ids = ["trace_id_1", "trace_id_2"] # These would be real IDs - - rows = list(adapter.get_evaluation_rows_by_ids( - trace_ids=trace_ids, - include_tool_calls=True, - )) - + + rows = list( + adapter.get_evaluation_rows_by_ids( + trace_ids=trace_ids, + include_tool_calls=True, + ) + ) + print(f"Retrieved {len(rows)} rows by specific IDs") - + except Exception as e: print(f"❌ Error retrieving specific traces: {e}") - + # Example 4: Extract different types of conversations print("\n💬 Example 4: Analyze conversation types") try: rows = list(adapter.get_evaluation_rows(limit=10, include_tool_calls=True)) - + chat_only = [] tool_calling = [] - + for row in rows: - if row.tools and any(msg.tool_calls for msg in row.messages if hasattr(msg, 'tool_calls') and msg.tool_calls): + if row.tools and any( + msg.tool_calls for msg in row.messages if hasattr(msg, "tool_calls") and msg.tool_calls + ): tool_calling.append(row) else: chat_only.append(row) - + 
print(f"Chat-only conversations: {len(chat_only)}") print(f"Tool calling conversations: {len(tool_calling)}") - + # Show example of tool calling conversation if tool_calling: row = tool_calling[0] - print(f"\n🔧 Example tool calling conversation:") + print("\n🔧 Example tool calling conversation:") for i, msg in enumerate(row.messages): - print(f" {i+1}. {msg.role}: {msg.content[:50] if msg.content else '[No content]'}...") - if hasattr(msg, 'tool_calls') and msg.tool_calls: + print(f" {i + 1}. {msg.role}: {msg.content[:50] if msg.content else '[No content]'}...") + if hasattr(msg, "tool_calls") and msg.tool_calls: for tool_call in msg.tool_calls: print(f" 🛠 Tool call: {tool_call}") - + except Exception as e: print(f"❌ Error analyzing conversation types: {e}") @@ -133,11 +141,11 @@ def main(): def demonstrate_evaluation_integration(): """Show how to use Langfuse data with evaluation functions.""" print("\n🧪 Integration with Evaluation Functions") - + # This would typically be in a separate evaluation script try: from eval_protocol.rewards.math import math_reward - + # Create adapter (reuse configuration from main example) adapter = create_langfuse_adapter( public_key=os.getenv("LANGFUSE_PUBLIC_KEY", "your_public_key_here"), @@ -145,13 +153,13 @@ def demonstrate_evaluation_integration(): host=os.getenv("LANGFUSE_HOST", "https://langfuse-web-prod-zfdbl7ykrq-uc.a.run.app"), project_id=os.getenv("LANGFUSE_PROJECT_ID", "cmdj5yxhk0006s6022cyi0prv"), ) - + # Get data and evaluate rows = list(adapter.get_evaluation_rows(limit=3)) - + for i, row in enumerate(rows): - print(f"\nEvaluating row {i+1}:") - + print(f"\nEvaluating row {i + 1}:") + # Only evaluate if we have ground truth if row.ground_truth: try: @@ -164,8 +172,8 @@ def demonstrate_evaluation_integration(): except Exception as e: print(f" ❌ Evaluation failed: {e}") else: - print(f" ⚠️ No ground truth available for evaluation") - + print(" ⚠️ No ground truth available for evaluation") + except ImportError: 
print("Math reward function not available") except Exception as e: @@ -175,25 +183,27 @@ def demonstrate_evaluation_integration(): if __name__ == "__main__": print("🚀 Langfuse Adapter Example") print("=" * 50) - + # Check if credentials are set - if not all([ - os.getenv("LANGFUSE_PUBLIC_KEY"), - os.getenv("LANGFUSE_SECRET_KEY"), - ]): + if not all( + [ + os.getenv("LANGFUSE_PUBLIC_KEY"), + os.getenv("LANGFUSE_SECRET_KEY"), + ] + ): print("⚠️ To run this example with real data, set environment variables:") print(" export LANGFUSE_PUBLIC_KEY='your_public_key'") print(" export LANGFUSE_SECRET_KEY='your_secret_key'") print(" export LANGFUSE_HOST='your_langfuse_host' # optional") print(" export LANGFUSE_PROJECT_ID='your_project_id' # optional") print() - + main() demonstrate_evaluation_integration() - + print("\n✅ Example completed!") print("\nNext steps:") print("1. Set up your Langfuse credentials") print("2. Modify the filters and parameters to match your data") print("3. Integrate with your evaluation pipeline") - print("4. Use the converted EvaluationRow data for training or evaluation") \ No newline at end of file + print("4. Use the converted EvaluationRow data for training or evaluation") diff --git a/examples/aime2025_chat_completion/README.md b/examples/aime2025_chat_completion/README.md index dbe79527..69a6ad6f 100644 --- a/examples/aime2025_chat_completion/README.md +++ b/examples/aime2025_chat_completion/README.md @@ -19,6 +19,3 @@ Environment variables expected: - `FIREWORKS_API_KEY` To scale up, adjust parameters in the decorator (e.g., `threshold_of_success`, `max_dataset_rows`). 
- - - diff --git a/examples/aime2025_chat_completion/__init__.py b/examples/aime2025_chat_completion/__init__.py index 8bcaacfb..470d6936 100644 --- a/examples/aime2025_chat_completion/__init__.py +++ b/examples/aime2025_chat_completion/__init__.py @@ -1,4 +1 @@ __all__ = ["main"] - - - diff --git a/examples/aime2025_chat_completion/main.py b/examples/aime2025_chat_completion/main.py index 92c6dd83..b6d12976 100644 --- a/examples/aime2025_chat_completion/main.py +++ b/examples/aime2025_chat_completion/main.py @@ -64,9 +64,7 @@ def evaluate( score=0.0, reason="No messages provided", is_score_valid=False, - metrics={ - "parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages") - }, + metrics={"parse_status": MetricResult(score=0.0, is_score_valid=False, reason="empty messages")}, ) last_msg = messages[-1] @@ -106,5 +104,3 @@ def evaluate( is_score_valid=is_valid, metrics=metrics, ) - - diff --git a/examples/blackjack_mcp/blackjack_adapter.py b/examples/blackjack_mcp/blackjack_adapter.py index 48c0b6b3..f88bc2cb 100644 --- a/examples/blackjack_mcp/blackjack_adapter.py +++ b/examples/blackjack_mcp/blackjack_adapter.py @@ -32,7 +32,7 @@ def create_environment(self, config: Optional[Dict[str, Any]] = None) -> Blackja natural = config.get("natural") if natural is None: natural = False - print(f"🔍 BlackjackAdapter.create_environment: natural is not set in the config, use False by default") + print("🔍 BlackjackAdapter.create_environment: natural is not set in the config, use False by default") if isinstance(natural, str): natural = natural.lower() == "true" print(f"🔍 BlackjackAdapter.create_environment: natural is a string, convert to boolean: {natural}") @@ -42,7 +42,7 @@ def create_environment(self, config: Optional[Dict[str, Any]] = None) -> Blackja sab = config.get("sab", False) if sab is None: sab = False - print(f"🔍 BlackjackAdapter.create_environment: sab is not set in the config, use False by default") + print("🔍 
BlackjackAdapter.create_environment: sab is not set in the config, use False by default") if isinstance(sab, str): sab = sab.lower() == "true" print(f"🔍 BlackjackAdapter.create_environment: sab is a string, convert to boolean: {sab}") @@ -50,7 +50,7 @@ def create_environment(self, config: Optional[Dict[str, Any]] = None) -> Blackja sab = bool(sab) env = BlackjackEnv(render_mode="ansi", natural=natural, sab=sab) - print(f"🔍 BlackjackAdapter.create_environment: Created BlackjackEnv") + print("🔍 BlackjackAdapter.create_environment: Created BlackjackEnv") return env def create_environment_with_seed( diff --git a/examples/blackjack_mcp/blackjack_mcp.py b/examples/blackjack_mcp/blackjack_mcp.py index f8b0a877..0f40f6b1 100644 --- a/examples/blackjack_mcp/blackjack_mcp.py +++ b/examples/blackjack_mcp/blackjack_mcp.py @@ -71,7 +71,7 @@ def blackjack_move(action: str, ctx: Context) -> Dict[str, Any]: # Validate action if not action or not isinstance(action, str): raise ValueError( - f"Invalid action parameter: '{action}'. " f"Must be a non-empty string. Valid actions: STICK, HIT" + f"Invalid action parameter: '{action}'. Must be a non-empty string. 
Valid actions: STICK, HIT" ) action = action.strip().upper() diff --git a/examples/blackjack_mcp/tests/test_record_and_replay_e2e.py b/examples/blackjack_mcp/tests/test_record_and_replay_e2e.py index 69552c53..8a724565 100644 --- a/examples/blackjack_mcp/tests/test_record_and_replay_e2e.py +++ b/examples/blackjack_mcp/tests/test_record_and_replay_e2e.py @@ -562,7 +562,6 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording # Start server for this test server = _create_test_server(9600) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = multi_env_recording_file @@ -661,7 +660,7 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] print("\n🏁 Validating trajectory termination...") _validate_trajectory_termination(env_recordings, dataset) - print(f"✅ Recording integrity validation completed") + print("✅ Recording integrity validation completed") def _validate_no_repeated_initial_states(env_recordings: Dict, dataset: List[Dict]): @@ -746,9 +745,9 @@ def _validate_state_progression(env_recordings: Dict): try: response_data = json.loads(response) game_states.append(response_data) - print(f" Step {i+1}: Game state {response_data}") + print(f" Step {i + 1}: Game state {response_data}") except json.JSONDecodeError: - pytest.fail(f"❌ Invalid JSON in tool response {i+1} for env {env_idx}: {response}") + pytest.fail(f"❌ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}") # Check that player_sum changes when HIT action is taken for i in range(len(game_states) - 1): @@ -763,21 +762,21 @@ def _validate_state_progression(env_recordings: Dict): if current_player_sum == next_player_sum: pytest.fail( f"❌ STATE PROGRESSION BUG DETECTED in Env {env_idx}: " - f"After HIT action at step {i+1}, player_sum remained {current_player_sum}. " + f"After HIT action at step {i + 1}, player_sum remained {current_player_sum}. " f"When hitting, player should draw a card and player_sum should change. 
" f"Current state: {current_state}, Next state: {next_state}" ) else: print( - f" ✅ Step {i+1}: HIT action changed player_sum from {current_player_sum} to {next_player_sum}" + f" ✅ Step {i + 1}: HIT action changed player_sum from {current_player_sum} to {next_player_sum}" ) elif current_action == "STAND": # STAND action should not change player_sum (dealer's turn) print( - f" ℹ️ Step {i+1}: STAND action - player_sum transition from {current_player_sum} to {next_player_sum}" + f" ℹ️ Step {i + 1}: STAND action - player_sum transition from {current_player_sum} to {next_player_sum}" ) else: - print(f" ⚠️ Step {i+1}: Unknown action '{current_action}' - skipping validation") + print(f" ⚠️ Step {i + 1}: Unknown action '{current_action}' - skipping validation") print(f" ✅ Env {env_idx}: State progression validation completed successfully") @@ -833,7 +832,7 @@ def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): elif terminated_steps == 0: print(f" ⚠️ Warning: No terminated=True found in {total_steps} steps (may be expected for short runs)") else: - print(f" ✅ Found some termination signals - control plane appears to be working") + print(" ✅ Found some termination signals - control plane appears to be working") def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]): @@ -934,7 +933,7 @@ def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]): f"Expected: Substantial trajectories should end with terminated=True." 
) elif last_terminated: - print(f" ✅ Trajectory properly terminated") + print(" ✅ Trajectory properly terminated") else: print(f" ℹ️ Short trajectory ({total_steps} steps) - termination not required") @@ -1019,7 +1018,6 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_recording_file @@ -1144,7 +1142,6 @@ async def test_control_plane_state_querying(multi_env_dataset): # Start server for this test server = _create_test_server(9700) try: - # Create policy with shorter sequence for testing policy = create_blackjack_static_policy(action_sequence=["HIT", "STAND"]) diff --git a/examples/cliff_walking_mcp/cliff_walking_adapter.py b/examples/cliff_walking_mcp/cliff_walking_adapter.py index e8f79fb2..0445d35e 100644 --- a/examples/cliff_walking_mcp/cliff_walking_adapter.py +++ b/examples/cliff_walking_mcp/cliff_walking_adapter.py @@ -29,7 +29,7 @@ def create_environment(self, config: Optional[Dict[str, Any]] = None) -> CliffWa """ print(f"🔍 CliffWalkingAdapter.create_environment: config: {config}") env = CliffWalkingEnv(render_mode="ansi", is_slippery=False) - print(f"🔍 CliffWalkingAdapter.create_environment: Created CliffWalkingEnv") + print("🔍 CliffWalkingAdapter.create_environment: Created CliffWalkingEnv") return env def create_environment_with_seed( diff --git a/examples/cliff_walking_mcp/tests/test_cliff_walking_e2e.py b/examples/cliff_walking_mcp/tests/test_cliff_walking_e2e.py index fc327f62..9d5c6e23 100644 --- a/examples/cliff_walking_mcp/tests/test_cliff_walking_e2e.py +++ b/examples/cliff_walking_mcp/tests/test_cliff_walking_e2e.py @@ -565,7 +565,6 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording # Start server for this test server = _create_test_server(9600) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = multi_env_recording_file @@ -680,7 
+679,7 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] print("\n🏁 Validating trajectory termination...") _validate_trajectory_termination(env_recordings, dataset) - print(f"✅ Recording integrity validation completed") + print("✅ Recording integrity validation completed") def _validate_multi_seed_environments(env_recordings: Dict, dataset: List[Dict]): @@ -760,9 +759,9 @@ def _validate_state_progression(env_recordings: Dict, dataset: List[Dict]): position = response_data.get("position") if position is not None: positions.append(position) - print(f" Step {i+1}: Position {position}") + print(f" Step {i + 1}: Position {position}") except json.JSONDecodeError: - pytest.fail(f"❌ Invalid JSON in tool response {i+1} for env {env_idx}: {response}") + pytest.fail(f"❌ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}") if len(positions) < 2: print(f" Env {env_idx}: Only {len(positions)} valid positions, skipping progression check") @@ -801,7 +800,7 @@ def _validate_state_progression(env_recordings: Dict, dataset: List[Dict]): f"Full position sequence: {positions}" ) else: - print(f" ✅ Valid state progression - all position changes follow Cliff Walking rules") + print(" ✅ Valid state progression - all position changes follow Cliff Walking rules") def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): @@ -855,7 +854,7 @@ def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): elif terminated_steps == 0: print(f" ⚠️ Warning: No terminated=True found in {total_steps} steps (may be expected for short runs)") else: - print(f" ✅ Found some termination signals - control plane appears to be working") + print(" ✅ Found some termination signals - control plane appears to be working") def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]): @@ -960,7 +959,7 @@ def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]): f"Expected: 
Substantial trajectories should end with terminated=True." ) elif last_terminated: - print(f" ✅ Trajectory properly terminated") + print(" ✅ Trajectory properly terminated") else: print(f" ℹ️ Short trajectory ({total_steps} steps) - termination not required") @@ -1045,7 +1044,6 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_recording_file @@ -1173,7 +1171,6 @@ async def test_control_plane_state_querying(multi_env_dataset): # Start server for this test server = _create_test_server(9700) try: - # Create policy with shorter sequence for testing policy = create_cliff_walking_static_policy(action_sequence=["UP", "UP"]) diff --git a/examples/frozen_lake_mcp/frozen_lake_adapter.py b/examples/frozen_lake_mcp/frozen_lake_adapter.py index 4181e64b..f8f197df 100644 --- a/examples/frozen_lake_mcp/frozen_lake_adapter.py +++ b/examples/frozen_lake_mcp/frozen_lake_adapter.py @@ -46,12 +46,12 @@ def create_environment(self, config: Optional[Dict[str, Any]] = None) -> FrozenL desc = generate_random_map(size=grid_size, p=0.8, seed=seed) print(f"🔍 FrozenLakeAdapter.create_environment: Generated map desc: {desc}") else: - print(f"🔍 FrozenLakeAdapter.create_environment: Generating map without seed") + print("🔍 FrozenLakeAdapter.create_environment: Generating map without seed") desc = generate_random_map(size=grid_size, p=0.8) print(f"🔍 FrozenLakeAdapter.create_environment: Generated map desc: {desc}") env = FrozenLakeEnv(desc=desc, is_slippery=False, render_mode="ansi") - print(f"🔍 FrozenLakeAdapter.create_environment: Created FrozenLakeEnv") + print("🔍 FrozenLakeAdapter.create_environment: Created FrozenLakeEnv") return env def create_environment_with_seed( diff --git a/examples/frozen_lake_mcp/rollout_example.py b/examples/frozen_lake_mcp/rollout_example.py index 4f969f0c..31a471a9 100644 --- 
a/examples/frozen_lake_mcp/rollout_example.py +++ b/examples/frozen_lake_mcp/rollout_example.py @@ -90,7 +90,7 @@ async def rollout(self, envs: List[McpGym], policy: SimplePolicy, steps: int = 2 evaluation_rows = [] for i, env in enumerate(envs): - self.logger.info(f"Running rollout {i+1}/{len(envs)}") + self.logger.info(f"Running rollout {i + 1}/{len(envs)}") trajectory = { "environment": env.__class__.__name__, @@ -136,7 +136,7 @@ async def rollout(self, envs: List[McpGym], policy: SimplePolicy, steps: int = 2 break evaluation_rows.append(trajectory) - self.logger.info(f"Rollout {i+1} completed: {trajectory['total_reward']} total reward") + self.logger.info(f"Rollout {i + 1} completed: {trajectory['total_reward']} total reward") return evaluation_rows @@ -147,7 +147,7 @@ def print_trajectory_summary(self, evaluation_rows: List[Dict[str, Any]]): print("=" * 60) for i, traj in enumerate(evaluation_rows): - print(f"\nEvaluation {i+1}:") + print(f"\nEvaluation {i + 1}:") print(f" Environment: {traj['environment']}") print(f" Seed: {traj['seed']}") print(f" Steps: {len(traj['steps'])}") @@ -169,7 +169,7 @@ def print_trajectory_summary(self, evaluation_rows: List[Dict[str, Any]]): else 0 ) - print(f"\nOverall Statistics:") + print("\nOverall Statistics:") print(f" Total Environments: {len(evaluation_rows)}") print(f" Average Reward: {avg_reward:.2f}") print(f" Success Rate: {success_rate:.2%}") diff --git a/examples/frozen_lake_mcp/test_seed_logging.py b/examples/frozen_lake_mcp/test_seed_logging.py index edb1b272..d4dcd203 100644 --- a/examples/frozen_lake_mcp/test_seed_logging.py +++ b/examples/frozen_lake_mcp/test_seed_logging.py @@ -36,7 +36,7 @@ async def test_seed_logging(): # Reset environments to trigger session creation print("🔄 Resetting environments...") observations, tool_schemas, system_prompts = await envs.reset() - print(f"✅ Reset complete") + print("✅ Reset complete") print(f"📊 Observations: {observations}") print(f"🛠️ Tool schemas: 
{len(tool_schemas[0])} tools available") diff --git a/examples/frozen_lake_mcp/test_termination_fix.py b/examples/frozen_lake_mcp/test_termination_fix.py index cdebcd59..10823826 100755 --- a/examples/frozen_lake_mcp/test_termination_fix.py +++ b/examples/frozen_lake_mcp/test_termination_fix.py @@ -50,7 +50,7 @@ async def test_control_plane_separation(): successful_path = ["DOWN", "RIGHT", "RIGHT", "RIGHT", "DOWN", "DOWN"] for i, action in enumerate(successful_path): - print(f"\n--- Step {i+1}: {action} ---") + print(f"\n--- Step {i + 1}: {action} ---") # Execute tool call (data plane) tool_result = await session.call_tool("lake_move", {"action": action}) diff --git a/examples/frozen_lake_mcp/test_validation_logic.py b/examples/frozen_lake_mcp/test_validation_logic.py index 69477db6..8a476929 100644 --- a/examples/frozen_lake_mcp/test_validation_logic.py +++ b/examples/frozen_lake_mcp/test_validation_logic.py @@ -95,7 +95,7 @@ def test_validation_with_existing_data(): print("\n✅ All validations passed - no bugs detected") return True else: - print(f"\n❌ Validation caught bugs (as expected):") + print("\n❌ Validation caught bugs (as expected):") print(f" - Repeated states bug: {'No' if repeated_states_ok else 'Yes'}") print(f" - Control plane sync bug: {'No' if control_plane_ok else 'Yes'}") print(f" - Trajectory termination bug: {'No' if trajectory_termination_ok else 'Yes'}") diff --git a/examples/frozen_lake_mcp/tests/test_frozen_lake_e2e.py b/examples/frozen_lake_mcp/tests/test_frozen_lake_e2e.py index e2c4c78e..bea2ad08 100644 --- a/examples/frozen_lake_mcp/tests/test_frozen_lake_e2e.py +++ b/examples/frozen_lake_mcp/tests/test_frozen_lake_e2e.py @@ -585,7 +585,6 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording # Start server for this test server = _create_test_server(9600) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = multi_env_recording_file @@ -737,9 +736,9 @@ async def 
_validate_recording_integrity(recording_file: str, dataset: List[Dict] response_data = json.loads(response) position = response_data.get("position") positions.append(position) - print(f" Step {i+1}: Position {position}") + print(f" Step {i + 1}: Position {position}") except json.JSONDecodeError: - pytest.fail(f"❌ Invalid JSON in tool response {i+1} for env {env_idx}: {response}") + pytest.fail(f"❌ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}") # TODO: come back to fix this later. # if len(positions) >= 2: @@ -774,7 +773,7 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] print("\n🏁 Validating trajectory termination...") _validate_trajectory_termination(env_recordings, dataset) - print(f"✅ Recording integrity validation completed") + print("✅ Recording integrity validation completed") def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): @@ -841,7 +840,7 @@ def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): print( f"⚠️ WARNING: Env {env_idx}: Position {longest_sequence[0]} repeated {longest_sequence[1]} times starting from step {longest_sequence[2]}." 
) - print(f" This might indicate session state or control plane termination issues.") + print(" This might indicate session state or control plane termination issues.") print(f" All positions: {[pos for _, pos in positions]}") # For FireworksPolicy, log but don't fail the test as LLM behavior can vary # pytest.fail( @@ -907,7 +906,7 @@ def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): elif terminated_steps == 0: print(f" ⚠️ Warning: No terminated=True found in {total_steps} steps (may be expected for short runs)") else: - print(f" ✅ Found some termination signals - control plane appears to be working") + print(" ✅ Found some termination signals - control plane appears to be working") def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]): @@ -1012,7 +1011,7 @@ def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]): f"Expected: Substantial trajectories should end with terminated=True." ) elif last_terminated: - print(f" ✅ Trajectory properly terminated") + print(" ✅ Trajectory properly terminated") else: print(f" ℹ️ Short trajectory ({total_steps} steps) - termination not required") @@ -1098,7 +1097,6 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_recording_file @@ -1227,7 +1225,6 @@ async def test_control_plane_state_querying(multi_env_dataset): # Start server for this test server = _create_test_server(9700) try: - # Create policy with shorter sequence for testing policy = create_frozen_lake_static_policy(action_sequence=["RIGHT", "DOWN"]) diff --git a/examples/lunar_lander_mcp/generate_sample_images.py b/examples/lunar_lander_mcp/generate_sample_images.py index 8057bb46..75f22387 100644 --- a/examples/lunar_lander_mcp/generate_sample_images.py +++ b/examples/lunar_lander_mcp/generate_sample_images.py @@ 
-121,7 +121,7 @@ def generate_sample_trajectory(): print(f"\n📁 Trajectory saved to {output_dir}") print(f" 📊 {len(trajectory_data)} steps recorded") print(f" 🖼️ {len(list(output_dir.glob('*.png')))} images saved") - print(f" 📋 Summary: trajectory_summary.json") + print(" 📋 Summary: trajectory_summary.json") env.close() return output_dir @@ -153,7 +153,7 @@ def save_frame(frame_data: str, output_path: Path, step: int, action: str): if __name__ == "__main__": try: output_dir = generate_sample_trajectory() - print(f"\n✅ Sample trajectory generated successfully!") + print("\n✅ Sample trajectory generated successfully!") print(f"📁 View images in: {output_dir.absolute()}") except Exception as e: diff --git a/examples/lunar_lander_mcp/simple_trajectory_test.py b/examples/lunar_lander_mcp/simple_trajectory_test.py index 18c42dad..802b87fe 100644 --- a/examples/lunar_lander_mcp/simple_trajectory_test.py +++ b/examples/lunar_lander_mcp/simple_trajectory_test.py @@ -90,7 +90,7 @@ async def test_lunar_lander_direct(): actions = ["NOTHING", "FIRE_MAIN", "FIRE_LEFT", "FIRE_RIGHT", "NOTHING"] for i, action in enumerate(actions): - print(f"🎮 Step {i+1}: {action}") + print(f"🎮 Step {i + 1}: {action}") # Call lander_action tool result = await session.call_tool("lander_action", {"action": action}) @@ -113,7 +113,7 @@ async def test_lunar_lander_direct(): "status": response_data.get("status", "Unknown"), } - with open(output_dir / f"step_{i+1:03d}_summary.json", "w") as f: + with open(output_dir / f"step_{i + 1:03d}_summary.json", "w") as f: json.dump(step_summary, f, indent=2) # Save rendered frame if available @@ -123,14 +123,14 @@ async def test_lunar_lander_direct(): image_data = frame_data.split(",")[1] image_bytes = base64.b64decode(image_data) - frame_path = output_dir / f"step_{i+1:03d}_{action.lower()}.png" + frame_path = output_dir / f"step_{i + 1:03d}_{action.lower()}.png" with open(frame_path, "wb") as f: f.write(image_bytes) print(f" 💾 Saved frame: {frame_path}") else: - 
print(f" ⚠️ No rendered frame in response") + print(" ⚠️ No rendered frame in response") else: - print(f" ⚠️ No rendered_frame field in response") + print(" ⚠️ No rendered_frame field in response") except json.JSONDecodeError as e: print(f" ❌ Could not parse response as JSON: {e}") diff --git a/examples/lunar_lander_mcp/test_lunar_lander_conda.py b/examples/lunar_lander_mcp/test_lunar_lander_conda.py index 98d3c491..89f48597 100644 --- a/examples/lunar_lander_mcp/test_lunar_lander_conda.py +++ b/examples/lunar_lander_mcp/test_lunar_lander_conda.py @@ -225,14 +225,14 @@ async def __call__(self, tool_schemas, observations, system_prompts, user_prompt f" Step {step_idx}: control_plane_step is not a dict, type: {type(control_plane_step)}" ) else: - print(f" 🔍 No control plane messages found") + print(" 🔍 No control plane messages found") print(f" ✅ Episode {i} validation passed") print(f"📁 All evaluation data saved to {output_dir}") - print(f" - Episode summaries: episode_*_summary.json") - print(f" - Control plane debug data: episode_*_first_control_plane_debug.json") - print(f" - Rendered frames: episode_*_step_*.png (if available)") + print(" - Episode summaries: episode_*_summary.json") + print(" - Control plane debug data: episode_*_first_control_plane_debug.json") + print(" - Rendered frames: episode_*_step_*.png (if available)") print("🎉 All tests passed! 
Conda isolation working correctly.") return True diff --git a/examples/lunar_lander_mcp/tests/test_lunar_lander_e2e.py b/examples/lunar_lander_mcp/tests/test_lunar_lander_e2e.py index 723b68bb..3bda23aa 100644 --- a/examples/lunar_lander_mcp/tests/test_lunar_lander_e2e.py +++ b/examples/lunar_lander_mcp/tests/test_lunar_lander_e2e.py @@ -618,7 +618,6 @@ async def test_multi_environment_sessions(multi_env_dataset, multi_env_recording # Start server for this test server = _create_test_server(9600) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = multi_env_recording_file @@ -790,9 +789,9 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] "reward": response_data.get("reward", 0.0), } states.append(state_info) - print(f" Step {i+1}: {state_info}") + print(f" Step {i + 1}: {state_info}") except (json.JSONDecodeError, TypeError) as e: - pytest.fail(f"❌ Invalid JSON in tool response {i+1} for env {env_idx}: {response}. Error: {e}") + pytest.fail(f"❌ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}. Error: {e}") # For lunar lander, we expect state to change between steps if len(states) >= 2: @@ -819,7 +818,7 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] print("\n🏁 Validating trajectory termination...") _validate_trajectory_termination(env_recordings, dataset) - print(f"✅ Recording integrity validation completed") + print("✅ Recording integrity validation completed") def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): @@ -899,7 +898,7 @@ def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): print( f"⚠️ WARNING: Env {env_idx}: Position {longest_sequence[0]} repeated {longest_sequence[1]} times starting from step {longest_sequence[2]}." 
) - print(f" This might indicate session state or control plane termination issues.") + print(" This might indicate session state or control plane termination issues.") print(f" All positions: {[pos for _, pos in positions]}") else: print(f" ✅ Env {env_idx}: No repeated states detected - good state progression!") @@ -952,9 +951,9 @@ def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): # f"Expected: At least some episodes should terminate when lander crashes or lands successfully." # ) if terminated_steps == 0: - print(f" ⚠️ Warning: No terminated=True found in (may be expected for short runs)") + print(" ⚠️ Warning: No terminated=True found in (may be expected for short runs)") else: - print(f" ✅ Found some termination signals - control plane appears to be working") + print(" ✅ Found some termination signals - control plane appears to be working") def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]): @@ -1042,11 +1041,11 @@ def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]): if total_steps >= 8 and not last_terminated: print(f" ⚠️ Env {env_idx}: Trajectory has {total_steps} steps but final metadata shows terminated=False.") print( - f" This might indicate: 1) Episode still in progress, 2) Control plane sync issues, or 3) Lander hasn't crashed/landed yet" + " This might indicate: 1) Episode still in progress, 2) Control plane sync issues, or 3) Lander hasn't crashed/landed yet" ) print(f" Last metadata: {last_tool_metadata}") elif last_terminated: - print(f" ✅ Trajectory properly terminated") + print(" ✅ Trajectory properly terminated") else: print(f" ℹ️ Short trajectory ({total_steps} steps) - termination not required") @@ -1103,7 +1102,6 @@ async def test_fireworks_multi_environment_sessions(multi_env_dataset, fireworks # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_recording_file 
@@ -1223,7 +1221,6 @@ async def test_control_plane_state_querying(multi_env_dataset): # Start server for this test server = _create_test_server(9700) try: - # Create policy with shorter sequence for testing policy = create_lunar_lander_static_policy(action_sequence=["FIRE_MAIN", "FIRE_LEFT"]) diff --git a/examples/mcp_agent_filesystem_rl/test_example.py b/examples/mcp_agent_filesystem_rl/test_example.py index ef7c5f73..ff848fc0 100644 --- a/examples/mcp_agent_filesystem_rl/test_example.py +++ b/examples/mcp_agent_filesystem_rl/test_example.py @@ -51,7 +51,6 @@ def test_reward_function_import(): print("Testing reward function import...") # Import EvaluateResult specifically within this function's scope - from eval_protocol.models import EvaluateResult from examples.mcp_agent_filesystem_rl import main as filesystem_rl_main assert hasattr(filesystem_rl_main, "evaluate"), "Reward function 'evaluate' not found in main.py" @@ -106,9 +105,9 @@ def test_reward_function_import(): assert isinstance(result_success, EvaluateResult), "evaluate function did not return an EvaluateResult" # Based on main.py logic, a perfect move should result in score 1.0 - assert ( - result_success.score == 1.0 - ), f"Expected score 1.0 for mock success, got {result_success.score}. Reason: {result_success.reason}" + assert result_success.score == 1.0, ( + f"Expected score 1.0 for mock success, got {result_success.score}. 
Reason: {result_success.reason}" + ) assert result_success.is_score_valid print("✓ Reward function 'evaluate' import and basic validation works") diff --git a/examples/rollout_control_plane_demo.py b/examples/rollout_control_plane_demo.py index 9c327e5e..9f6d2795 100644 --- a/examples/rollout_control_plane_demo.py +++ b/examples/rollout_control_plane_demo.py @@ -98,7 +98,6 @@ async def demonstrate_control_plane_rollout(): patch.object(GeneralMCPVectorEnv, "step") as mock_step, patch.object(GeneralMCPVectorEnv, "close") as mock_close, ): - # Setup mock vector environment mock_env = GeneralMCPVectorEnv(sessions, dataset_rows) mock_env.sessions = sessions @@ -273,14 +272,14 @@ def mock_step_side_effect(tool_calls): # Analyze the trajectory trajectory = trajectories[0] - print(f"Basic Trajectory Info:") + print("Basic Trajectory Info:") print(f" • Total Steps: {trajectory.steps}") print(f" • Total Reward: {trajectory.total_reward}") print(f" • Terminated: {trajectory.terminated}") print(f" • Duration: {trajectory.duration:.3f}s") print() - print(f"Data Plane Analysis (Observations):") + print("Data Plane Analysis (Observations):") print(f" • Observation Count: {len(trajectory.observations)}") for i, obs in enumerate(trajectory.observations): if i == 0: @@ -289,14 +288,14 @@ def mock_step_side_effect(tool_calls): print(f" Step {i}: {obs}") print() - print(f"Control Plane Analysis (Rewards/Termination):") + print("Control Plane Analysis (Rewards/Termination):") print(f" • Reward Count: {len(trajectory.rewards)}") print(f" • Rewards: {trajectory.rewards}") print(f" • Actions: {trajectory.actions}") print() # Validate control plane separation - print(f"Control Plane Separation Validation:") + print("Control Plane Separation Validation:") # Check data plane contains no rewards data_plane_clean = True diff --git a/examples/tau2_mcp/airplane_environment/airline_environment.py b/examples/tau2_mcp/airplane_environment/airline_environment.py index 0c1e2d14..f7c7a920 100644 --- 
a/examples/tau2_mcp/airplane_environment/airline_environment.py +++ b/examples/tau2_mcp/airplane_environment/airline_environment.py @@ -5,6 +5,7 @@ This module implements an AirlineEnvironment that integrates the τ²-Bench simulation pattern (Agent/User/Environment communication) with the MCP-Gym framework. """ + import json import logging import os diff --git a/examples/tau2_mcp/mock_environment/mock_environment.py b/examples/tau2_mcp/mock_environment/mock_environment.py index 85ad84f5..fc255f5f 100644 --- a/examples/tau2_mcp/mock_environment/mock_environment.py +++ b/examples/tau2_mcp/mock_environment/mock_environment.py @@ -5,6 +5,7 @@ This module implements a MockEnvironment that integrates the τ²-Bench simulation pattern (Agent/User/Environment communication) with the MCP-Gym framework. """ + import json import logging import os diff --git a/examples/tau2_mcp/retail_environment/retail_environment.py b/examples/tau2_mcp/retail_environment/retail_environment.py index 425ef785..d2163c0c 100644 --- a/examples/tau2_mcp/retail_environment/retail_environment.py +++ b/examples/tau2_mcp/retail_environment/retail_environment.py @@ -5,6 +5,7 @@ This module implements a RetailEnvironment that integrates the τ²-Bench simulation pattern (Agent/User/Environment communication) with the MCP-Gym framework. 
""" + import json import logging import os diff --git a/examples/tau2_mcp/tests/test_tau2_e2e.py b/examples/tau2_mcp/tests/test_tau2_e2e.py index cb71fab7..ec7c3944 100644 --- a/examples/tau2_mcp/tests/test_tau2_e2e.py +++ b/examples/tau2_mcp/tests/test_tau2_e2e.py @@ -31,7 +31,6 @@ warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*class-based.*config.*") # Set environment variable to suppress pydantic warnings at runtime -import os os.environ["PYTHONWARNINGS"] = "ignore::UserWarning:pydantic,ignore::DeprecationWarning:pydantic" @@ -140,12 +139,12 @@ def start(self) -> None: try: with open(self._log_file_path, "r") as f: log_content = f.read() - print(f"❌ Server failed to start!") + print("❌ Server failed to start!") print(f"📋 Server log ({self._log_file_path}):") print("=" * 50) print(log_content) print("=" * 50) - raise RuntimeError(f"Server failed to start. Check log above for details.") + raise RuntimeError("Server failed to start. Check log above for details.") except Exception as e: stdout, stderr = self.process.communicate() raise RuntimeError(f"Server failed to start. stderr: {stderr}, log error: {e}") @@ -438,9 +437,9 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] } states.append(state_info) - print(f" Step {i+1}: {state_info}") + print(f" Step {i + 1}: {state_info}") except (json.JSONDecodeError, TypeError) as e: - pytest.fail(f"❌ Invalid JSON in tool response {i+1} for env {env_idx}: {response}. Error: {e}") + pytest.fail(f"❌ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}. 
Error: {e}") # For airline, we expect state to remain consistent between steps (same reservation details) if len(states) >= 2: @@ -467,7 +466,7 @@ async def _validate_recording_integrity(recording_file: str, dataset: List[Dict] print("\n🏁 Validating trajectory termination...") _validate_trajectory_termination(env_recordings, dataset) - print(f"✅ Recording integrity validation completed") + print("✅ Recording integrity validation completed") def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): @@ -554,7 +553,7 @@ def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]): print( f"⚠️ WARNING: Env {env_idx}: State {longest_sequence[0]} repeated {longest_sequence[1]} times starting from step {longest_sequence[2]}." ) - print(f" This might indicate session state or control plane termination issues.") + print(" This might indicate session state or control plane termination issues.") print(f" All states: {[state for _, state in reservation_states]}") else: print(f" ✅ Env {env_idx}: No repeated states detected - good state progression!") @@ -598,9 +597,9 @@ def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]): print(f" ℹ️ {missing_envs} environments not recorded (likely terminated immediately)") if terminated_steps == 0: - print(f" ⚠️ Warning: No terminated=True found in metadata (may be expected for short runs)") + print(" ⚠️ Warning: No terminated=True found in metadata (may be expected for short runs)") else: - print(f" ✅ Found some termination signals - control plane appears to be working") + print(" ✅ Found some termination signals - control plane appears to be working") def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]): @@ -688,11 +687,11 @@ def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]): if total_steps >= 8 and not last_terminated: print(f" ⚠️ Env {env_idx}: Trajectory has {total_steps} steps but final metadata shows 
terminated=False.") print( - f" This might indicate: 1) Conversation still in progress, 2) Control plane sync issues, or 3) User still interacting" + " This might indicate: 1) Conversation still in progress, 2) Control plane sync issues, or 3) User still interacting" ) print(f" Last metadata: {last_tool_metadata}") elif last_terminated: - print(f" ✅ Trajectory properly terminated") + print(" ✅ Trajectory properly terminated") else: print(f" ℹ️ Short trajectory ({total_steps} steps) - termination not required") @@ -914,7 +913,6 @@ async def test_fireworks_multi_airline_environment_sessions( # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_airline_recording_file @@ -942,9 +940,9 @@ async def test_fireworks_multi_airline_environment_sessions( duration = time.time() - start_time # Validate results - assert len(evaluation_rows) == len( - multi_env_airline_dataset - ), "Should have evaluation row for each environment" + assert len(evaluation_rows) == len(multi_env_airline_dataset), ( + "Should have evaluation row for each environment" + ) assert all(eval_row.get_steps() > 0 for eval_row in evaluation_rows), "All evaluation rows should have steps" print( @@ -1057,7 +1055,6 @@ async def test_entire_airline_dataset(multi_env_airline_full_dataset, fireworks_ # Start server for this test server = _create_test_server(9700) try: - # Set up recording os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_airline_recording_file @@ -1090,9 +1087,9 @@ async def test_entire_airline_dataset(multi_env_airline_full_dataset, fireworks_ duration = time.time() - start_time # Validate results - assert len(evaluation_rows) == len( - multi_env_airline_full_dataset - ), "Should have evaluation row for each environment" + assert len(evaluation_rows) == len(multi_env_airline_full_dataset), ( + "Should have evaluation row for each environment" + ) assert all(eval_row.get_steps() > 0 for eval_row in 
evaluation_rows), "All evaluation rows should have steps" print( @@ -1211,7 +1208,7 @@ async def test_entire_airline_dataset(multi_env_airline_full_dataset, fireworks_ all_results.append(result) # Summary Statistics - print(f"\n📈 Summary Statistics:") + print("\n📈 Summary Statistics:") avg_score = sum(r["score"] for r in all_results) / len(all_results) if all_results else 0 total_cost = sum(r["cost_info"]["total_cost"] for r in all_results) @@ -1219,7 +1216,7 @@ async def test_entire_airline_dataset(multi_env_airline_full_dataset, fireworks_ f" {policy.model_id}: {avg_score:.2%} success rate ({sum(r['score'] for r in all_results)}/{len(all_results)}) - Cost: ${total_cost:.2f}" ) print(f"\n💰 Total evaluation cost: ${total_cost:.2f}") - print(f"📊 Cost calculation uses actual API usage data.") + print("📊 Cost calculation uses actual API usage data.") def save_results_jsonl( evaluation_records: List[Dict], output_file: str = "evaluation_outputs/all_evaluations.jsonl" @@ -1307,7 +1304,7 @@ def save_evaluation_files(evaluation_records: List[Dict], output_dir: str = "eva print(f"\n📁 Saved evaluation files to: {output_path}") print(f" - {len(evaluation_records)} individual evaluation files") - print(f" - 1 evaluation summary file") + print(" - 1 evaluation summary file") return output_path @@ -1382,7 +1379,7 @@ def save_trajectory_files(trajectory_records: List[Dict], output_dir: str = "tra print(f"\n📁 Saved trajectory files to: {output_path}") print(f" - {len(trajectory_records)} individual trajectory files") - print(f" - 1 trajectory summary file") + print(" - 1 trajectory summary file") return output_path diff --git a/examples/taxi_mcp_complete/local_testing/test_north_star.py b/examples/taxi_mcp_complete/local_testing/test_north_star.py index 3721b4e7..e0a3c75d 100644 --- a/examples/taxi_mcp_complete/local_testing/test_north_star.py +++ b/examples/taxi_mcp_complete/local_testing/test_north_star.py @@ -73,8 +73,8 @@ async def test_north_star_interface(): if 
recording_mode: print(f"📝 Recorded to: {playback_file}") - print(f"💬 OpenAI format: clean_openai_format.jsonl") - print(f"🔄 Run again to test playback mode!") + print("💬 OpenAI format: clean_openai_format.jsonl") + print("🔄 Run again to test playback mode!") else: # Assume ~90s for recording time for speedup calculation (taxi is more complex) estimated_recording_time = 90.0 diff --git a/examples/taxi_mcp_complete/mcp_server/simulation_server.py b/examples/taxi_mcp_complete/mcp_server/simulation_server.py index 45c11aec..22c767d3 100644 --- a/examples/taxi_mcp_complete/mcp_server/simulation_server.py +++ b/examples/taxi_mcp_complete/mcp_server/simulation_server.py @@ -40,8 +40,7 @@ def taxi_move(self, action: str, *, ctx, session_state) -> Dict[str, Any]: # Validate action if not action or not isinstance(action, str): raise ValueError( - f"Invalid action parameter: '{action}'. " - f"Must be a non-empty string. Valid actions: {self.ACTION_NAMES}" + f"Invalid action parameter: '{action}'. Must be a non-empty string. 
Valid actions: {self.ACTION_NAMES}" ) action = action.strip().upper() @@ -133,7 +132,7 @@ def main(): args = parser.parse_args() - print(f"🚀 Starting Taxi Simulation Server") + print("🚀 Starting Taxi Simulation Server") print(f"🌐 Host: {args.host}") print(f"🌐 Port: {args.port}") print("🎯 Framework: Unified SimulationServerBase") diff --git a/examples/trl_integration/working_grpo_example.py b/examples/trl_integration/working_grpo_example.py index 387040cd..7b5412df 100644 --- a/examples/trl_integration/working_grpo_example.py +++ b/examples/trl_integration/working_grpo_example.py @@ -372,7 +372,6 @@ def make_conversation(example): def combine_rewards( reward_adapter_configs: List[Dict[str, Any]], # Each dict: {'adapter': callable, 'weight': float} ) -> Callable[[List[Any], List[str]], List[float]]: # Corrected return type hint - total_weight = sum(c["weight"] for c in reward_adapter_configs) if abs(total_weight - 1.0) > 1e-6: logger.warning(f"Sum of weights is {total_weight}, normalizing to 1.0.") diff --git a/local_evals/model_comparison_eval.ipynb b/local_evals/model_comparison_eval.ipynb index e36dbe6c..231e4d95 100644 --- a/local_evals/model_comparison_eval.ipynb +++ b/local_evals/model_comparison_eval.ipynb @@ -1,1075 +1,1065 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "# Model Comparison Eval Harness: Tau2-Bench Airline\n", - "\n", - "This notebook compares different models on airline customer service scenarios using tau2-bench natural language evaluation.\n", - "\n", - "**Models being compared:**\n", - "- Claude 4 Opus (AnthropicPolicy)\n", - "- GPT 4.1 (OpenAIPolicy)\n", - "- Kimi K2 (FireworksPolicy)\n", - "\n", - "**Evaluation Framework:** tau2-bench with natural language assertions\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Install required packages\n", - "!pip install eval-protocol anthropic 
fireworks-ai tau2-bench pytest-asyncio\n", - "!pip install firectl # For sharing results\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ All imports successful!\n" - ] - }, - { - "data": { - "text/plain": [ - "3" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import asyncio\n", - "import json\n", - "import os\n", - "import time\n", - "from datetime import datetime\n", - "from pathlib import Path\n", - "from typing import Dict, List, Any, Tuple\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "import logging\n", - "from litellm import cost_per_token\n", - "from loguru import logger\n", - "\n", - "# Import eval protocol and tau2-bench\n", - "import eval_protocol as rk\n", - "from eval_protocol import reward_function, EvaluateResult\n", - "from eval_protocol.models import LLMUsageStats\n", - "\n", - "from examples.tau2_mcp.tests.test_tau2_e2e import MCPServerManager\n", - "\n", - "from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator\n", - "from vendor.tau2.data_model.message import (\n", - " SystemMessage,\n", - " AssistantMessage,\n", - " UserMessage,\n", - " ToolMessage,\n", - ")\n", - "\n", - "print(\"✅ All imports successful!\")\n", - "\n", - "logging.basicConfig(level=logging.WARNING, force=True)\n", - "\n", - "logger.remove() # Remove default handler\n", - "logger.add(lambda _: None, level=\"ERROR\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "## 1. Set Up Evaluation Benchmark\n", - "\n", - "First, let's load the evaluation dataset we want to benchmark our models on." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Loaded airline dataset with 50 scenarios\n" - ] - } - ], - "source": [ - "with open(\"datasets/airline.json\", \"r\") as f:\n", - " tau2_eval_dataset = json.load(f)\n", - " # TODO: something here is broken\n", - "\n", - "print(f\"✅ Loaded airline dataset with {len(tau2_eval_dataset)} scenarios\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "## 2. Evaluation Function: Tau2-Bench\n", - "\n", - "Now, let's implement the actual evaluation function (also called a reward function), based on Tau2-Bench. If you haven't heard of Tau2-Bench, it's a customer support benchmark from Sierra AI. Check out more information here: https://github.com/sierra-research/tau2-bench" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@reward_function\n", - "async def airline_eval(messages: List[Any], nl_assertions: List[str] = None, **kwargs) -> EvaluateResult:\n", - " \"\"\"\n", - " Evaluate airline conversation using tau2-bench NLAssertionsEvaluator.\n", - "\n", - " Args:\n", - " messages: Conversation between agent and customer\n", - " nl_assertions: List of natural language assertions to evaluate\n", - " **kwargs: Additional parameters\n", - "\n", - " Returns:\n", - " EvaluateResult with binary pass/fail and detailed assertion breakdown\n", - " \"\"\"\n", - " # Default assertions if none provided\n", - " if nl_assertions is None:\n", - " nl_assertions = [\"The agent handled the customer request appropriately according to airline policy\"]\n", - "\n", - " # Convert Message objects directly to tau2-bench message objects\n", - " trajectory_objects = []\n", - " for msg in messages:\n", - " role = msg.role\n", - " content = msg.content\n", - "\n", - " if role == \"system\":\n", - " 
trajectory_objects.append(SystemMessage(role=role, content=content))\n", - " elif role == \"assistant\":\n", - " trajectory_objects.append(AssistantMessage(role=role, content=content))\n", - " elif role == \"user\":\n", - " trajectory_objects.append(UserMessage(role=role, content=content))\n", - " elif role == \"tool\":\n", - " tool_id = msg.tool_call_id\n", - " trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content))\n", - "\n", - " # Run the synchronous tau2-bench evaluation in a thread pool to avoid blocking\n", - " loop = asyncio.get_event_loop()\n", - " nl_assertions_checks = await loop.run_in_executor(\n", - " None, \n", - " NLAssertionsEvaluator.evaluate_nl_assertions,\n", - " trajectory_objects, \n", - " nl_assertions\n", - " )\n", - "\n", - " all_expectations_met = all(result.met for result in nl_assertions_checks)\n", - " reward = 1.0 if all_expectations_met else 0.0\n", - "\n", - " # Build reason string\n", - " if all_expectations_met:\n", - " reason = f\"All {len(nl_assertions)} natural language assertions passed\"\n", - " else:\n", - " failed_assertions = [nl_assertions[i] for i, result in enumerate(nl_assertions_checks) if not result.met]\n", - " reason = f\"Failed assertions: {failed_assertions}\"\n", - "\n", - " return EvaluateResult(\n", - " score=reward,\n", - " reason=reason,\n", - " metrics={},\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "## 3. 
Set Up Model Policies\n", - "\n", - "Configure the three models we want to compare: Claude 4 Opus, GPT-4.1, and Kimi K2.\n" - ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "# Model Comparison Eval Harness: Tau2-Bench Airline\n", + "\n", + "This notebook compares different models on airline customer service scenarios using tau2-bench natural language evaluation.\n", + "\n", + "**Models being compared:**\n", + "- Claude 4 Opus (AnthropicPolicy)\n", + "- GPT 4.1 (OpenAIPolicy)\n", + "- Kimi K2 (FireworksPolicy)\n", + "\n", + "**Evaluation Framework:** tau2-bench with natural language assertions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install required packages\n", + "!pip install eval-protocol anthropic fireworks-ai tau2-bench pytest-asyncio\n", + "!pip install firectl # For sharing results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ All required API keys are set\n" - ] - } - ], - "source": [ - "# Check for required API keys (set these as environment variables)\n", - "# Example: export ANTHROPIC_API_KEY=your-key-here\n", - "\n", - "required_keys = [\"ANTHROPIC_API_KEY\", \"OPENAI_API_KEY\", \"FIREWORKS_API_KEY\"]\n", - "missing_keys = [key for key in required_keys if not os.getenv(key)]\n", - "\n", - "if missing_keys:\n", - " print(f\"⚠️ Missing API keys: {missing_keys}\")\n", - " print(\"Please set these environment variables:\")\n", - " for key in missing_keys:\n", - " print(f\" export {key}='your-key-here'\")\n", - "else:\n", - " print(\"✅ All required API keys are set\")\n" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ All imports successful!\n" + ] }, { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Model policies created:\n", - " - Claude 4 Sonnet (Anthropic)\n", - " - Kimi K2 (Fireworks)\n" - ] - } - ], - "source": [ - "# Create model policies\n", - "openai_policy = rk.OpenAIPolicy(\n", - " model_id=\"gpt-4.1\",\n", - " temperature=0.1,\n", - " max_tokens=4096,\n", - ")\n", - "\n", - "anthropic_policy = rk.AnthropicPolicy(\n", - " model_id=\"claude-sonnet-4-20250514\",\n", - " temperature=0.1,\n", - " max_tokens=4096,\n", - ")\n", - "\n", - "kimi_policy = rk.FireworksPolicy(\n", - " model_id=\"accounts/fireworks/models/kimi-k2-instruct\",\n", - " temperature=0.1,\n", - " max_tokens=4096,\n", - ")\n", - "\n", - "models_to_test = {\n", - " # \"gpt-4.1\": {\n", - " # \"policy\": openai_policy,\n", - " # \"name\": \"GPT-4.1\",\n", - " # \"provider\": \"OpenAI\"\n", - " # },\n", - " \"claude-sonnet-4\": {\n", - " \"policy\": anthropic_policy,\n", - " \"name\": \"Claude 4 Sonnet\",\n", - " \"provider\": \"Anthropic\"\n", - " },\n", - " \"kimi-k2\": {\n", - " \"policy\": kimi_policy,\n", - " \"name\": \"Kimi K2\", \n", - " \"provider\": \"Fireworks\"\n", - " }\n", - "}\n", - "\n", - "print(\"✅ Model policies created:\")\n", - "for model_id, model_info in models_to_test.items():\n", - " print(f\" - {model_info['name']} ({model_info['provider']})\")\n" + "data": { + "text/plain": [ + "3" ] - }, + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import asyncio\n", + "import json\n", + "import logging\n", + "import os\n", + "import time\n", + "from datetime import datetime\n", + "from pathlib import Path\n", + "from typing import Any, Dict, List, Tuple\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from litellm import cost_per_token\n", + "from loguru import logger\n", + "\n", + "# Import eval protocol and tau2-bench\n", + 
"import eval_protocol as rk\n", + "from eval_protocol import EvaluateResult, reward_function\n", + "from eval_protocol.models import LLMUsageStats\n", + "from examples.tau2_mcp.tests.test_tau2_e2e import MCPServerManager\n", + "from vendor.tau2.data_model.message import (\n", + " AssistantMessage,\n", + " SystemMessage,\n", + " ToolMessage,\n", + " UserMessage,\n", + ")\n", + "from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator\n", + "\n", + "print(\"✅ All imports successful!\")\n", + "\n", + "logging.basicConfig(level=logging.WARNING, force=True)\n", + "\n", + "logger.remove() # Remove default handler\n", + "logger.add(lambda _: None, level=\"ERROR\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## 1. Set Up Evaluation Benchmark\n", + "\n", + "First, let's load the evaluation dataset we want to benchmark our models on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "## 4. Run Evaluations\n", - "\n", - "Now we'll run the airline evaluation on both models and compare their performance.\n", - "\n", - "First, let's set up some code to manager our MCP server. We will run this server later on for our MCP tools to make calls to." - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Loaded airline dataset with 50 scenarios\n" + ] + } + ], + "source": [ + "with open(\"datasets/airline.json\", \"r\") as f:\n", + " tau2_eval_dataset = json.load(f)\n", + " # TODO: something here is broken\n", + "\n", + "print(f\"✅ Loaded airline dataset with {len(tau2_eval_dataset)} scenarios\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## 2. 
Evaluation Function: Tau2-Bench\n", + "\n", + "Now, let's implement the actual evaluation function (also called a reward function), based on Tau2-Bench. If you haven't heard of Tau2-Bench, it's a customer support benchmark from Sierra AI. Check out more information here: https://github.com/sierra-research/tau2-bench" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@reward_function\n", + "async def airline_eval(messages: List[Any], nl_assertions: List[str] = None, **kwargs) -> EvaluateResult:\n", + " \"\"\"\n", + " Evaluate airline conversation using tau2-bench NLAssertionsEvaluator.\n", + "\n", + " Args:\n", + " messages: Conversation between agent and customer\n", + " nl_assertions: List of natural language assertions to evaluate\n", + " **kwargs: Additional parameters\n", + "\n", + " Returns:\n", + " EvaluateResult with binary pass/fail and detailed assertion breakdown\n", + " \"\"\"\n", + " # Default assertions if none provided\n", + " if nl_assertions is None:\n", + " nl_assertions = [\"The agent handled the customer request appropriately according to airline policy\"]\n", + "\n", + " # Convert Message objects directly to tau2-bench message objects\n", + " trajectory_objects = []\n", + " for msg in messages:\n", + " role = msg.role\n", + " content = msg.content\n", + "\n", + " if role == \"system\":\n", + " trajectory_objects.append(SystemMessage(role=role, content=content))\n", + " elif role == \"assistant\":\n", + " trajectory_objects.append(AssistantMessage(role=role, content=content))\n", + " elif role == \"user\":\n", + " trajectory_objects.append(UserMessage(role=role, content=content))\n", + " elif role == \"tool\":\n", + " tool_id = msg.tool_call_id\n", + " trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content))\n", + "\n", + " # Run the synchronous tau2-bench evaluation in a thread pool to avoid blocking\n", + " loop = asyncio.get_event_loop()\n", + " 
nl_assertions_checks = await loop.run_in_executor(\n", + " None, NLAssertionsEvaluator.evaluate_nl_assertions, trajectory_objects, nl_assertions\n", + " )\n", + "\n", + " all_expectations_met = all(result.met for result in nl_assertions_checks)\n", + " reward = 1.0 if all_expectations_met else 0.0\n", + "\n", + " # Build reason string\n", + " if all_expectations_met:\n", + " reason = f\"All {len(nl_assertions)} natural language assertions passed\"\n", + " else:\n", + " failed_assertions = [nl_assertions[i] for i, result in enumerate(nl_assertions_checks) if not result.met]\n", + " reason = f\"Failed assertions: {failed_assertions}\"\n", + "\n", + " return EvaluateResult(\n", + " score=reward,\n", + " reason=reason,\n", + " metrics={},\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## 3. Set Up Model Policies\n", + "\n", + "Configure the three models we want to compare: Claude 4 Opus, GPT-4.1, and Kimi K2.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before we get into the main logic, we'd like to track quality and cost across the different models, so this is a bit of setup for tracking cost. For Kimi K2, we're using the official pricing from Firework's website, since litellm doesn't contain it." 
- ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ All required API keys are set\n" + ] + } + ], + "source": [ + "# Check for required API keys (set these as environment variables)\n", + "# Example: export ANTHROPIC_API_KEY=your-key-here\n", + "\n", + "required_keys = [\"ANTHROPIC_API_KEY\", \"OPENAI_API_KEY\", \"FIREWORKS_API_KEY\"]\n", + "missing_keys = [key for key in required_keys if not os.getenv(key)]\n", + "\n", + "if missing_keys:\n", + " print(f\"⚠️ Missing API keys: {missing_keys}\")\n", + " print(\"Please set these environment variables:\")\n", + " for key in missing_keys:\n", + " print(f\" export {key}='your-key-here'\")\n", + "else:\n", + " print(\"✅ All required API keys are set\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MANUAL_PRICING = {\n", - " \"accounts/fireworks/models/kimi-k2-instruct\": {\n", - " \"input_cost_per_1m\": 0.60, # Estimated based on Fireworks pricing\n", - " \"output_cost_per_1m\": 2.50, # Estimated - Fireworks often uses same price for input/output\n", - " }\n", - "}\n", - "\n", - "def calculate_evaluation_cost(model_id: str, llm_usage_summary: LLMUsageStats) -> Dict[str, Any]:\n", - " input_tokens = llm_usage_summary.prompt_tokens or 0\n", - " output_tokens = llm_usage_summary.completion_tokens or 0\n", - " total_tokens = llm_usage_summary.total_tokens or (input_tokens + output_tokens)\n", - " \n", - " if model_id in MANUAL_PRICING:\n", - " pricing = MANUAL_PRICING[model_id]\n", - " \n", - " input_cost = input_tokens * pricing[\"input_cost_per_1m\"] / 1000000\n", - " output_cost = output_tokens * pricing[\"output_cost_per_1m\"] / 1000000\n", - " total_cost = input_cost + output_cost\n", - " \n", - " cost_source = \"manual_pricing\"\n", - "\n", - " else:\n", - " input_cost, output_cost = cost_per_token(\n", - " model=model_id,\n", - " 
prompt_tokens=input_tokens,\n", - " completion_tokens=output_tokens\n", - " )\n", - " total_cost = input_cost + output_cost\n", - " \n", - " cost_source = \"litellm\"\n", - " \n", - " return {\n", - " \"total_cost\": total_cost,\n", - " \"input_cost\": input_cost,\n", - " \"output_cost\": output_cost,\n", - " \"total_tokens\": total_tokens,\n", - " \"input_tokens\": input_tokens,\n", - " \"output_tokens\": output_tokens,\n", - " \"model_id\": model_id,\n", - " \"cost_source\": cost_source,\n", - " }" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Model policies created:\n", + " - Claude 4 Sonnet (Anthropic)\n", + " - Kimi K2 (Fireworks)\n" + ] + } + ], + "source": [ + "# Create model policies\n", + "openai_policy = rk.OpenAIPolicy(\n", + " model_id=\"gpt-4.1\",\n", + " temperature=0.1,\n", + " max_tokens=4096,\n", + ")\n", + "\n", + "anthropic_policy = rk.AnthropicPolicy(\n", + " model_id=\"claude-sonnet-4-20250514\",\n", + " temperature=0.1,\n", + " max_tokens=4096,\n", + ")\n", + "\n", + "kimi_policy = rk.FireworksPolicy(\n", + " model_id=\"accounts/fireworks/models/kimi-k2-instruct\",\n", + " temperature=0.1,\n", + " max_tokens=4096,\n", + ")\n", + "\n", + "models_to_test = {\n", + " # \"gpt-4.1\": {\n", + " # \"policy\": openai_policy,\n", + " # \"name\": \"GPT-4.1\",\n", + " # \"provider\": \"OpenAI\"\n", + " # },\n", + " \"claude-sonnet-4\": {\"policy\": anthropic_policy, \"name\": \"Claude 4 Sonnet\", \"provider\": \"Anthropic\"},\n", + " \"kimi-k2\": {\"policy\": kimi_policy, \"name\": \"Kimi K2\", \"provider\": \"Fireworks\"},\n", + "}\n", + "\n", + "print(\"✅ Model policies created:\")\n", + "for model_id, model_info in models_to_test.items():\n", + " print(f\" - {model_info['name']} ({model_info['provider']})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## 4. 
Run Evaluations\n", + "\n", + "Now we'll run the airline evaluation on both models and compare their performance.\n", + "\n", + "First, let's set up some code to manage our MCP server. We will run this server later on for our MCP tools to make calls to." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we get into the main logic, we'd like to track quality and cost across the different models, so this is a bit of setup for tracking cost. For Kimi K2, we're using the official pricing from Fireworks' website, since litellm doesn't contain it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MANUAL_PRICING = {\n", + " \"accounts/fireworks/models/kimi-k2-instruct\": {\n", + " \"input_cost_per_1m\": 0.60, # Estimated based on Fireworks pricing\n", + " \"output_cost_per_1m\": 2.50, # Estimated - Fireworks often uses same price for input/output\n", + " }\n", + "}\n", + "\n", + "\n", + "def calculate_evaluation_cost(model_id: str, llm_usage_summary: LLMUsageStats) -> Dict[str, Any]:\n", + " input_tokens = llm_usage_summary.prompt_tokens or 0\n", + " output_tokens = llm_usage_summary.completion_tokens or 0\n", + " total_tokens = llm_usage_summary.total_tokens or (input_tokens + output_tokens)\n", + "\n", + " if model_id in MANUAL_PRICING:\n", + " pricing = MANUAL_PRICING[model_id]\n", + "\n", + " input_cost = input_tokens * pricing[\"input_cost_per_1m\"] / 1000000\n", + " output_cost = output_tokens * pricing[\"output_cost_per_1m\"] / 1000000\n", + " total_cost = input_cost + output_cost\n", + "\n", + " cost_source = \"manual_pricing\"\n", + "\n", + " else:\n", + " input_cost, output_cost = cost_per_token(\n", + " model=model_id, prompt_tokens=input_tokens, completion_tokens=output_tokens\n", + " )\n", + " total_cost = input_cost + output_cost\n", + "\n", + " cost_source = \"litellm\"\n", + "\n", + " return {\n", + " \"total_cost\": total_cost,\n", + " \"input_cost\":
input_cost,\n", + " \"output_cost\": output_cost,\n", + " \"total_tokens\": total_tokens,\n", + " \"input_tokens\": input_tokens,\n", + " \"output_tokens\": output_tokens,\n", + " \"model_id\": model_id,\n", + " \"cost_source\": cost_source,\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below is our core logic for running the Tau2-bench eval for a single model. We use the eval protocol framework to do rk.make() and rk.rollout()." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "async def run_model_evaluation(model_id: str, model_info: Dict, dataset: List[Dict]) -> Tuple[List[Dict], List[Dict]]:\n", + " \"\"\"\n", + " Run evaluation for a single model on the airline dataset.\n", + "\n", + " Returns:\n", + " Tuple of (evaluation_results, evaluation_records)\n", + " \"\"\"\n", + " print(f\"\\n🧪 Starting evaluation for {model_info['name']}...\")\n", + "\n", + " # Use context manager for automatic cleanup even on exceptions\n", + " with MCPServerManager(\"../examples/tau2_mcp/server.py\", port=8000, domain=\"airline\") as server:\n", + " policy = model_info[\"policy\"]\n", + "\n", + " envs = await rk.make(\n", + " \"http://localhost:8000/mcp/\",\n", + " dataset=dataset,\n", + " model_id=policy.model_id,\n", + " )\n", + "\n", + " print(f\"📊 Created {len(envs.sessions)} environment sessions\")\n", + "\n", + " start_time = time.time()\n", + " evaluation_rows = await rk.rollout(envs, policy=policy, steps=30, max_concurrent_rollouts=8)\n", + " duration = time.time() - start_time\n", + "\n", + " print(f\"✅ Completed {len(evaluation_rows)} evaluation rows in {duration:.2f}s\")\n", + "\n", + " # Create a helper function to process each evaluation row\n", + " async def process_evaluation_row(i: int, eval_row, dataset_item):\n", + " nl_assertions = dataset_item[\"assertions\"]\n", + "\n", + " # Run tau2-bench evaluation (now async and parallelizable!)\n", + " eval_result =
await airline_eval(eval_row.messages, nl_assertions)\n", + "\n", + " # Calculate cost using existing LLMUsageStats and LiteLLM/manual pricing\n", + " llm_usage = eval_row.llm_usage_summary\n", + " print(f\" 📊 LLM Usage for {dataset_item['id']}: {llm_usage}\") # Debug: show actual usage\n", + " cost_info = calculate_evaluation_cost(policy.model_id, llm_usage)\n", + "\n", + " num_assertions = len(nl_assertions)\n", + "\n", + " # Create evaluation result\n", + " result = {\n", + " \"scenario_id\": dataset_item[\"id\"],\n", + " \"model_id\": policy.model_id,\n", + " \"score\": eval_result.score,\n", + " \"num_assertions\": num_assertions,\n", + " \"cost_info\": cost_info, # Include cost information in results\n", + " }\n", + "\n", + " # Create comprehensive evaluation record\n", + " evaluation_record = {\n", + " \"model_id\": policy.model_id,\n", + " \"scenario_id\": dataset_item[\"id\"],\n", + " \"conversation_history\": eval_row.messages,\n", + " \"evaluation\": {\n", + " \"score\": eval_result.score,\n", + " \"num_assertions\": num_assertions,\n", + " \"reason\": eval_result.reason,\n", + " \"assertions\": [\n", + " {\n", + " \"assertion\": assertion,\n", + " \"passed\": eval_result.score > 0, # All pass or all fail for this simple implementation\n", + " }\n", + " for assertion in nl_assertions\n", + " ],\n", + " },\n", + " \"cost_info\": cost_info, # Add cost information to evaluation record\n", + " \"timestamp\": datetime.now().isoformat(),\n", + " }\n", + "\n", + " print(f\" 📋 {result['scenario_id']}: {result['score']:.1f}, total {result['num_assertions']} assertions)\")\n", + " return result, evaluation_record\n", + "\n", + " # Process all evaluation rows in parallel using asyncio.gather\n", + " print(f\"🚀 Processing {len(evaluation_rows)} evaluation row evaluations in parallel...\")\n", + " eval_start_time = time.time()\n", + "\n", + " tasks = [process_evaluation_row(i, eval_row, dataset[i]) for i, eval_row in enumerate(evaluation_rows)]\n", + "\n", + " # Run 
all evaluations concurrently\n", + " results_and_records = await asyncio.gather(*tasks)\n", + "\n", + " eval_duration = time.time() - eval_start_time\n", + " print(f\"✅ Completed parallel evaluations in {eval_duration:.2f}s\")\n", + "\n", + " # Separate results and evaluation records\n", + " results = []\n", + " evaluation_records = []\n", + " for result, evaluation_record in results_and_records:\n", + " results.append(result)\n", + " evaluation_records.append(evaluation_record)\n", + "\n", + " await envs.close()\n", + " # Server cleanup happens automatically via context manager\n", + "\n", + " return results, evaluation_records" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below is our core logic for running the Tau2-bench eval for a single model. We use the eval protocol framework to do rk.make() and rk.rollout(), " - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "🧪 Starting evaluation for Claude 4 Sonnet...\n", + "✅ Server started successfully on port 8000\n", + "📊 Created 50 environment sessions\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "async def run_model_evaluation(model_id: str, model_info: Dict, dataset: List[Dict]) -> Tuple[List[Dict], List[Dict]]:\n", - " \"\"\"\n", - " Run evaluation for a single model on the airline dataset.\n", - " \n", - " Returns:\n", - " Tuple of (evaluation_results, evaluation_records)\n", - " \"\"\"\n", - " print(f\"\\n🧪 Starting evaluation for {model_info['name']}...\")\n", - "\n", - " # Use context manager for automatic cleanup even on exceptions\n", - " with MCPServerManager(\"../examples/tau2_mcp/server.py\", port=8000, domain=\"airline\") as server:\n", - " policy = model_info[\"policy\"]\n", - " \n", - " envs = await rk.make(\n", - " \"http://localhost:8000/mcp/\",\n", - " dataset=dataset, \n", - " 
model_id=policy.model_id,\n", - " )\n", - " \n", - " print(f\"📊 Created {len(envs.sessions)} environment sessions\")\n", - " \n", - " start_time = time.time()\n", - " evaluation_rows = await rk.rollout(envs, policy=policy, steps=30, max_concurrent_rollouts=8)\n", - " duration = time.time() - start_time\n", - " \n", - " print(f\"✅ Completed {len(evaluation_rows)} evaluation rows in {duration:.2f}s\")\n", - " \n", - " # Create a helper function to process each evaluation row\n", - " async def process_evaluation_row(i: int, eval_row, dataset_item):\n", - " nl_assertions = dataset_item[\"assertions\"]\n", - " \n", - " # Run tau2-bench evaluation (now async and parallelizable!)\n", - " eval_result = await airline_eval(eval_row.messages, nl_assertions)\n", - " \n", - " # Calculate cost using existing LLMUsageStats and LiteLLM/manual pricing\n", - " llm_usage = eval_row.llm_usage_summary\n", - " print(f\" 📊 LLM Usage for {dataset_item['id']}: {llm_usage}\") # Debug: show actual usage\n", - " cost_info = calculate_evaluation_cost(policy.model_id, llm_usage)\n", - "\n", - " num_assertions = len(nl_assertions)\n", - "\n", - " # Create evaluation result\n", - " result = {\n", - " \"scenario_id\": dataset_item[\"id\"],\n", - " \"model_id\": policy.model_id,\n", - " \"score\": eval_result.score,\n", - " \"num_assertions\": num_assertions,\n", - " \"cost_info\": cost_info, # Include cost information in results\n", - " }\n", - " \n", - " # Create comprehensive evaluation record\n", - " evaluation_record = {\n", - " \"model_id\": policy.model_id,\n", - " \"scenario_id\": dataset_item[\"id\"],\n", - " \"conversation_history\": eval_row.messages,\n", - " \"evaluation\": {\n", - " \"score\": eval_result.score,\n", - " \"num_assertions\": num_assertions,\n", - " \"reason\": eval_result.reason,\n", - " \"assertions\": [\n", - " {\n", - " \"assertion\": assertion,\n", - " \"passed\": eval_result.score > 0 # All pass or all fail for this simple implementation\n", - " }\n", - " for 
assertion in nl_assertions\n", - " ]\n", - " },\n", - " \"cost_info\": cost_info, # Add cost information to evaluation record\n", - " \"timestamp\": datetime.now().isoformat(),\n", - " }\n", - " \n", - " print(f\" 📋 {result['scenario_id']}: {result['score']:.1f}, total {result['num_assertions']} assertions)\")\n", - " return result, evaluation_record\n", - " \n", - " # Process all evaluation rows in parallel using asyncio.gather\n", - " print(f\"🚀 Processing {len(evaluation_rows)} evaluation row evaluations in parallel...\")\n", - " eval_start_time = time.time()\n", - " \n", - " tasks = [\n", - " process_evaluation_row(i, eval_row, dataset[i]) \n", - " for i, eval_row in enumerate(evaluation_rows)\n", - " ]\n", - " \n", - " # Run all evaluations concurrently\n", - " results_and_records = await asyncio.gather(*tasks)\n", - " \n", - " eval_duration = time.time() - eval_start_time\n", - " print(f\"✅ Completed parallel evaluations in {eval_duration:.2f}s\")\n", - " \n", - " # Separate results and evaluation records\n", - " results = []\n", - " evaluation_records = []\n", - " for result, evaluation_record in results_and_records:\n", - " results.append(result)\n", - " evaluation_records.append(evaluation_record)\n", - " \n", - " await envs.close()\n", - " # Server cleanup happens automatically via context manager\n", - " \n", - " return results, evaluation_records" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:eval_protocol.mcp.client.connection:Session c581b1937dfd10fa4e177cc027a41035: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Flight HAT030 not available on date 2024-05-13. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. 
Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session aa8e35d6b8cfee9df34e24b405b60f94: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 41dba4c12d152158564c1e49f986c220: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Certificate cannot be used to update reservation. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 105e5b441bcc0be055a231d0189ee750: Invalid JSON from book_reservation: Error executing tool book_reservation: Payment amount does not add up, total price is 290, but paid 304. 
Error: Expecting value: line 1 column 1 (char 0)\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "🧪 Starting evaluation for Claude 4 Sonnet...\n", - "✅ Server started successfully on port 8000\n", - "📊 Created 50 environment sessions\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:eval_protocol.mcp.client.connection:Session c581b1937dfd10fa4e177cc027a41035: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Flight HAT030 not available on date 2024-05-13. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session aa8e35d6b8cfee9df34e24b405b60f94: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Gift card balance is not enough. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 84de98d36b6446b307f60fe1e534a067: Invalid JSON from update_reservation_baggages: Error executing tool update_reservation_baggages: Gift card balance is not enough. 
Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 41dba4c12d152158564c1e49f986c220: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Certificate cannot be used to update reservation. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 105e5b441bcc0be055a231d0189ee750: Invalid JSON from book_reservation: Error executing tool book_reservation: Payment amount does not add up, total price is 290, but paid 304. Error: Expecting value: line 1 column 1 (char 0)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🧹 Closing 50 MCP sessions...\n", - "✅ All MCP sessions closed.\n", - "✅ Completed 50 trajectories in 438.92s\n", - "🚀 Processing 50 trajectory evaluations in parallel...\n", - " 📊 LLM Usage for airline_task_6: {'prompt_tokens': 11809, 'completion_tokens': 439, 'total_tokens': 12248}\n", - " 📋 airline_task_6: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_1: {'prompt_tokens': 48521, 'completion_tokens': 465, 'total_tokens': 48986}\n", - " 📋 airline_task_1: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_0: {'prompt_tokens': 18067, 'completion_tokens': 255, 'total_tokens': 18322}\n", - " 📋 airline_task_0: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_10: {'prompt_tokens': 70113, 'completion_tokens': 1132, 'total_tokens': 71245}\n", - " 📋 airline_task_10: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_13: {'prompt_tokens': 73350, 'completion_tokens': 1136, 'total_tokens': 74486}\n", - " 📋 airline_task_13: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_5: {'prompt_tokens': 31643, 'completion_tokens': 416, 'total_tokens': 32059}\n", - " 📋 airline_task_5: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_3: {'prompt_tokens': 18289, 'completion_tokens': 281, 'total_tokens': 18570}\n", - " 📋 
airline_task_3: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_4: {'prompt_tokens': 47856, 'completion_tokens': 838, 'total_tokens': 48694}\n", - " 📋 airline_task_4: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_12: {'prompt_tokens': 32974, 'completion_tokens': 545, 'total_tokens': 33519}\n", - " 📋 airline_task_12: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_19: {'prompt_tokens': 31917, 'completion_tokens': 452, 'total_tokens': 32369}\n", - " 📋 airline_task_19: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_16: {'prompt_tokens': 34448, 'completion_tokens': 748, 'total_tokens': 35196}\n", - " 📋 airline_task_16: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_9: {'prompt_tokens': 48912, 'completion_tokens': 825, 'total_tokens': 49737}\n", - " 📋 airline_task_9: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_15: {'prompt_tokens': 56489, 'completion_tokens': 949, 'total_tokens': 57438}\n", - " 📋 airline_task_15: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_11: {'prompt_tokens': 32715, 'completion_tokens': 395, 'total_tokens': 33110}\n", - " 📋 airline_task_11: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_2: {'prompt_tokens': 64625, 'completion_tokens': 925, 'total_tokens': 65550}\n", - " 📋 airline_task_2: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_14: {'prompt_tokens': 105792, 'completion_tokens': 1297, 'total_tokens': 107089}\n", - " 📋 airline_task_14: 0.0, total 5 assertions)\n", - " 📊 LLM Usage for airline_task_7: {'prompt_tokens': 49692, 'completion_tokens': 518, 'total_tokens': 50210}\n", - " 📋 airline_task_7: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_26: {'prompt_tokens': 18258, 'completion_tokens': 498, 'total_tokens': 18756}\n", - " 📋 airline_task_26: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_8: {'prompt_tokens': 56129, 'completion_tokens': 939, 'total_tokens': 57068}\n", - " 📋 
airline_task_8: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_17: {'prompt_tokens': 57622, 'completion_tokens': 710, 'total_tokens': 58332}\n", - " 📋 airline_task_17: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_28: {'prompt_tokens': 18305, 'completion_tokens': 519, 'total_tokens': 18824}\n", - " 📋 airline_task_28: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_22: {'prompt_tokens': 102233, 'completion_tokens': 1630, 'total_tokens': 103863}\n", - " 📋 airline_task_22: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_20: {'prompt_tokens': 75987, 'completion_tokens': 1169, 'total_tokens': 77156}\n", - " 📋 airline_task_20: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_31: {'prompt_tokens': 17973, 'completion_tokens': 508, 'total_tokens': 18481}\n", - " 📋 airline_task_31: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_18: {'prompt_tokens': 290410, 'completion_tokens': 3217, 'total_tokens': 293627}\n", - " 📋 airline_task_18: 0.0, total 6 assertions)\n", - " 📊 LLM Usage for airline_task_27: {'prompt_tokens': 91697, 'completion_tokens': 900, 'total_tokens': 92597}\n", - " 📋 airline_task_27: 1.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_29: {'prompt_tokens': 45098, 'completion_tokens': 921, 'total_tokens': 46019}\n", - " 📋 airline_task_29: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_30: {'prompt_tokens': 34284, 'completion_tokens': 851, 'total_tokens': 35135}\n", - " 📋 airline_task_30: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_25: {'prompt_tokens': 34098, 'completion_tokens': 791, 'total_tokens': 34889}\n", - " 📋 airline_task_25: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_32: {'prompt_tokens': 25880, 'completion_tokens': 505, 'total_tokens': 26385}\n", - " 📋 airline_task_32: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_21: {'prompt_tokens': 91852, 'completion_tokens': 1140, 'total_tokens': 
92992}\n", - " 📋 airline_task_21: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_34: {'prompt_tokens': 33951, 'completion_tokens': 1036, 'total_tokens': 34987}\n", - " 📋 airline_task_34: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_24: {'prompt_tokens': 64616, 'completion_tokens': 1547, 'total_tokens': 66163}\n", - " 📋 airline_task_24: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_36: {'prompt_tokens': 27295, 'completion_tokens': 347, 'total_tokens': 27642}\n", - " 📋 airline_task_36: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_41: {'prompt_tokens': 66964, 'completion_tokens': 577, 'total_tokens': 67541}\n", - " 📋 airline_task_41: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_42: {'prompt_tokens': 86379, 'completion_tokens': 951, 'total_tokens': 87330}\n", - " 📋 airline_task_42: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_40: {'prompt_tokens': 11763, 'completion_tokens': 229, 'total_tokens': 11992}\n", - " 📋 airline_task_40: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_33: {'prompt_tokens': 43097, 'completion_tokens': 814, 'total_tokens': 43911}\n", - " 📋 airline_task_33: 1.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_45: {'prompt_tokens': 5584, 'completion_tokens': 62, 'total_tokens': 5646}\n", - " 📋 airline_task_45: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_46: {'prompt_tokens': 12057, 'completion_tokens': 322, 'total_tokens': 12379}\n", - " 📋 airline_task_46: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_23: {'prompt_tokens': 134261, 'completion_tokens': 2398, 'total_tokens': 136659}\n", - " 📋 airline_task_23: 1.0, total 8 assertions)\n", - " 📊 LLM Usage for airline_task_47: {'prompt_tokens': 17987, 'completion_tokens': 221, 'total_tokens': 18208}\n", - " 📋 airline_task_47: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_38: {'prompt_tokens': 40105, 'completion_tokens': 852, 
'total_tokens': 40957}\n", - " 📋 airline_task_38: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_35: {'prompt_tokens': 56123, 'completion_tokens': 1211, 'total_tokens': 57334}\n", - " 📋 airline_task_35: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_49: {'prompt_tokens': 11756, 'completion_tokens': 332, 'total_tokens': 12088}\n", - " 📋 airline_task_49: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_43: {'prompt_tokens': 56223, 'completion_tokens': 554, 'total_tokens': 56777}\n", - " 📋 airline_task_43: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_48: {'prompt_tokens': 18950, 'completion_tokens': 592, 'total_tokens': 19542}\n", - " 📋 airline_task_48: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_37: {'prompt_tokens': 50314, 'completion_tokens': 870, 'total_tokens': 51184}\n", - " 📋 airline_task_37: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_39: {'prompt_tokens': 100087, 'completion_tokens': 1072, 'total_tokens': 101159}\n", - " 📋 airline_task_39: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_44: {'prompt_tokens': 146859, 'completion_tokens': 1910, 'total_tokens': 148769}\n", - " 📋 airline_task_44: 0.0, total 5 assertions)\n", - "✅ Completed parallel evaluations in 16.19s\n", - "🧹 Closing 50 MCP sessions...\n", - "✅ All MCP sessions closed.\n", - "🛑 Stopping server on port 8000...\n", - "🧹 Cleaned up log file: /Users/derekxu/Documents/code/python-sdk/local_evals/server_output_airline_8000.log\n", - "\n", - "🧪 Starting evaluation for Kimi K2...\n", - "✅ Server started successfully on port 8000\n", - "📊 Created 50 environment sessions\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:eval_protocol.mcp.client.connection:Session 77f6847cd7f4eaa908955b68fc08b75e: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation L7X4P9 not found. 
Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Control plane status endpoint timed out after 3.0s\n", - "WARNING:eval_protocol.mcp.client.connection:Session 0de216038acb0986989909c3b22b5373: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation 45698213 not found. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Session 0de216038acb0986989909c3b22b5373: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation Q7ZB34 not found. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Control plane reward endpoint timed out after 3.0s\n", - "WARNING:eval_protocol.mcp.client.connection:Session 20dd3f68f9165c4cc4bd81aec770c9d4: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Payment method not found. Error: Expecting value: line 1 column 1 (char 0)\n", - "WARNING:eval_protocol.mcp.client.connection:Control plane status endpoint timed out after 3.0s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🧹 Closing 50 MCP sessions...\n", - "✅ All MCP sessions closed.\n", - "✅ Completed 50 trajectories in 373.16s\n", - "🚀 Processing 50 trajectory evaluations in parallel...\n", - " 📊 LLM Usage for airline_task_0: {'prompt_tokens': 10394, 'completion_tokens': 348, 'total_tokens': 10742}\n", - " 📋 airline_task_0: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_1: {'prompt_tokens': 42103, 'completion_tokens': 192, 'total_tokens': 42295}\n", - " 📋 airline_task_1: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_6: {'prompt_tokens': 4932, 'completion_tokens': 38, 'total_tokens': 4970}\n", - " 📋 airline_task_6: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_13: {'prompt_tokens': 38788, 'completion_tokens': 663, 'total_tokens': 
39451}\n", - " 📋 airline_task_13: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_10: {'prompt_tokens': 43693, 'completion_tokens': 366, 'total_tokens': 44059}\n", - " 📋 airline_task_10: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_5: {'prompt_tokens': 33828, 'completion_tokens': 479, 'total_tokens': 34307}\n", - " 📋 airline_task_5: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_3: {'prompt_tokens': 15963, 'completion_tokens': 230, 'total_tokens': 16193}\n", - " 📋 airline_task_3: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_12: {'prompt_tokens': 37553, 'completion_tokens': 577, 'total_tokens': 38130}\n", - " 📋 airline_task_12: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_4: {'prompt_tokens': 55540, 'completion_tokens': 610, 'total_tokens': 56150}\n", - " 📋 airline_task_4: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_19: {'prompt_tokens': 16020, 'completion_tokens': 118, 'total_tokens': 16138}\n", - " 📋 airline_task_19: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_2: {'prompt_tokens': 37009, 'completion_tokens': 243, 'total_tokens': 37252}\n", - " 📋 airline_task_2: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_11: {'prompt_tokens': 4766, 'completion_tokens': 146, 'total_tokens': 4912}\n", - " 📋 airline_task_11: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_15: {'prompt_tokens': 57885, 'completion_tokens': 533, 'total_tokens': 58418}\n", - " 📋 airline_task_15: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_16: {'prompt_tokens': 30439, 'completion_tokens': 429, 'total_tokens': 30868}\n", - " 📋 airline_task_16: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_9: {'prompt_tokens': 35496, 'completion_tokens': 376, 'total_tokens': 35872}\n", - " 📋 airline_task_9: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_17: {'prompt_tokens': 49670, 'completion_tokens': 610, 'total_tokens': 
50280}\n", - " 📋 airline_task_17: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_8: {'prompt_tokens': 49402, 'completion_tokens': 510, 'total_tokens': 49912}\n", - " 📋 airline_task_8: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_7: {'prompt_tokens': 51445, 'completion_tokens': 275, 'total_tokens': 51720}\n", - " 📋 airline_task_7: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_26: {'prompt_tokens': 15790, 'completion_tokens': 426, 'total_tokens': 16216}\n", - " 📋 airline_task_26: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_14: {'prompt_tokens': 59997, 'completion_tokens': 1031, 'total_tokens': 61028}\n", - " 📋 airline_task_14: 0.0, total 5 assertions)\n", - " 📊 LLM Usage for airline_task_21: {'prompt_tokens': 96182, 'completion_tokens': 603, 'total_tokens': 96785}\n", - " 📋 airline_task_21: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_22: {'prompt_tokens': 28531, 'completion_tokens': 143, 'total_tokens': 28674}\n", - " 📋 airline_task_22: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_25: {'prompt_tokens': 10238, 'completion_tokens': 138, 'total_tokens': 10376}\n", - " 📋 airline_task_25: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_18: {'prompt_tokens': 85908, 'completion_tokens': 689, 'total_tokens': 86597}\n", - " 📋 airline_task_18: 1.0, total 6 assertions)\n", - " 📊 LLM Usage for airline_task_20: {'prompt_tokens': 42172, 'completion_tokens': 548, 'total_tokens': 42720}\n", - " 📋 airline_task_20: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_27: {'prompt_tokens': 80408, 'completion_tokens': 488, 'total_tokens': 80896}\n", - " 📋 airline_task_27: 1.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_30: {'prompt_tokens': 30393, 'completion_tokens': 396, 'total_tokens': 30789}\n", - " 📋 airline_task_30: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_24: {'prompt_tokens': 63523, 'completion_tokens': 731, 
'total_tokens': 64254}\n", - " 📋 airline_task_24: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_34: {'prompt_tokens': 23049, 'completion_tokens': 485, 'total_tokens': 23534}\n", - " 📋 airline_task_34: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_28: {'prompt_tokens': 10066, 'completion_tokens': 360, 'total_tokens': 10426}\n", - " 📋 airline_task_28: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_31: {'prompt_tokens': 16464, 'completion_tokens': 226, 'total_tokens': 16690}\n", - " 📋 airline_task_31: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_36: {'prompt_tokens': 10772, 'completion_tokens': 202, 'total_tokens': 10974}\n", - " 📋 airline_task_36: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_40: {'prompt_tokens': 10333, 'completion_tokens': 114, 'total_tokens': 10447}\n", - " 📋 airline_task_40: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_33: {'prompt_tokens': 30350, 'completion_tokens': 393, 'total_tokens': 30743}\n", - " 📋 airline_task_33: 1.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_29: {'prompt_tokens': 38672, 'completion_tokens': 501, 'total_tokens': 39173}\n", - " 📋 airline_task_29: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_41: {'prompt_tokens': 49179, 'completion_tokens': 227, 'total_tokens': 49406}\n", - " 📋 airline_task_41: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_32: {'prompt_tokens': 22825, 'completion_tokens': 297, 'total_tokens': 23122}\n", - " 📋 airline_task_32: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_42: {'prompt_tokens': 84720, 'completion_tokens': 491, 'total_tokens': 85211}\n", - " 📋 airline_task_42: 1.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_37: {'prompt_tokens': 61432, 'completion_tokens': 572, 'total_tokens': 62004}\n", - " 📋 airline_task_37: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_46: {'prompt_tokens': 5083, 'completion_tokens': 
83, 'total_tokens': 5166}\n", - " 📋 airline_task_46: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_47: {'prompt_tokens': 10303, 'completion_tokens': 74, 'total_tokens': 10377}\n", - " 📋 airline_task_47: 0.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_45: {'prompt_tokens': 10672, 'completion_tokens': 107, 'total_tokens': 10779}\n", - " 📋 airline_task_45: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_38: {'prompt_tokens': 35391, 'completion_tokens': 495, 'total_tokens': 35886}\n", - " 📋 airline_task_38: 0.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_39: {'prompt_tokens': 77165, 'completion_tokens': 331, 'total_tokens': 77496}\n", - " 📋 airline_task_39: 1.0, total 4 assertions)\n", - " 📊 LLM Usage for airline_task_48: {'prompt_tokens': 10259, 'completion_tokens': 330, 'total_tokens': 10589}\n", - " 📋 airline_task_48: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_49: {'prompt_tokens': 10091, 'completion_tokens': 257, 'total_tokens': 10348}\n", - " 📋 airline_task_49: 1.0, total 1 assertions)\n", - " 📊 LLM Usage for airline_task_35: {'prompt_tokens': 32788, 'completion_tokens': 406, 'total_tokens': 33194}\n", - " 📋 airline_task_35: 0.0, total 3 assertions)\n", - " 📊 LLM Usage for airline_task_43: {'prompt_tokens': 43329, 'completion_tokens': 275, 'total_tokens': 43604}\n", - " 📋 airline_task_43: 0.0, total 2 assertions)\n", - " 📊 LLM Usage for airline_task_23: {'prompt_tokens': 50198, 'completion_tokens': 921, 'total_tokens': 51119}\n", - " 📋 airline_task_23: 0.0, total 8 assertions)\n", - " 📊 LLM Usage for airline_task_44: {'prompt_tokens': 41578, 'completion_tokens': 345, 'total_tokens': 41923}\n", - " 📋 airline_task_44: 0.0, total 5 assertions)\n", - "✅ Completed parallel evaluations in 17.52s\n", - "🧹 Closing 50 MCP sessions...\n", - "✅ All MCP sessions closed.\n", - "🛑 Stopping server on port 8000...\n", - "🧹 Cleaned up log file: 
/Users/derekxu/Documents/code/python-sdk/local_evals/server_output_airline_8000.log\n", - "\n", - "✅ Completed evaluations for 2 models\n", - "📊 Total results: 100\n", - "📊 Total trajectories: 100\n" - ] - } - ], - "source": [ - "all_results = []\n", - "all_evaluation_records = []\n", - "\n", - "for model_id, model_info in models_to_test.items():\n", - " model_results, evaluation_records = await run_model_evaluation(model_id, model_info, tau2_eval_dataset)\n", - " all_results.extend(model_results)\n", - " all_evaluation_records.extend(evaluation_records)\n", - "\n", - "print(f\"\\n✅ Completed evaluations for {len(models_to_test)} models\")\n", - "print(f\"📊 Total results: {len(all_results)}\")\n", - "print(f\"📊 Total evaluation records: {len(all_evaluation_records)}\")" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "🧹 Closing 50 MCP sessions...\n", + "✅ All MCP sessions closed.\n", + "✅ Completed 50 trajectories in 438.92s\n", + "🚀 Processing 50 trajectory evaluations in parallel...\n", + " 📊 LLM Usage for airline_task_6: {'prompt_tokens': 11809, 'completion_tokens': 439, 'total_tokens': 12248}\n", + " 📋 airline_task_6: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_1: {'prompt_tokens': 48521, 'completion_tokens': 465, 'total_tokens': 48986}\n", + " 📋 airline_task_1: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_0: {'prompt_tokens': 18067, 'completion_tokens': 255, 'total_tokens': 18322}\n", + " 📋 airline_task_0: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_10: {'prompt_tokens': 70113, 'completion_tokens': 1132, 'total_tokens': 71245}\n", + " 📋 airline_task_10: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_13: {'prompt_tokens': 73350, 'completion_tokens': 1136, 'total_tokens': 74486}\n", + " 📋 airline_task_13: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_5: {'prompt_tokens': 31643, 'completion_tokens': 416, 'total_tokens': 32059}\n", + " 📋 airline_task_5: 0.0, total 
2 assertions)\n", + " 📊 LLM Usage for airline_task_3: {'prompt_tokens': 18289, 'completion_tokens': 281, 'total_tokens': 18570}\n", + " 📋 airline_task_3: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_4: {'prompt_tokens': 47856, 'completion_tokens': 838, 'total_tokens': 48694}\n", + " 📋 airline_task_4: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_12: {'prompt_tokens': 32974, 'completion_tokens': 545, 'total_tokens': 33519}\n", + " 📋 airline_task_12: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_19: {'prompt_tokens': 31917, 'completion_tokens': 452, 'total_tokens': 32369}\n", + " 📋 airline_task_19: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_16: {'prompt_tokens': 34448, 'completion_tokens': 748, 'total_tokens': 35196}\n", + " 📋 airline_task_16: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_9: {'prompt_tokens': 48912, 'completion_tokens': 825, 'total_tokens': 49737}\n", + " 📋 airline_task_9: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_15: {'prompt_tokens': 56489, 'completion_tokens': 949, 'total_tokens': 57438}\n", + " 📋 airline_task_15: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_11: {'prompt_tokens': 32715, 'completion_tokens': 395, 'total_tokens': 33110}\n", + " 📋 airline_task_11: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_2: {'prompt_tokens': 64625, 'completion_tokens': 925, 'total_tokens': 65550}\n", + " 📋 airline_task_2: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_14: {'prompt_tokens': 105792, 'completion_tokens': 1297, 'total_tokens': 107089}\n", + " 📋 airline_task_14: 0.0, total 5 assertions)\n", + " 📊 LLM Usage for airline_task_7: {'prompt_tokens': 49692, 'completion_tokens': 518, 'total_tokens': 50210}\n", + " 📋 airline_task_7: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_26: {'prompt_tokens': 18258, 'completion_tokens': 498, 'total_tokens': 18756}\n", + " 📋 airline_task_26: 1.0, total 1 
assertions)\n", + " 📊 LLM Usage for airline_task_8: {'prompt_tokens': 56129, 'completion_tokens': 939, 'total_tokens': 57068}\n", + " 📋 airline_task_8: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_17: {'prompt_tokens': 57622, 'completion_tokens': 710, 'total_tokens': 58332}\n", + " 📋 airline_task_17: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_28: {'prompt_tokens': 18305, 'completion_tokens': 519, 'total_tokens': 18824}\n", + " 📋 airline_task_28: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_22: {'prompt_tokens': 102233, 'completion_tokens': 1630, 'total_tokens': 103863}\n", + " 📋 airline_task_22: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_20: {'prompt_tokens': 75987, 'completion_tokens': 1169, 'total_tokens': 77156}\n", + " 📋 airline_task_20: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_31: {'prompt_tokens': 17973, 'completion_tokens': 508, 'total_tokens': 18481}\n", + " 📋 airline_task_31: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_18: {'prompt_tokens': 290410, 'completion_tokens': 3217, 'total_tokens': 293627}\n", + " 📋 airline_task_18: 0.0, total 6 assertions)\n", + " 📊 LLM Usage for airline_task_27: {'prompt_tokens': 91697, 'completion_tokens': 900, 'total_tokens': 92597}\n", + " 📋 airline_task_27: 1.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_29: {'prompt_tokens': 45098, 'completion_tokens': 921, 'total_tokens': 46019}\n", + " 📋 airline_task_29: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_30: {'prompt_tokens': 34284, 'completion_tokens': 851, 'total_tokens': 35135}\n", + " 📋 airline_task_30: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_25: {'prompt_tokens': 34098, 'completion_tokens': 791, 'total_tokens': 34889}\n", + " 📋 airline_task_25: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_32: {'prompt_tokens': 25880, 'completion_tokens': 505, 'total_tokens': 26385}\n", + " 📋 airline_task_32: 
1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_21: {'prompt_tokens': 91852, 'completion_tokens': 1140, 'total_tokens': 92992}\n", + " 📋 airline_task_21: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_34: {'prompt_tokens': 33951, 'completion_tokens': 1036, 'total_tokens': 34987}\n", + " 📋 airline_task_34: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_24: {'prompt_tokens': 64616, 'completion_tokens': 1547, 'total_tokens': 66163}\n", + " 📋 airline_task_24: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_36: {'prompt_tokens': 27295, 'completion_tokens': 347, 'total_tokens': 27642}\n", + " 📋 airline_task_36: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_41: {'prompt_tokens': 66964, 'completion_tokens': 577, 'total_tokens': 67541}\n", + " 📋 airline_task_41: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_42: {'prompt_tokens': 86379, 'completion_tokens': 951, 'total_tokens': 87330}\n", + " 📋 airline_task_42: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_40: {'prompt_tokens': 11763, 'completion_tokens': 229, 'total_tokens': 11992}\n", + " 📋 airline_task_40: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_33: {'prompt_tokens': 43097, 'completion_tokens': 814, 'total_tokens': 43911}\n", + " 📋 airline_task_33: 1.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_45: {'prompt_tokens': 5584, 'completion_tokens': 62, 'total_tokens': 5646}\n", + " 📋 airline_task_45: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_46: {'prompt_tokens': 12057, 'completion_tokens': 322, 'total_tokens': 12379}\n", + " 📋 airline_task_46: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_23: {'prompt_tokens': 134261, 'completion_tokens': 2398, 'total_tokens': 136659}\n", + " 📋 airline_task_23: 1.0, total 8 assertions)\n", + " 📊 LLM Usage for airline_task_47: {'prompt_tokens': 17987, 'completion_tokens': 221, 'total_tokens': 18208}\n", + " 📋 
airline_task_47: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_38: {'prompt_tokens': 40105, 'completion_tokens': 852, 'total_tokens': 40957}\n", + " 📋 airline_task_38: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_35: {'prompt_tokens': 56123, 'completion_tokens': 1211, 'total_tokens': 57334}\n", + " 📋 airline_task_35: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_49: {'prompt_tokens': 11756, 'completion_tokens': 332, 'total_tokens': 12088}\n", + " 📋 airline_task_49: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_43: {'prompt_tokens': 56223, 'completion_tokens': 554, 'total_tokens': 56777}\n", + " 📋 airline_task_43: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_48: {'prompt_tokens': 18950, 'completion_tokens': 592, 'total_tokens': 19542}\n", + " 📋 airline_task_48: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_37: {'prompt_tokens': 50314, 'completion_tokens': 870, 'total_tokens': 51184}\n", + " 📋 airline_task_37: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_39: {'prompt_tokens': 100087, 'completion_tokens': 1072, 'total_tokens': 101159}\n", + " 📋 airline_task_39: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_44: {'prompt_tokens': 146859, 'completion_tokens': 1910, 'total_tokens': 148769}\n", + " 📋 airline_task_44: 0.0, total 5 assertions)\n", + "✅ Completed parallel evaluations in 16.19s\n", + "🧹 Closing 50 MCP sessions...\n", + "✅ All MCP sessions closed.\n", + "🛑 Stopping server on port 8000...\n", + "🧹 Cleaned up log file: /Users/derekxu/Documents/code/python-sdk/local_evals/server_output_airline_8000.log\n", + "\n", + "🧪 Starting evaluation for Kimi K2...\n", + "✅ Server started successfully on port 8000\n", + "📊 Created 50 environment sessions\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "raw" - } - }, - "source": [ - "## 5. 
Analyze Results\n", - "\n", - "Let's analyze and visualize the comparison between Claude 4 Opus, GPT-4.1, and Kimi K2.\n" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:eval_protocol.mcp.client.connection:Session 77f6847cd7f4eaa908955b68fc08b75e: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation L7X4P9 not found. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Control plane status endpoint timed out after 3.0s\n", + "WARNING:eval_protocol.mcp.client.connection:Session 0de216038acb0986989909c3b22b5373: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation 45698213 not found. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Session 0de216038acb0986989909c3b22b5373: Invalid JSON from get_reservation_details: Error executing tool get_reservation_details: Reservation Q7ZB34 not found. Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Control plane reward endpoint timed out after 3.0s\n", + "WARNING:eval_protocol.mcp.client.connection:Session 20dd3f68f9165c4cc4bd81aec770c9d4: Invalid JSON from update_reservation_flights: Error executing tool update_reservation_flights: Payment method not found. 
Error: Expecting value: line 1 column 1 (char 0)\n", + "WARNING:eval_protocol.mcp.client.connection:Control plane status endpoint timed out after 3.0s\n" + ] }, { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "📈 Summary Statistics:\n", - " Claude 4 Sonnet: 54.00% success rate (27.0/50) - Cost: $8.79 (via litellm)\n", - " Kimi K2: 46.00% success rate (23.0/50) - Cost: $1.14 (via manual_pricing)\n", - "\n", - "💰 Total evaluation cost: $9.93\n", - "📊 Cost calculation uses actual API usage data from LLMUsageStats\n" - ] - } - ], - "source": [ - "model_id_to_config = {}\n", - "for config_key, model_info in models_to_test.items():\n", - " actual_model_id = model_info[\"policy\"].model_id\n", - " model_id_to_config[actual_model_id] = model_info\n", - "\n", - "print(f\"\\n📈 Summary Statistics:\")\n", - "total_cost = 0.0\n", - "for actual_model_id, model_info in model_id_to_config.items():\n", - " model_results_subset = [r for r in all_results if r['model_id'] == actual_model_id]\n", - " avg_score = sum(r['score'] for r in model_results_subset) / len(model_results_subset) if model_results_subset else 0\n", - " \n", - " # Calculate total cost for this model\n", - " model_total_cost = sum(r['cost_info']['total_cost'] for r in model_results_subset if 'cost_info' in r)\n", - " total_cost += model_total_cost\n", - " \n", - " # Show cost source info\n", - " cost_sources = [r['cost_info'].get('cost_source', 'unknown') for r in model_results_subset if 'cost_info' in r]\n", - " cost_source_summary = f\" (via {cost_sources[0]})\" if cost_sources else \"\"\n", - " \n", - " print(f\" {model_info['name']}: {avg_score:.2%} success rate ({sum(r['score'] for r in model_results_subset)}/{len(model_results_subset)}) - Cost: ${model_total_cost:.2f}{cost_source_summary}\")\n", - "\n", - "print(f\"\\n💰 Total evaluation cost: ${total_cost:.2f}\")\n", - "print(f\"📊 Cost calculation uses 
actual API usage data from LLMUsageStats\")" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "🧹 Closing 50 MCP sessions...\n", + "✅ All MCP sessions closed.\n", + "✅ Completed 50 trajectories in 373.16s\n", + "🚀 Processing 50 trajectory evaluations in parallel...\n", + " 📊 LLM Usage for airline_task_0: {'prompt_tokens': 10394, 'completion_tokens': 348, 'total_tokens': 10742}\n", + " 📋 airline_task_0: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_1: {'prompt_tokens': 42103, 'completion_tokens': 192, 'total_tokens': 42295}\n", + " 📋 airline_task_1: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_6: {'prompt_tokens': 4932, 'completion_tokens': 38, 'total_tokens': 4970}\n", + " 📋 airline_task_6: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_13: {'prompt_tokens': 38788, 'completion_tokens': 663, 'total_tokens': 39451}\n", + " 📋 airline_task_13: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_10: {'prompt_tokens': 43693, 'completion_tokens': 366, 'total_tokens': 44059}\n", + " 📋 airline_task_10: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_5: {'prompt_tokens': 33828, 'completion_tokens': 479, 'total_tokens': 34307}\n", + " 📋 airline_task_5: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_3: {'prompt_tokens': 15963, 'completion_tokens': 230, 'total_tokens': 16193}\n", + " 📋 airline_task_3: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_12: {'prompt_tokens': 37553, 'completion_tokens': 577, 'total_tokens': 38130}\n", + " 📋 airline_task_12: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_4: {'prompt_tokens': 55540, 'completion_tokens': 610, 'total_tokens': 56150}\n", + " 📋 airline_task_4: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_19: {'prompt_tokens': 16020, 'completion_tokens': 118, 'total_tokens': 16138}\n", + " 📋 airline_task_19: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_2: {'prompt_tokens': 
37009, 'completion_tokens': 243, 'total_tokens': 37252}\n", + " 📋 airline_task_2: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_11: {'prompt_tokens': 4766, 'completion_tokens': 146, 'total_tokens': 4912}\n", + " 📋 airline_task_11: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_15: {'prompt_tokens': 57885, 'completion_tokens': 533, 'total_tokens': 58418}\n", + " 📋 airline_task_15: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_16: {'prompt_tokens': 30439, 'completion_tokens': 429, 'total_tokens': 30868}\n", + " 📋 airline_task_16: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_9: {'prompt_tokens': 35496, 'completion_tokens': 376, 'total_tokens': 35872}\n", + " 📋 airline_task_9: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_17: {'prompt_tokens': 49670, 'completion_tokens': 610, 'total_tokens': 50280}\n", + " 📋 airline_task_17: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_8: {'prompt_tokens': 49402, 'completion_tokens': 510, 'total_tokens': 49912}\n", + " 📋 airline_task_8: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_7: {'prompt_tokens': 51445, 'completion_tokens': 275, 'total_tokens': 51720}\n", + " 📋 airline_task_7: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_26: {'prompt_tokens': 15790, 'completion_tokens': 426, 'total_tokens': 16216}\n", + " 📋 airline_task_26: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_14: {'prompt_tokens': 59997, 'completion_tokens': 1031, 'total_tokens': 61028}\n", + " 📋 airline_task_14: 0.0, total 5 assertions)\n", + " 📊 LLM Usage for airline_task_21: {'prompt_tokens': 96182, 'completion_tokens': 603, 'total_tokens': 96785}\n", + " 📋 airline_task_21: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_22: {'prompt_tokens': 28531, 'completion_tokens': 143, 'total_tokens': 28674}\n", + " 📋 airline_task_22: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_25: {'prompt_tokens': 
10238, 'completion_tokens': 138, 'total_tokens': 10376}\n", + " 📋 airline_task_25: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_18: {'prompt_tokens': 85908, 'completion_tokens': 689, 'total_tokens': 86597}\n", + " 📋 airline_task_18: 1.0, total 6 assertions)\n", + " 📊 LLM Usage for airline_task_20: {'prompt_tokens': 42172, 'completion_tokens': 548, 'total_tokens': 42720}\n", + " 📋 airline_task_20: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_27: {'prompt_tokens': 80408, 'completion_tokens': 488, 'total_tokens': 80896}\n", + " 📋 airline_task_27: 1.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_30: {'prompt_tokens': 30393, 'completion_tokens': 396, 'total_tokens': 30789}\n", + " 📋 airline_task_30: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_24: {'prompt_tokens': 63523, 'completion_tokens': 731, 'total_tokens': 64254}\n", + " 📋 airline_task_24: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_34: {'prompt_tokens': 23049, 'completion_tokens': 485, 'total_tokens': 23534}\n", + " 📋 airline_task_34: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_28: {'prompt_tokens': 10066, 'completion_tokens': 360, 'total_tokens': 10426}\n", + " 📋 airline_task_28: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_31: {'prompt_tokens': 16464, 'completion_tokens': 226, 'total_tokens': 16690}\n", + " 📋 airline_task_31: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_36: {'prompt_tokens': 10772, 'completion_tokens': 202, 'total_tokens': 10974}\n", + " 📋 airline_task_36: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_40: {'prompt_tokens': 10333, 'completion_tokens': 114, 'total_tokens': 10447}\n", + " 📋 airline_task_40: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_33: {'prompt_tokens': 30350, 'completion_tokens': 393, 'total_tokens': 30743}\n", + " 📋 airline_task_33: 1.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_29: 
{'prompt_tokens': 38672, 'completion_tokens': 501, 'total_tokens': 39173}\n", + " 📋 airline_task_29: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_41: {'prompt_tokens': 49179, 'completion_tokens': 227, 'total_tokens': 49406}\n", + " 📋 airline_task_41: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_32: {'prompt_tokens': 22825, 'completion_tokens': 297, 'total_tokens': 23122}\n", + " 📋 airline_task_32: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_42: {'prompt_tokens': 84720, 'completion_tokens': 491, 'total_tokens': 85211}\n", + " 📋 airline_task_42: 1.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_37: {'prompt_tokens': 61432, 'completion_tokens': 572, 'total_tokens': 62004}\n", + " 📋 airline_task_37: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_46: {'prompt_tokens': 5083, 'completion_tokens': 83, 'total_tokens': 5166}\n", + " 📋 airline_task_46: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_47: {'prompt_tokens': 10303, 'completion_tokens': 74, 'total_tokens': 10377}\n", + " 📋 airline_task_47: 0.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_45: {'prompt_tokens': 10672, 'completion_tokens': 107, 'total_tokens': 10779}\n", + " 📋 airline_task_45: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_38: {'prompt_tokens': 35391, 'completion_tokens': 495, 'total_tokens': 35886}\n", + " 📋 airline_task_38: 0.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_39: {'prompt_tokens': 77165, 'completion_tokens': 331, 'total_tokens': 77496}\n", + " 📋 airline_task_39: 1.0, total 4 assertions)\n", + " 📊 LLM Usage for airline_task_48: {'prompt_tokens': 10259, 'completion_tokens': 330, 'total_tokens': 10589}\n", + " 📋 airline_task_48: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_49: {'prompt_tokens': 10091, 'completion_tokens': 257, 'total_tokens': 10348}\n", + " 📋 airline_task_49: 1.0, total 1 assertions)\n", + " 📊 LLM Usage for airline_task_35: 
{'prompt_tokens': 32788, 'completion_tokens': 406, 'total_tokens': 33194}\n", + " 📋 airline_task_35: 0.0, total 3 assertions)\n", + " 📊 LLM Usage for airline_task_43: {'prompt_tokens': 43329, 'completion_tokens': 275, 'total_tokens': 43604}\n", + " 📋 airline_task_43: 0.0, total 2 assertions)\n", + " 📊 LLM Usage for airline_task_23: {'prompt_tokens': 50198, 'completion_tokens': 921, 'total_tokens': 51119}\n", + " 📋 airline_task_23: 0.0, total 8 assertions)\n", + " 📊 LLM Usage for airline_task_44: {'prompt_tokens': 41578, 'completion_tokens': 345, 'total_tokens': 41923}\n", + " 📋 airline_task_44: 0.0, total 5 assertions)\n", + "✅ Completed parallel evaluations in 17.52s\n", + "🧹 Closing 50 MCP sessions...\n", + "✅ All MCP sessions closed.\n", + "🛑 Stopping server on port 8000...\n", + "🧹 Cleaned up log file: /Users/derekxu/Documents/code/python-sdk/local_evals/server_output_airline_8000.log\n", + "\n", + "✅ Completed evaluations for 2 models\n", + "📊 Total results: 100\n", + "📊 Total trajectories: 100\n" + ] + } + ], + "source": [ + "all_results = []\n", + "all_evaluation_records = []\n", + "\n", + "for model_id, model_info in models_to_test.items():\n", + " model_results, evaluation_records = await run_model_evaluation(model_id, model_info, tau2_eval_dataset)\n", + " all_results.extend(model_results)\n", + " all_evaluation_records.extend(evaluation_records)\n", + "\n", + "print(f\"\\n✅ Completed evaluations for {len(models_to_test)} models\")\n", + "print(f\"📊 Total results: {len(all_results)}\")\n", + "print(f\"📊 Total evaluation records: {len(all_evaluation_records)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "raw" + } + }, + "source": [ + "## 5. 
Analyze Results\n", + "\n", + "Let's analyze and visualize the comparison between Claude 4 Opus, GPT-4.1, and Kimi K2.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📄 Saved JSONL file: trajectory_outputs/all_trajectories.jsonl\n" - ] - }, - { - "data": { - "text/plain": [ - "PosixPath('trajectory_outputs/all_trajectories.jsonl')" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def save_results_jsonl(evaluation_records: List[Dict], output_file: str = \"evaluation_outputs/all_evaluations.jsonl\"):\n", - " \"\"\"Save all evaluation records in JSONL format (one JSON object per line).\"\"\"\n", - " output_path = Path(output_file)\n", - " output_path.parent.mkdir(exist_ok=True)\n", - " \n", - " with open(output_path, 'w') as f:\n", - " for record in evaluation_records:\n", - " json.dump(record, f, default=str)\n", - " f.write('\\n')\n", - " \n", - " print(f\"📄 Saved JSONL file: {output_path}\")\n", - " return output_path\n", - "\n", - "save_results_jsonl(all_evaluation_records)" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📈 Summary Statistics:\n", + " Claude 4 Sonnet: 54.00% success rate (27.0/50) - Cost: $8.79 (via litellm)\n", + " Kimi K2: 46.00% success rate (23.0/50) - Cost: $1.14 (via manual_pricing)\n", + "\n", + "💰 Total evaluation cost: $9.93\n", + "📊 Cost calculation uses actual API usage data from LLMUsageStats\n" + ] + } + ], + "source": [ + "model_id_to_config = {}\n", + "for config_key, model_info in models_to_test.items():\n", + " actual_model_id = model_info[\"policy\"].model_id\n", + " model_id_to_config[actual_model_id] = model_info\n", + "\n", + "print(\"\\n📈 Summary Statistics:\")\n", + "total_cost = 0.0\n", + "for actual_model_id, model_info in 
model_id_to_config.items():\n", + " model_results_subset = [r for r in all_results if r[\"model_id\"] == actual_model_id]\n", + " avg_score = (\n", + " sum(r[\"score\"] for r in model_results_subset) / len(model_results_subset) if model_results_subset else 0\n", + " )\n", + "\n", + " # Calculate total cost for this model\n", + " model_total_cost = sum(r[\"cost_info\"][\"total_cost\"] for r in model_results_subset if \"cost_info\" in r)\n", + " total_cost += model_total_cost\n", + "\n", + " # Show cost source info\n", + " cost_sources = [r[\"cost_info\"].get(\"cost_source\", \"unknown\") for r in model_results_subset if \"cost_info\" in r]\n", + " cost_source_summary = f\" (via {cost_sources[0]})\" if cost_sources else \"\"\n", + "\n", + " print(\n", + " f\" {model_info['name']}: {avg_score:.2%} success rate ({sum(r['score'] for r in model_results_subset)}/{len(model_results_subset)}) - Cost: ${model_total_cost:.2f}{cost_source_summary}\"\n", + " )\n", + "\n", + "print(f\"\\n💰 Total evaluation cost: ${total_cost:.2f}\")\n", + "print(\"📊 Cost calculation uses actual API usage data from LLMUsageStats\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "📁 Saved trajectory files to: trajectory_outputs\n", - " - 100 individual trajectory files\n", - " - 1 evaluation summary file\n" - ] - }, - { - "data": { - "text/plain": [ - "PosixPath('trajectory_outputs')" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def save_evaluation_files(evaluation_records: List[Dict], output_dir: str = \"evaluation_outputs\"):\n", - " \"\"\"Save evaluation records to individual files and create summary.\"\"\"\n", - " output_path = Path(output_dir)\n", - " output_path.mkdir(exist_ok=True)\n", - " \n", - " # Save individual 
evaluation files\n", - " for record in evaluation_records:\n", - " # Sanitize model_id for filename (replace slashes with underscores)\n", - " safe_model_id = record['model_id'].replace('/', '_').replace('\\\\', '_')\n", - " filename = f\"{safe_model_id}_{record['scenario_id']}_evaluation.json\"\n", - " filepath = output_path / filename\n", - " \n", - " with open(filepath, 'w') as f:\n", - " json.dump(record, f, indent=2, default=str)\n", - " \n", - " # Create summary file\n", - " summary = {\n", - " \"evaluation_summary\": {\n", - " \"total_evaluations\": len(evaluation_records),\n", - " \"models_evaluated\": list(set(r['model_id'] for r in evaluation_records)),\n", - " \"scenarios_evaluated\": list(set(r['scenario_id'] for r in evaluation_records)),\n", - " \"timestamp\": datetime.now().isoformat(),\n", - " },\n", - " \"model_performance\": {},\n", - " \"scenario_difficulty\": {}\n", - " }\n", - " \n", - " # Calculate model performance\n", - " for model_id in summary[\"evaluation_summary\"][\"models_evaluated\"]:\n", - " model_records = [r for r in evaluation_records if r['model_id'] == model_id]\n", - " total_score = sum(r['evaluation']['score'] for r in model_records)\n", - " avg_score = total_score / len(model_records) if model_records else 0\n", - " \n", - " # Calculate cost metrics\n", - " total_cost = sum(r.get('cost_info', {}).get('total_cost', 0) for r in model_records)\n", - " total_tokens = sum(r.get('cost_info', {}).get('total_tokens', 0) for r in model_records)\n", - " avg_cost_per_scenario = total_cost / len(model_records) if model_records else 0\n", - " \n", - " summary[\"model_performance\"][model_id] = {\n", - " \"total_scenarios\": len(model_records),\n", - " \"total_score\": total_score,\n", - " \"average_score\": avg_score,\n", - " \"pass_rate\": avg_score, # Since scores are 0 or 1\n", - " \"total_cost\": total_cost,\n", - " \"average_cost_per_scenario\": avg_cost_per_scenario,\n", - " \"total_tokens\": total_tokens,\n", - " 
\"cost_per_success\": total_cost / total_score if total_score > 0 else 0\n", - " }\n", - " \n", - " # Calculate scenario difficulty\n", - " for scenario_id in summary[\"evaluation_summary\"][\"scenarios_evaluated\"]:\n", - " scenario_records = [r for r in evaluation_records if r['scenario_id'] == scenario_id]\n", - " total_score = sum(r['evaluation']['score'] for r in scenario_records)\n", - " avg_score = total_score / len(scenario_records) if scenario_records else 0\n", - " \n", - " summary[\"scenario_difficulty\"][scenario_id] = {\n", - " \"models_tested\": len(scenario_records),\n", - " \"total_score\": total_score,\n", - " \"average_score\": avg_score,\n", - " \"difficulty\": \"easy\" if avg_score > 0.8 else \"medium\" if avg_score > 0.5 else \"hard\"\n", - " }\n", - " \n", - " # Save summary\n", - " summary_path = output_path / \"evaluation_summary.json\"\n", - " with open(summary_path, 'w') as f:\n", - " json.dump(summary, f, indent=2, default=str)\n", - " \n", - " print(f\"\\n📁 Saved evaluation files to: {output_path}\")\n", - " print(f\" - {len(evaluation_records)} individual evaluation files\")\n", - " print(f\" - 1 evaluation summary file\")\n", - " \n", - " return output_path\n", - "\n", - "save_evaluation_files(all_evaluation_records)" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Saved JSONL file: trajectory_outputs/all_trajectories.jsonl\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "## 7. 
Share Results with Firectl\n", - "\n", - "Finally, let's create a dataset with our evaluation results to share using `firectl create dataset`.\n" + "data": { + "text/plain": [ + "PosixPath('trajectory_outputs/all_trajectories.jsonl')" ] - }, + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def save_results_jsonl(evaluation_records: List[Dict], output_file: str = \"evaluation_outputs/all_evaluations.jsonl\"):\n", + " \"\"\"Save all evaluation records in JSONL format (one JSON object per line).\"\"\"\n", + " output_path = Path(output_file)\n", + " output_path.parent.mkdir(exist_ok=True)\n", + "\n", + " with open(output_path, \"w\") as f:\n", + " for record in evaluation_records:\n", + " json.dump(record, f, default=str)\n", + " f.write(\"\\n\")\n", + "\n", + " print(f\"📄 Saved JSONL file: {output_path}\")\n", + " return output_path\n", + "\n", + "\n", + "save_results_jsonl(all_evaluation_records)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📁 Saved trajectory files to: trajectory_outputs\n", + " - 100 individual trajectory files\n", + " - 1 evaluation summary file\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "source": [ - "## Summary\n", - "\n", - "This notebook provides a complete eval harness for comparing models using tau2-bench airline evaluation with proper dataset structure:\n", - "\n", - "1. **Dataset Structure**: Following tau2-bench pattern with separate JSON datasets and markdown system prompts\n", - "2. **Models**: Configured Claude 4 Sonnet (AnthropicPolicy) and Kimi K2 (FireworksPolicy)\n", - "3. 
**Evaluation**: Used tau2-bench NLAssertionsEvaluator for objective scoring with EvaluationRow format\n", - "4. **Analysis**: Compared performance across multiple dimensions\n", - "5. **Sharing**: Prepared results for sharing via `firectl create dataset`\n", - "\n", - "### Key Features:\n", - "- **Clean Dataset Structure**: Separate JSON data and markdown prompts like the tau2 examples\n", - "- **Natural Language Evaluation**: Uses human-readable assertions instead of code-based metrics\n", - "- **Multi-Model Comparison**: Easy to add more models for comparison\n", - "- **Comprehensive Analysis**: Performance, accuracy, and efficiency metrics with cost tracking\n", - "- **EvaluationRow Support**: Updated to work with the new EvaluationRow format from eval_protocol\n", - "- **Reproducible**: Results can be shared and reproduced via firectl\n", - "\n", - "### Next Steps:\n", - "1. Set your API keys as environment variables:\n", - " ```bash\n", - " export ANTHROPIC_API_KEY=\"your-anthropic-key-here\"\n", - " export OPENAI_API_KEY=\"your-openai-key-here\"\n", - " export FIREWORKS_API_KEY=\"your-fireworks-key-here\"\n", - " ```\n", - "2. Start the tau2 MCP server: `cd examples/tau2_mcp && python server.py --port 8000`\n", - "3. Run the evaluation cells\n", - "4. Share results with the community using the provided firectl command\n", - "\n", - "### Expected Results:\n", - "Based on the tau2-bench framework, we expect different models to show varying performance on natural language assertion evaluation, demonstrating their ability to adhere to airline policy compliance and customer service protocols.\n", - "\n", - "This structure uses the updated EvaluationRow format and provides comprehensive cost analysis across different model providers." 
+ "data": { + "text/plain": [ + "PosixPath('trajectory_outputs')" ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.11" + ], + "source": [ + "def save_evaluation_files(evaluation_records: List[Dict], output_dir: str = \"evaluation_outputs\"):\n", + " \"\"\"Save evaluation records to individual files and create summary.\"\"\"\n", + " output_path = Path(output_dir)\n", + " output_path.mkdir(exist_ok=True)\n", + "\n", + " # Save individual evaluation files\n", + " for record in evaluation_records:\n", + " # Sanitize model_id for filename (replace slashes with underscores)\n", + " safe_model_id = record[\"model_id\"].replace(\"/\", \"_\").replace(\"\\\\\", \"_\")\n", + " filename = f\"{safe_model_id}_{record['scenario_id']}_evaluation.json\"\n", + " filepath = output_path / filename\n", + "\n", + " with open(filepath, \"w\") as f:\n", + " json.dump(record, f, indent=2, default=str)\n", + "\n", + " # Create summary file\n", + " summary = {\n", + " \"evaluation_summary\": {\n", + " \"total_evaluations\": len(evaluation_records),\n", + " \"models_evaluated\": list(set(r[\"model_id\"] for r in evaluation_records)),\n", + " \"scenarios_evaluated\": list(set(r[\"scenario_id\"] for r in evaluation_records)),\n", + " \"timestamp\": datetime.now().isoformat(),\n", + " },\n", + " \"model_performance\": {},\n", + " \"scenario_difficulty\": {},\n", + " }\n", + "\n", + " # Calculate model performance\n", + " for model_id in summary[\"evaluation_summary\"][\"models_evaluated\"]:\n", + " model_records = [r for r in evaluation_records if r[\"model_id\"] == model_id]\n", + " total_score = 
sum(r[\"evaluation\"][\"score\"] for r in model_records)\n", + " avg_score = total_score / len(model_records) if model_records else 0\n", + "\n", + " # Calculate cost metrics\n", + " total_cost = sum(r.get(\"cost_info\", {}).get(\"total_cost\", 0) for r in model_records)\n", + " total_tokens = sum(r.get(\"cost_info\", {}).get(\"total_tokens\", 0) for r in model_records)\n", + " avg_cost_per_scenario = total_cost / len(model_records) if model_records else 0\n", + "\n", + " summary[\"model_performance\"][model_id] = {\n", + " \"total_scenarios\": len(model_records),\n", + " \"total_score\": total_score,\n", + " \"average_score\": avg_score,\n", + " \"pass_rate\": avg_score, # Since scores are 0 or 1\n", + " \"total_cost\": total_cost,\n", + " \"average_cost_per_scenario\": avg_cost_per_scenario,\n", + " \"total_tokens\": total_tokens,\n", + " \"cost_per_success\": total_cost / total_score if total_score > 0 else 0,\n", + " }\n", + "\n", + " # Calculate scenario difficulty\n", + " for scenario_id in summary[\"evaluation_summary\"][\"scenarios_evaluated\"]:\n", + " scenario_records = [r for r in evaluation_records if r[\"scenario_id\"] == scenario_id]\n", + " total_score = sum(r[\"evaluation\"][\"score\"] for r in scenario_records)\n", + " avg_score = total_score / len(scenario_records) if scenario_records else 0\n", + "\n", + " summary[\"scenario_difficulty\"][scenario_id] = {\n", + " \"models_tested\": len(scenario_records),\n", + " \"total_score\": total_score,\n", + " \"average_score\": avg_score,\n", + " \"difficulty\": \"easy\" if avg_score > 0.8 else \"medium\" if avg_score > 0.5 else \"hard\",\n", + " }\n", + "\n", + " # Save summary\n", + " summary_path = output_path / \"evaluation_summary.json\"\n", + " with open(summary_path, \"w\") as f:\n", + " json.dump(summary, f, indent=2, default=str)\n", + "\n", + " print(f\"\\n📁 Saved evaluation files to: {output_path}\")\n", + " print(f\" - {len(evaluation_records)} individual evaluation files\")\n", + " print(\" - 
1 evaluation summary file\")\n", + "\n", + " return output_path\n", + "\n", + "\n", + "save_evaluation_files(all_evaluation_records)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## 7. Share Results with Firectl\n", + "\n", + "Finally, let's create a dataset with our evaluation results to share using `firectl create dataset`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" } + }, + "source": [ + "## Summary\n", + "\n", + "This notebook provides a complete eval harness for comparing models using tau2-bench airline evaluation with proper dataset structure:\n", + "\n", + "1. **Dataset Structure**: Following tau2-bench pattern with separate JSON datasets and markdown system prompts\n", + "2. **Models**: Configured Claude 4 Sonnet (AnthropicPolicy) and Kimi K2 (FireworksPolicy)\n", + "3. **Evaluation**: Used tau2-bench NLAssertionsEvaluator for objective scoring with EvaluationRow format\n", + "4. **Analysis**: Compared performance across multiple dimensions\n", + "5. **Sharing**: Prepared results for sharing via `firectl create dataset`\n", + "\n", + "### Key Features:\n", + "- **Clean Dataset Structure**: Separate JSON data and markdown prompts like the tau2 examples\n", + "- **Natural Language Evaluation**: Uses human-readable assertions instead of code-based metrics\n", + "- **Multi-Model Comparison**: Easy to add more models for comparison\n", + "- **Comprehensive Analysis**: Performance, accuracy, and efficiency metrics with cost tracking\n", + "- **EvaluationRow Support**: Updated to work with the new EvaluationRow format from eval_protocol\n", + "- **Reproducible**: Results can be shared and reproduced via firectl\n", + "\n", + "### Next Steps:\n", + "1. 
Set your API keys as environment variables:\n", + " ```bash\n", + " export ANTHROPIC_API_KEY=\"your-anthropic-key-here\"\n", + " export OPENAI_API_KEY=\"your-openai-key-here\"\n", + " export FIREWORKS_API_KEY=\"your-fireworks-key-here\"\n", + " ```\n", + "2. Start the tau2 MCP server: `cd examples/tau2_mcp && python server.py --port 8000`\n", + "3. Run the evaluation cells\n", + "4. Share results with the community using the provided firectl command\n", + "\n", + "### Expected Results:\n", + "Based on the tau2-bench framework, we expect different models to show varying performance on natural language assertion evaluation, demonstrating their ability to adhere to airline policy compliance and customer service protocols.\n", + "\n", + "This structure uses the updated EvaluationRow format and provides comprehensive cost analysis across different model providers." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 2 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/mypy.ini b/mypy.ini deleted file mode 100644 index 182ca82b..00000000 --- a/mypy.ini +++ /dev/null @@ -1,37 +0,0 @@ -[mypy] -# Global options -python_version = 3.10 -follow_imports = skip -explicit_package_bases = True -warn_return_any = False -warn_unused_configs = True -disallow_untyped_defs = False -disallow_incomplete_defs = False -check_untyped_defs = True -disallow_untyped_decorators = False -no_implicit_optional = True -strict_optional = True -ignore_missing_imports = True -disable_error_code = import-not-found, truthy-function, no-redef, assignment, union-attr, attr-defined, arg-type, method-assign, misc, return-value, var-annotated, 
operator, call-arg, index - -[mypy.plugins.pydantic.*] -follow_imports = skip - -# Specific package options -[mypy.eval_protocol.*] -# Be less strict when type checking Eval Protocol code -disallow_untyped_defs = False -disallow_incomplete_defs = False - -# Third-party packages -[mypy.numpy.*] -ignore_missing_imports = True - -[mypy.requests.*] -ignore_missing_imports = True - -[mypy.pytest.*] -ignore_missing_imports = True - -[mypy.docker.*] -ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index 73105fd5..30380dd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,11 +62,8 @@ dev = [ "pytest-asyncio", "pytest-httpserver", "werkzeug>=2.0.0", - "black>=21.5b2", - "isort>=5.0.0", - "mypy>=0.812", - "flake8>=3.9.2", - "autopep8>=1.5.0", + "ruff>=0.5.0", + "pyright>=1.1.365", "transformers>=4.0.0", "types-setuptools", "types-requests", @@ -109,18 +106,10 @@ huggingface = [ "datasets>=2.0.0", "transformers>=4.0.0", ] -bigquery = [ - "google-cloud-bigquery>=3.0.0", - "google-auth>=2.0.0", - "google-auth-oauthlib>=1.0.0", -] adapters = [ "langfuse>=2.0.0", "datasets>=2.0.0", "transformers>=4.0.0", - "google-cloud-bigquery>=3.0.0", - "google-auth>=2.0.0", - "google-auth-oauthlib>=1.0.0", ] svgbench = [ "selenium>=4.0.0", @@ -157,12 +146,7 @@ versionfile_build = "eval_protocol/_version.py" tag_prefix = "v" parentdir_prefix = "eval-protocol-" -[tool.black] -line-length = 119 - -[tool.isort] -profile = "black" -line_length = 119 +### Black and isort removed; Ruff provides formatting and imports [tool.uv.sources] tau2 = { git = "https://github.com/sierra-research/tau2-bench.git" } @@ -174,3 +158,64 @@ dev = [ "haikus==0.3.8", "pytest>=8.4.1", ] + +[tool.ruff] +line-length = 119 +target-version = "py310" +exclude = ["vite-app", "vendor", "local_evals"] + +[tool.ruff.lint] +# Relax: only enforce core errors (E/F); drop W/I for now +select = ["E", "F"] +# Preserve current flake8 ignore behavior to minimize churn +ignore = ["E203", "E402", "E501", 
"F401"] + +# Suppress noisy rules in tests/examples/vendor where readability patterns differ +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["F841", "E712", "E731", "F821"] +"examples/**/*.py" = ["F841", "E712", "E731", "F821"] +"eval_protocol/pytest/**/*.py" = ["F841", "E712", "E731", "F811"] +"vendor/**" = ["F841", "E712", "E731", "F811"] +"development/**/*.py" = ["F841", "E712", "E731"] +"eval_protocol/**/*.py" = ["F841", "F811", "E731", "E721"] + +[tool.ruff.lint.isort] +known-first-party = ["eval_protocol"] +combine-as-imports = true + +[tool.pyright] +typeCheckingMode = "basic" +pythonVersion = "3.10" +reportMissingImports = "none" +reportMissingTypeStubs = "none" +reportMissingModuleSource = "none" +include = ["eval_protocol", "examples", "tests"] +exclude = ["vite-app", "vendor"] +# Ignore diagnostics for vendored generator code +ignore = ["versioneer.py"] +# Relax noisy diagnostics commonly triggered in tests and dynamic libs +reportAttributeAccessIssue = "none" +reportCallIssue = "none" +reportUnknownMemberType = "none" +reportUnknownVariableType = "none" +reportPossiblyUnboundVariable = "none" +# Additional suppressions per request +reportOptionalMemberAccess = "none" +reportIndexIssue = "none" +reportReturnType = "none" +reportOptionalCall = "none" +reportGeneralTypeIssues = "none" +reportOperatorIssue = "none" +reportOptionalSubscript = "none" +reportUnsupportedDunderAll = "none" +reportOptionalContextManager = "none" +reportInvalidTypeForm = "none" +reportRedeclaration = "none" +reportUndefinedVariable = "none" +reportPrivateImportUsage = "none" +reportOptionalIterable = "none" +# Make incompatibilities and argument types warnings instead of errors for now +# and suppress warnings output entirely +reportIncompatibleVariableOverride = "none" +reportArgumentType = "none" +reportAssignmentType = "none" diff --git a/scripts/create_sample_gsm8k_jsonl.py b/scripts/create_sample_gsm8k_jsonl.py index 0bc78e46..8e561cb2 100644 --- 
a/scripts/create_sample_gsm8k_jsonl.py +++ b/scripts/create_sample_gsm8k_jsonl.py @@ -54,7 +54,7 @@ def create_sample_jsonl(): if samples_written > 0: print(f"Successfully wrote {samples_written} samples to {output_filepath}") else: - print(f"No samples were written. Check dataset loading and content.") + print("No samples were written. Check dataset loading and content.") if __name__ == "__main__": diff --git a/tests/cli_commands/test_deploy_cmd.py b/tests/cli_commands/test_deploy_cmd.py index 65e6471c..fbd38ae8 100644 --- a/tests/cli_commands/test_deploy_cmd.py +++ b/tests/cli_commands/test_deploy_cmd.py @@ -51,7 +51,6 @@ def mock_gcp_tools(): patch("eval_protocol.cli_commands.deploy.deploy_to_cloud_run") as mock_deploy_run, patch("eval_protocol.cli_commands.deploy.ensure_gcp_secret") as mock_ensure_gcp_secret, ): - mock_ensure_repo.return_value = True mock_gen_dockerfile.return_value = "DOCKERFILE CONTENT" mock_build_push.return_value = True @@ -67,7 +66,6 @@ def mock_gcp_tools(): class TestDeployCommandRemoteUrl: - @patch("eval_protocol.cli_commands.deploy.create_evaluation") def test_deploy_remote_url_success(self, mock_create_evaluation_call, mock_check_environment, capsys): """Test successful registration of a remote URL via create_evaluation.""" @@ -167,11 +165,10 @@ def test_deploy_remote_url_unexpected_error(self, mock_create_eval, mock_check_e captured = capsys.readouterr() # Updated error message to match common registration block - assert f"An unexpected error occurred during Fireworks AI registration: Something broke" in captured.out + assert "An unexpected error occurred during Fireworks AI registration: Something broke" in captured.out class TestDeployCommandLocalMode: # This class tests the "fireworks" target (packaging metrics) - @patch("eval_protocol.cli_commands.deploy.create_evaluation") def test_deploy_local_mode_success( # Renaming to reflect it tests "fireworks" target self, mock_create_eval, mock_check_environment, capsys @@ -276,7 +273,7 
@@ def test_deploy_gcp_mode_success( captured = capsys.readouterr() # Check initial message from helper assert f"Starting GCP Cloud Run deployment for evaluator '{args.id}'..." in captured.out - assert f"Successfully built and pushed Docker image" in captured.out + assert "Successfully built and pushed Docker image" in captured.out assert ( f"Successfully deployed to Cloud Run. Service URL: {mock_gcp_tools['deploy_run'].return_value}" in captured.out diff --git a/tests/cli_commands/test_preview_cmd.py b/tests/cli_commands/test_preview_cmd.py index ec4ffa9e..20cf0416 100644 --- a/tests/cli_commands/test_preview_cmd.py +++ b/tests/cli_commands/test_preview_cmd.py @@ -53,7 +53,6 @@ def create_temp_jsonl(tmp_path: Path, samples_data: list) -> str: class TestPreviewCommandRemoteUrl: - @patch("requests.post") def test_preview_remote_url_success_with_file(self, mock_post, mock_check_environment, tmp_path, capsys): mock_response = MagicMock() diff --git a/tests/conftest.py b/tests/conftest.py index 6a3526a7..9c93cbf8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import sys from pathlib import Path + import pytest # Add the project root to the Python path diff --git a/tests/eval_protocol_tests/test_eval_protocol_simple.py b/tests/eval_protocol_tests/test_eval_protocol_simple.py index a8e127a6..4068cd3c 100644 --- a/tests/eval_protocol_tests/test_eval_protocol_simple.py +++ b/tests/eval_protocol_tests/test_eval_protocol_simple.py @@ -24,7 +24,6 @@ def test_basic_imports(): print(" ✓ eval_protocol imported successfully") print(" Importing eval_protocol...") - import eval_protocol print(" ✓ eval_protocol imported successfully") @@ -208,7 +207,7 @@ def main(): print(f"Test {test.__name__} crashed: {e}") failed += 1 - print(f"\n=== Results ===") + print("\n=== Results ===") print(f"Passed: {passed}") print(f"Failed: {failed}") print(f"Total: {passed + failed}") diff --git a/tests/eval_protocol_tests/test_minimal_structure.py 
b/tests/eval_protocol_tests/test_minimal_structure.py index aff78bff..c083e28f 100644 --- a/tests/eval_protocol_tests/test_minimal_structure.py +++ b/tests/eval_protocol_tests/test_minimal_structure.py @@ -267,7 +267,7 @@ def main(): print(f"Test {test.__name__} crashed: {e}") failed += 1 - print(f"\n=== Results ===") + print("\n=== Results ===") print(f"Passed: {passed}") print(f"Failed: {failed}") print(f"Total: {passed + failed}") diff --git a/tests/execution/test_pipeline.py b/tests/execution/test_pipeline.py index 9dba6b57..96ba8ef0 100644 --- a/tests/execution/test_pipeline.py +++ b/tests/execution/test_pipeline.py @@ -7,9 +7,9 @@ from eval_protocol.execution.pipeline import EvaluationPipeline from eval_protocol.generation.cache import ResponseCache -from eval_protocol.generation.clients import GenerationResult # Import GenerationResult from eval_protocol.generation.clients import ( # For type hinting and mocking FireworksModelClient, + GenerationResult, # Import GenerationResult ) from eval_protocol.models import EvaluateResult, Message, MetricResult diff --git a/tests/mcp_agent/orchestration/test_local_docker_client.py b/tests/mcp_agent/orchestration/test_local_docker_client.py index 1ecd3be7..13edefba 100644 --- a/tests/mcp_agent/orchestration/test_local_docker_client.py +++ b/tests/mcp_agent/orchestration/test_local_docker_client.py @@ -160,7 +160,6 @@ async def test_provision_deprovision_http_instance( "eval_protocol.mcp_agent.orchestration.local_docker_client.streamablehttp_client", new_callable=MagicMock, ) as mock_streamablehttp_client_func: - # Configure the mock Async Context Manager (ACM) that mock_streamablehttp_client_func will return mock_acm_instance = AsyncMock() # This object needs __aenter__ and __aexit__ @@ -327,7 +326,7 @@ async def test_provision_deprovision_stdio_instance( assert len(read_content_list) == 1 assert read_content_list[0].get("type") == "text" assert read_content_list[0].get("text") == test_file_content - 
logger.info(f"Successfully called 'read_file' via stdio, content matches template.") + logger.info("Successfully called 'read_file' via stdio, content matches template.") finally: if provisioned_instances: diff --git a/tests/mcp_agent/test_rl_filesystem_scenario.py b/tests/mcp_agent/test_rl_filesystem_scenario.py index cb9f70bb..4a394793 100644 --- a/tests/mcp_agent/test_rl_filesystem_scenario.py +++ b/tests/mcp_agent/test_rl_filesystem_scenario.py @@ -127,7 +127,7 @@ async def main(): if not fs_instance_id: raise ValueError(f"Instance ID not found for filesystem instance #{i}") - logger.info(f"\n--- Testing Filesystem Instance #{i+1} (ID: {fs_instance_id}) ---") + logger.info(f"\n--- Testing Filesystem Instance #{i + 1} (ID: {fs_instance_id}) ---") # --- Verify Initial State --- logger.info(f"[{fs_instance_id}] Verifying initial state...") diff --git a/tests/pytest/data/basic_coding_dataset.jsonl b/tests/pytest/data/basic_coding_dataset.jsonl index 27573c1b..fc25abcd 100644 --- a/tests/pytest/data/basic_coding_dataset.jsonl +++ b/tests/pytest/data/basic_coding_dataset.jsonl @@ -7,4 +7,4 @@ {"prompt": "Write a Python function `multiply_by_two` that takes an integer and returns the integer multiplied by 2.", "input": "10", "expected_output": "20"} {"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[1, 2, 3]", "expected_output": "3"} {"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "[]", "expected_output": "0"} -{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"} \ No newline at end of file +{"prompt": "Write a Python function `get_length` that takes a list and returns its length.", "input": "['a', 'b', 'c', 'd']", "expected_output": "4"} diff --git a/tests/pytest/data/lunar_lander_dataset.jsonl b/tests/pytest/data/lunar_lander_dataset.jsonl index af396fc1..a3de90c6 
100644 --- a/tests/pytest/data/lunar_lander_dataset.jsonl +++ b/tests/pytest/data/lunar_lander_dataset.jsonl @@ -1,3 +1,3 @@ {"id": "multi_env_test_001", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -10.0, "enable_wind": false, "seed": 42}} {"id": "multi_env_test_002", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -8.0, "enable_wind": false, "seed": 123}} -{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. 
You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}} \ No newline at end of file +{"id": "multi_env_test_003", "system_prompt": "You are controlling a lunar lander spacecraft. Use the lander_action tool with actions: NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT. Your goal is to land safely on the moon between the two flags without crashing.", "user_prompt_template": "Current state: {observation}. First, describe what is in the image attached and analyze the current state. You MUST explain your reasoning in picking the next best action (NOTHING, FIRE_LEFT, FIRE_MAIN, FIRE_RIGHT) and call lander_action tool with it to land the spacecraft.", "environment_context": {"game": "LunarLander", "continuous": false, "gravity": -12.0, "enable_wind": false, "seed": 456}} diff --git a/tests/pytest/helper/word_count_to_evaluation_row.py b/tests/pytest/helper/word_count_to_evaluation_row.py index f0517dd0..dbb05cc4 100644 --- a/tests/pytest/helper/word_count_to_evaluation_row.py +++ b/tests/pytest/helper/word_count_to_evaluation_row.py @@ -7,8 +7,7 @@ def word_count_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationR """Convert gsm8k dataset format to EvaluationRow for word_count evaluation.""" return [ EvaluationRow( - messages=[Message(role="user", content=row["user_query"])], - ground_truth=row["ground_truth_for_eval"] + messages=[Message(role="user", content=row["user_query"])], ground_truth=row["ground_truth_for_eval"] ) for row in data - ] \ No newline at end of file + ] diff --git a/tests/pytest/test_livesvgbench.py b/tests/pytest/test_livesvgbench.py index f105f8e0..44a8c8b8 100644 --- a/tests/pytest/test_livesvgbench.py +++ b/tests/pytest/test_livesvgbench.py @@ -47,10 +47,10 @@ def svgbench_to_evaluation_row(data: 
List[Dict[str, Any]]) -> List[EvaluationRow for i, row in enumerate(data): # Format requirements as numbered list - requirements = "\n".join([f"{i+1}. {req}" for i, req in enumerate(row["requirements"])]) + requirements = "\n".join([f"{i + 1}. {req}" for i, req in enumerate(row["requirements"])]) # Create the generation prompt following SVGBench format - prompt = f"""{row['prompt']} Wrap the SVG code in an SVG code block following the example below. + prompt = f"""{row["prompt"]} Wrap the SVG code in an SVG code block following the example below. Example: ```svg @@ -166,7 +166,7 @@ def render_svg_to_png(svg_code: str, output_path: str) -> bool: chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument(f"--window-size={width+40},{height+40}") + chrome_options.add_argument(f"--window-size={width + 40},{height + 40}") # Create temporary HTML file with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f: @@ -209,7 +209,7 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[st Dictionary with evaluation results """ # Format requirements for evaluation (exactly as in original) - requirements_text = "\n".join([f"{i+1}. {req}" for i, req in enumerate(requirements)]) + requirements_text = "\n".join([f"{i + 1}. {req}" for i, req in enumerate(requirements)]) # Create evaluation prompt with JSON response format evaluate_prompt = f"""Examine the generated image. How many of the following {len(requirements)} requirements were fulfilled? @@ -343,7 +343,7 @@ def evaluate_with_human_preference_rubrics( For example, colored circles arranged in Google colors should score very low for intent matching and recognizability. Original Requirements (for context): -{chr(10).join([f"{i+1}. {req}" for i, req in enumerate(requirements)])} +{chr(10).join([f"{i + 1}. 
{req}" for i, req in enumerate(requirements)])} Respond with JSON in this exact format: {{ @@ -493,27 +493,27 @@ def test_svg_combined_evaluation(row: EvaluationRow) -> EvaluationRow: === REQUIREMENTS EVALUATION (Listwise - Row-Specific) === Score: {requirements_score:.3f} -{requirements_result.get('reasoning', 'No reasoning provided')} +{requirements_result.get("reasoning", "No reasoning provided")} === HUMAN PREFERENCE EVALUATION (Pointwise - Universal Rubrics) === Score: {human_pref_score:.3f} -🎯 Intent Matching: {human_pref_result.get('intent_matching_score', 0.0):.2f}/1.0 -{human_pref_result.get('intent_reasoning', 'No reasoning provided')} +🎯 Intent Matching: {human_pref_result.get("intent_matching_score", 0.0):.2f}/1.0 +{human_pref_result.get("intent_reasoning", "No reasoning provided")} -👁️ Content Recognizability: {human_pref_result.get('content_recognizability_score', 0.0):.2f}/1.0 -{human_pref_result.get('content_reasoning', 'No reasoning provided')} +👁️ Content Recognizability: {human_pref_result.get("content_recognizability_score", 0.0):.2f}/1.0 +{human_pref_result.get("content_reasoning", "No reasoning provided")} -📐 Spatial Design Quality: {human_pref_result.get('spatial_design_score', 0.0):.2f}/1.0 -{human_pref_result.get('spatial_reasoning', 'No reasoning provided')} +📐 Spatial Design Quality: {human_pref_result.get("spatial_design_score", 0.0):.2f}/1.0 +{human_pref_result.get("spatial_reasoning", "No reasoning provided")} -👤 User Experience: {human_pref_result.get('user_experience_score', 0.0):.2f}/1.0 -{human_pref_result.get('ux_reasoning', 'No reasoning provided')} +👤 User Experience: {human_pref_result.get("user_experience_score", 0.0):.2f}/1.0 +{human_pref_result.get("ux_reasoning", "No reasoning provided")} -🎨 Visual Coherence: {human_pref_result.get('visual_coherence_score', 0.0):.2f}/1.0 -{human_pref_result.get('coherence_reasoning', 'No reasoning provided')} +🎨 Visual Coherence: {human_pref_result.get("visual_coherence_score", 
0.0):.2f}/1.0 +{human_pref_result.get("coherence_reasoning", "No reasoning provided")} -{human_pref_result.get('overall_reasoning', 'No overall reasoning provided')} +{human_pref_result.get("overall_reasoning", "No overall reasoning provided")} === FINAL COMBINED SCORE === Requirements: {requirements_score:.3f} × 30% = {requirements_score * 0.3:.3f} diff --git a/tests/pytest/test_svgbench.py b/tests/pytest/test_svgbench.py index 7ff08642..90d2f8f0 100644 --- a/tests/pytest/test_svgbench.py +++ b/tests/pytest/test_svgbench.py @@ -46,10 +46,10 @@ def svgbench_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow for i, row in enumerate(data): # Format requirements as numbered list - requirements = "\n".join([f"{i+1}. {req}" for i, req in enumerate(row["requirements"])]) + requirements = "\n".join([f"{i + 1}. {req}" for i, req in enumerate(row["requirements"])]) # Create the generation prompt following SVGBench format - prompt = f"""{row['prompt']} Wrap the SVG code in an SVG code block following the example below. + prompt = f"""{row["prompt"]} Wrap the SVG code in an SVG code block following the example below. Example: ```svg @@ -165,7 +165,7 @@ def render_svg_to_png(svg_code: str, output_path: str) -> bool: chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument(f"--window-size={width+40},{height+40}") + chrome_options.add_argument(f"--window-size={width + 40},{height + 40}") # Create temporary HTML file with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f: @@ -208,7 +208,7 @@ def evaluate_with_llm_judge(image_path: str, requirements: List[str]) -> Dict[st Dictionary with evaluation results """ # Format requirements for evaluation (exactly as in original) - requirements_text = "\n".join([f"{i+1}. {req}" for i, req in enumerate(requirements)]) + requirements_text = "\n".join([f"{i + 1}. 
{req}" for i, req in enumerate(requirements)]) # Create evaluation prompt with JSON response format evaluate_prompt = f"""Examine the generated image. How many of the following {len(requirements)} requirements were fulfilled? diff --git a/tests/test_adapters_e2e.py b/tests/test_adapters_e2e.py index 72449e8b..2c105315 100644 --- a/tests/test_adapters_e2e.py +++ b/tests/test_adapters_e2e.py @@ -554,7 +554,9 @@ def google_books_transform(row: Dict[str, Any]) -> Dict[str, Any]: frequency_desc = ( "high frequency" if term_frequency > 1000 - else "moderate frequency" if term_frequency > 100 else "low frequency" + else "moderate frequency" + if term_frequency > 100 + else "low frequency" ) document_desc = ( f"appears in {document_frequency} documents" if document_frequency > 0 else "rare occurrence" diff --git a/tests/test_agent_resources.py b/tests/test_agent_resources.py index 303f9c6b..56c23601 100644 --- a/tests/test_agent_resources.py +++ b/tests/test_agent_resources.py @@ -374,16 +374,16 @@ async def test_checkpoint_and_restore_docker(self, docker_resource: DockerResour await docker_resource.setup(config) create_file_command = "sh -c \"echo 'initial_data' > /data.txt\"" create_file_result = await docker_resource.step("exec_command", {"command": create_file_command}) - assert ( - create_file_result["exit_code"] == 0 - ), f"Failed to create /data.txt with '{create_file_command}': {create_file_result['output']}" + assert create_file_result["exit_code"] == 0, ( + f"Failed to create /data.txt with '{create_file_command}': {create_file_result['output']}" + ) # Optionally, verify file content immediately after creation in the source container verify_result = await docker_resource.step("exec_command", {"command": "cat /data.txt"}) assert verify_result["exit_code"] == 0, f"Failed to cat /data.txt after creation: {verify_result['output']}" - assert ( - "initial_data" in verify_result["output"] - ), f"/data.txt content mismatch after creation: {verify_result['output']}" + 
assert "initial_data" in verify_result["output"], ( + f"/data.txt content mismatch after creation: {verify_result['output']}" + ) checkpoint_info = await docker_resource.checkpoint() checkpoint_image_id = checkpoint_info["image_id"] diff --git a/tests/test_batch_evaluation.py b/tests/test_batch_evaluation.py index 9308f77f..772b8290 100644 --- a/tests/test_batch_evaluation.py +++ b/tests/test_batch_evaluation.py @@ -399,7 +399,6 @@ def smart_move_generator(**kwargs): patch.object(task_manager, "_start_resource_server", return_value=12345), patch.object(task_manager, "_wait_for_server_health", return_value=True), ): - # Execute the task with batch evaluation results = await task_manager.execute_tasks( task_ids=[task_id], @@ -413,9 +412,9 @@ def smart_move_generator(**kwargs): result = results[task_id] # Should not be an error result - assert not ( - isinstance(result, dict) and "error" in result - ), f"Task failed: {result.get('error', 'Unknown error')}" + assert not (isinstance(result, dict) and "error" in result), ( + f"Task failed: {result.get('error', 'Unknown error')}" + ) # Should be aggregated results assert isinstance(result, dict) @@ -563,7 +562,6 @@ async def test_batch_evaluation_task_manager_openai( patch.object(task_manager, "_start_resource_server", return_value=12346), patch.object(task_manager, "_wait_for_server_health", return_value=True), ): - # Execute the task with batch evaluation results = await task_manager.execute_tasks( task_ids=[task_id], @@ -577,9 +575,9 @@ async def test_batch_evaluation_task_manager_openai( result = results[task_id] # Should not be an error result - assert not ( - isinstance(result, dict) and "error" in result - ), f"Task failed: {result.get('error', 'Unknown error')}" + assert not (isinstance(result, dict) and "error" in result), ( + f"Task failed: {result.get('error', 'Unknown error')}" + ) # Should be aggregated results assert isinstance(result, dict) @@ -964,7 +962,6 @@ def smart_move_generator(**kwargs): 
patch.object(task_manager, "_start_resource_server", return_value=12347), patch.object(task_manager, "_wait_for_server_health", return_value=True), ): - # Execute with parallel enabled results = await task_manager.execute_tasks( task_ids=[task_id], @@ -1108,7 +1105,6 @@ def smart_move_generator(**kwargs): patch.object(task_manager, "_start_resource_server", return_value=12348), patch.object(task_manager, "_wait_for_server_health", return_value=True), ): - # Execute task results = await task_manager.execute_tasks(task_ids=[task_id], num_rollouts_override=2) diff --git a/tests/test_cli_agent.py b/tests/test_cli_agent.py index cc50376f..00763c3c 100644 --- a/tests/test_cli_agent.py +++ b/tests/test_cli_agent.py @@ -41,7 +41,7 @@ class TestAgentEvalCommand: def test_agent_eval_success_yaml(self, MockPath, MockTaskManager, caplog): # Configure caplog to capture logs from the agent_eval logger caplog.set_level(logging.INFO, logger="agent_eval") - + # Setup Path mock mock_path_instance = Mock() MockPath.return_value = mock_path_instance @@ -207,9 +207,7 @@ def test_agent_eval_orchestrator_execution_fails(self, MockPath, MockTaskManager mock_task_manager.register_task.return_value = "task1" # Make execute_tasks raise an exception - mock_task_manager.execute_tasks = AsyncMock( - side_effect=RuntimeError("Execution failed") - ) # type: ignore[attr-defined] + mock_task_manager.execute_tasks = AsyncMock(side_effect=RuntimeError("Execution failed")) # type: ignore[attr-defined] mock_task_manager.cleanup = AsyncMock() args = argparse.Namespace(task_def="dummy_task.yaml") diff --git a/tests/test_cli_args.py b/tests/test_cli_args.py index 2ecdbadb..21817879 100644 --- a/tests/test_cli_args.py +++ b/tests/test_cli_args.py @@ -7,7 +7,6 @@ class TestCliArgParsing: - # --- Tests for 'preview' command --- def test_preview_with_remote_url_and_samples(self): args_list = [ diff --git a/tests/test_code_execution.py b/tests/test_code_execution.py index 2290087d..714813b1 100644 --- 
a/tests/test_code_execution.py +++ b/tests/test_code_execution.py @@ -7,7 +7,6 @@ import pytest from eval_protocol.models import EvaluateResult, Message # Added for new tests -from eval_protocol.rewards.code_execution import fractional_code_reward # Added for new tests from eval_protocol.rewards.code_execution import ( _HAS_E2B, compare_outputs, @@ -16,6 +15,7 @@ execute_javascript_code, execute_python_code, extract_code_blocks, + fractional_code_reward, # Added for new tests local_code_execution_reward, string_similarity, ) @@ -464,9 +464,9 @@ def test_python_function_arg_parsing(self, test_input_str, expected_args_list, e and len(actual_test_run_details_list) > 0 ): actual_output_str = actual_test_run_details_list[0].get("actual_output") - assert actual_output_str == repr( - expected_return_val - ), f"Actual output '{actual_output_str}' did not match expected '{repr(expected_return_val)}' for input '{test_input_str}'" + assert actual_output_str == repr(expected_return_val), ( + f"Actual output '{actual_output_str}' did not match expected '{repr(expected_return_val)}' for input '{test_input_str}'" + ) except json.JSONDecodeError: # Catch specifically json.JSONDecodeError # Accessing reason from MetricResult object print( diff --git a/tests/test_data_driven_task_manager.py b/tests/test_data_driven_task_manager.py index f77051bc..a0697351 100644 --- a/tests/test_data_driven_task_manager.py +++ b/tests/test_data_driven_task_manager.py @@ -168,7 +168,6 @@ async def test_execute_data_driven_rollouts_basic(self): patch.object(self.task_manager, "_stop_resource_server"), patch("eval_protocol.agent.task_manager.Orchestrator") as mock_orchestrator_class, ): - # Set up mock orchestrator mock_orchestrator = AsyncMock() mock_orchestrator.setup_base_resource = AsyncMock() @@ -206,7 +205,6 @@ async def test_execute_multiple_rollouts_per_sample(self): patch.object(self.task_manager, "_stop_resource_server"), patch("eval_protocol.agent.task_manager.Orchestrator") as 
mock_orchestrator_class, ): - # Set up mock orchestrator to return different scores for each rollout mock_orchestrator = AsyncMock() mock_orchestrator.setup_base_resource = AsyncMock() @@ -250,7 +248,6 @@ async def test_execute_data_driven_with_failures(self): patch.object(self.task_manager, "_stop_resource_server"), patch("eval_protocol.agent.task_manager.Orchestrator") as mock_orchestrator_class, ): - # Set up mock orchestrator with one success and one failure mock_orchestrator = AsyncMock() mock_orchestrator.setup_base_resource = AsyncMock() @@ -307,7 +304,6 @@ async def mock_execute(*args, **kwargs): patch.object(self.task_manager, "_stop_resource_server"), patch("eval_protocol.agent.task_manager.Orchestrator") as mock_orchestrator_class, ): - mock_orchestrator = AsyncMock() mock_orchestrator.setup_base_resource = AsyncMock() mock_orchestrator.execute_task_poc = AsyncMock(side_effect=mock_execute) @@ -355,7 +351,6 @@ async def test_execute_tasks_data_driven_vs_traditional(self): patch.object(self.task_manager, "_execute_data_driven_rollouts") as mock_data_driven, patch.object(self.task_manager, "_execute_batch_rollouts") as mock_traditional, ): - mock_data_driven.return_value = [{"score": 1.0}] mock_traditional.return_value = [{"score": 0.5}] diff --git a/tests/test_deepeval_integration.py b/tests/test_deepeval_integration.py index 7115e740..16a0d86c 100644 --- a/tests/test_deepeval_integration.py +++ b/tests/test_deepeval_integration.py @@ -108,7 +108,11 @@ def is_successful(self) -> bool: class DummyGEval(BaseMetric): # type: ignore - evaluation_params = [LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT] if DEEPEVAL_AVAILABLE and hasattr(LLMTestCaseParams, "INPUT") else [] # type: ignore + evaluation_params = ( + [LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT] + if DEEPEVAL_AVAILABLE and hasattr(LLMTestCaseParams, "INPUT") + else [] + ) # type: ignore def __init__(self, 
threshold: float = 0.0) -> None: self.threshold = threshold @@ -199,7 +203,9 @@ def test_fireworks_geval_integration_with_mock(self) -> None: if parsed_fireworks_model_name not in valid_gpt_models: valid_gpt_models.append(parsed_fireworks_model_name) # type: ignore - actual_fireworks_model_for_geval = GPTModel(model=fireworks_model_name_for_api, _openai_api_key=fireworks_api_key) # type: ignore + actual_fireworks_model_for_geval = GPTModel( + model=fireworks_model_name_for_api, _openai_api_key=fireworks_api_key + ) # type: ignore actual_fireworks_model_for_geval.model_name = fireworks_model_name_for_api # type: ignore if fireworks_model_name_for_api not in model_pricing: @@ -340,7 +346,13 @@ async def mock_chat_completions_create(*args, messages: List[dict], model: str, "openai.resources.chat.completions.AsyncCompletions.create", new=mock_chat_completions_create, ): - geval_metric = GEval(name="Fireworks GEval Mocked", criteria="Evaluate the helpfulness and relevance of the actual output based on the input.", evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], model=actual_fireworks_model_for_geval, strict_mode=False) # type: ignore + geval_metric = GEval( + name="Fireworks GEval Mocked", + criteria="Evaluate the helpfulness and relevance of the actual output based on the input.", + evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], + model=actual_fireworks_model_for_geval, + strict_mode=False, + ) # type: ignore wrapped_metric = adapt_metric(geval_metric) messages_data = [ {"role": "user", "content": "What is the capital of France?"}, @@ -353,7 +365,11 @@ async def mock_chat_completions_create(*args, messages: List[dict], model: str, self.assertIsNotNone(result.score, "GEval score should not be None") self.assertEqual(result.score, 1.0, f"GEval score {result.score} was not 1.0 with mock.") expected_metric_key = f"{geval_metric.name} ({geval_metric.__class__.__name__})" # type: ignore - 
self.assertIn(expected_metric_key, result.metrics, f"Constructed metric key '{expected_metric_key}' not found. Keys: {list(result.metrics.keys())}") # type: ignore + self.assertIn( + expected_metric_key, + result.metrics, + f"Constructed metric key '{expected_metric_key}' not found. Keys: {list(result.metrics.keys())}", + ) # type: ignore self.assertIsNotNone(result.metrics[expected_metric_key].reason) # type: ignore diff --git a/tests/test_deploy_integration.py b/tests/test_deploy_integration.py index 8a8b6d26..6b7db1b0 100644 --- a/tests/test_deploy_integration.py +++ b/tests/test_deploy_integration.py @@ -174,7 +174,6 @@ def test_deploy_gcp_with_inline_requirements( return_value={"name": evaluator_id}, ) as mock_create_eval, ): - # Configure mock_get_config to return a basic config mock_config_instance = RewardKitConfig( gcp_cloud_run=GCPCloudRunConfig( diff --git a/tests/test_e2b_integration.py b/tests/test_e2b_integration.py index eafaeff8..eba8b585 100755 --- a/tests/test_e2b_integration.py +++ b/tests/test_e2b_integration.py @@ -62,7 +62,7 @@ def add(a, b): pytest.skip(f"Skipping due to E2B connection issue: {error_msg}") # Also check for error in main result reason if result.reason and "Invalid API key" in result.reason: - pytest.skip(f"Skipping due to invalid E2B API key") + pytest.skip("Skipping due to invalid E2B API key") # Otherwise, it should be a successful result assert result.score == 1.0 diff --git a/tests/test_e2b_js_integration.py b/tests/test_e2b_js_integration.py index 566c9005..5c266645 100755 --- a/tests/test_e2b_js_integration.py +++ b/tests/test_e2b_js_integration.py @@ -68,7 +68,7 @@ def test_e2b_javascript_integration(): pytest.skip(f"Skipping due to E2B connection issue: {error_msg}") # Also check for error in main result reason if result.reason and "Invalid API key" in result.reason: - pytest.skip(f"Skipping due to invalid E2B API key") + pytest.skip("Skipping due to invalid E2B API key") # Otherwise, it should be a successful result 
assert result.score == 1.0 diff --git a/tests/test_eval_protocol_import.py b/tests/test_eval_protocol_import.py index c0e17a99..c16b3927 100644 --- a/tests/test_eval_protocol_import.py +++ b/tests/test_eval_protocol_import.py @@ -36,14 +36,16 @@ def test_all_exports_consistency(self): def test_core_classes_available(self): """Test that core classes are available through both imports.""" - from eval_protocol import EvaluateResult - from eval_protocol import EvaluateResult as RPEvaluateResult - from eval_protocol import Message - from eval_protocol import Message as RPMessage - from eval_protocol import MetricResult - from eval_protocol import MetricResult as RPMetricResult - from eval_protocol import RewardFunction - from eval_protocol import RewardFunction as RPRewardFunction + from eval_protocol import ( + EvaluateResult, + EvaluateResult as RPEvaluateResult, + Message, + Message as RPMessage, + MetricResult, + MetricResult as RPMetricResult, + RewardFunction, + RewardFunction as RPRewardFunction, + ) # Classes should be the same assert RewardFunction is RPRewardFunction @@ -53,16 +55,18 @@ def test_core_classes_available(self): def test_functions_available(self): """Test that core functions are available through both imports.""" - from eval_protocol import load_jsonl - from eval_protocol import load_jsonl as rp_load_jsonl - from eval_protocol import make - from eval_protocol import make as rp_make - from eval_protocol import reward_function - from eval_protocol import reward_function as rp_reward_function - from eval_protocol import rollout - from eval_protocol import rollout as rp_rollout - from eval_protocol import test_mcp - from eval_protocol import test_mcp as rp_test_mcp + from eval_protocol import ( + load_jsonl, + load_jsonl as rp_load_jsonl, + make, + make as rp_make, + reward_function, + reward_function as rp_reward_function, + rollout, + rollout as rp_rollout, + test_mcp, + test_mcp as rp_test_mcp, + ) # Functions should be the same assert 
reward_function is rp_reward_function @@ -110,9 +114,11 @@ def test_star_import_works(self): def test_reward_function_decorator_works(self): """Test that the @reward_function decorator works through both imports.""" - from eval_protocol import EvaluateResult - from eval_protocol import reward_function as rk_reward_function - from eval_protocol import reward_function as rp_reward_function + from eval_protocol import ( + EvaluateResult, + reward_function as rk_reward_function, + reward_function as rp_reward_function, + ) # Create a simple reward function using eval_protocol @rk_reward_function @@ -147,8 +153,7 @@ def test_reward_rp(response: str, **kwargs) -> EvaluateResult: def test_message_class_works(self): """Test that Message class works through both imports.""" - from eval_protocol import Message as RKMessage - from eval_protocol import Message as RPMessage + from eval_protocol import Message as RKMessage, Message as RPMessage # They should be the same class assert RKMessage is RPMessage @@ -196,29 +201,28 @@ def test_deep_import_consistency(self): """Test that deep imports work consistently.""" try: # Test importing from submodules - from eval_protocol.models import Message as RKMessage - from eval_protocol.models import Message as RPMessage + from eval_protocol.models import Message as RKMessage, Message as RPMessage # Should be the same class assert RKMessage is RPMessage except ImportError: # If submodule imports don't work, that's expected in some install scenarios # Just verify the star import works - from eval_protocol import Message as RKMessage - from eval_protocol import Message as RPMessage + from eval_protocol import Message as RKMessage, Message as RPMessage assert RKMessage is RPMessage try: # Test another submodule - use a function that actually exists - from eval_protocol.auth import get_fireworks_account_id - from eval_protocol.auth import get_fireworks_account_id as rp_get_fireworks_account_id + from eval_protocol.auth import ( + 
get_fireworks_account_id, + get_fireworks_account_id as rp_get_fireworks_account_id, + ) assert get_fireworks_account_id is rp_get_fireworks_account_id except ImportError: # If submodule imports don't work, verify through star import - from eval_protocol import auth as rk_auth - from eval_protocol import auth as rp_auth + from eval_protocol import auth as rk_auth, auth as rp_auth assert rk_auth is rp_auth diff --git a/tests/test_examples_end_to_end.py b/tests/test_examples_end_to_end.py index d4edac17..fdb35496 100644 --- a/tests/test_examples_end_to_end.py +++ b/tests/test_examples_end_to_end.py @@ -47,7 +47,6 @@ def mock_requests(): patch("requests.get") as mock_get, patch("requests.delete") as mock_delete, ): - # Configure mock_post for different use cases def post_side_effect(*args, **kwargs): url = args[0] diff --git a/tests/test_function_calling.py b/tests/test_function_calling.py index 2c4d8cb3..343a755a 100644 --- a/tests/test_function_calling.py +++ b/tests/test_function_calling.py @@ -4,8 +4,10 @@ import pytest -from eval_protocol.models import EvaluateResult # Changed -from eval_protocol.models import Message # Added import +from eval_protocol.models import ( + EvaluateResult, # Changed + Message, # Added import +) from eval_protocol.rewards.function_calling import ( calculate_jaccard_similarity, composite_function_call_reward, @@ -93,16 +95,14 @@ def test_wrong_function_name(self): assert result.metrics["function_name_match"].score == 0.0 assert ( result.metrics["function_name_match"].reason is not None - and "Function name does not match" - in result.metrics["function_name_match"].reason # type: ignore[operator] + and "Function name does not match" in result.metrics["function_name_match"].reason # type: ignore[operator] ) # Dictionary access assert result["score"] < 1.0 assert result["metrics"]["function_name_match"]["score"] == 0.0 assert ( result["metrics"]["function_name_match"]["reason"] is not None - and "Function name does not match" - in 
result["metrics"]["function_name_match"]["reason"] # type: ignore[operator] + and "Function name does not match" in result["metrics"]["function_name_match"]["reason"] # type: ignore[operator] ) def test_missing_required_argument(self): diff --git a/tests/test_gcp_tools.py b/tests/test_gcp_tools.py index 9b8804fa..d1f490eb 100644 --- a/tests/test_gcp_tools.py +++ b/tests/test_gcp_tools.py @@ -91,7 +91,6 @@ def test_run_gcloud_command_success_with_stderr(self, mock_subprocess_run): ) # os.path.exists mock removed as not directly used by SUT for this path @patch("builtins.open", new_callable=mock_open) def test_build_and_push_docker_image_success(self, mock_open_file, mock_os_remove, mock_run_gcloud, MockGCPPath): - mock_path_instance = MockGCPPath.return_value mock_dockerfile_path_obj = MagicMock(spec=Path) @@ -276,7 +275,6 @@ def test_build_and_push_docker_image_success_dockerfile_vanishes( @patch("eval_protocol.gcp_tools._run_gcloud_command") def test_deploy_to_cloud_run_success(self, mock_run_gcloud): - # Mock for deploy command mock_run_gcloud.side_effect = [ (True, "Deploy success", ""), # For initial deploy diff --git a/tests/test_generic_server.py b/tests/test_generic_server.py index ee21906a..94beb0aa 100644 --- a/tests/test_generic_server.py +++ b/tests/test_generic_server.py @@ -102,8 +102,7 @@ def test_load_failure_resets_globals(self): # --- Tests for FastAPI app endpoints --- from fastapi.testclient import TestClient -from eval_protocol.generic_server import EvaluationRequest -from eval_protocol.generic_server import app as generic_fastapi_app +from eval_protocol.generic_server import EvaluationRequest, app as generic_fastapi_app from eval_protocol.models import EvaluateResult, Message, MetricResult diff --git a/tests/test_math.py b/tests/test_math.py index 8b53a4c4..5aad9135 100644 --- a/tests/test_math.py +++ b/tests/test_math.py @@ -8,8 +8,11 @@ import pytest # Removed: from eval_protocol.rewards.advanced_math import advanced_math_reward -from 
eval_protocol.models import Message # Added Message import -from eval_protocol.models import EvaluateResult, MetricResult +from eval_protocol.models import ( + EvaluateResult, + Message, # Added Message import + MetricResult, +) from eval_protocol.rewards.math import compare_numbers, extract_numbers, math_reward diff --git a/tests/test_models_rl.py b/tests/test_models_rl.py index ea753f77..f1eed5d4 100644 --- a/tests/test_models_rl.py +++ b/tests/test_models_rl.py @@ -6,16 +6,13 @@ from eval_protocol.agent.models import StepData # Assuming these are the correct import paths based on our plan -from eval_protocol.models import EvaluateResult -from eval_protocol.models import Message as RewardKitMessage -from eval_protocol.models import StepOutput +from eval_protocol.models import EvaluateResult, Message as RewardKitMessage, StepOutput # Minimal Message for StepData if direct import from eval_protocol.models is problematic in tests # For now, assume RewardKitMessage from eval_protocol.models works. 
class TestRLDataStructures: - def test_step_output_creation_valid(self): """Test valid creation of StepOutput.""" so = StepOutput(step_index=0, base_reward=0.5, reason="Good step", metrics={"accuracy": 0.9}) diff --git a/tests/test_packaging.py b/tests/test_packaging.py index a019edcd..ee44eb7c 100644 --- a/tests/test_packaging.py +++ b/tests/test_packaging.py @@ -19,7 +19,6 @@ class TestPackaging(unittest.TestCase): - @classmethod def setUpClass(cls): # Create a unique dummy reward function file for testing to avoid race conditions diff --git a/tests/test_parallel_rollouts.py b/tests/test_parallel_rollouts.py index ef5c83a6..8da83d27 100644 --- a/tests/test_parallel_rollouts.py +++ b/tests/test_parallel_rollouts.py @@ -105,7 +105,7 @@ async def _test_seed_handling_and_type_compatibility_impl(): stdout, stderr = server_process.communicate() # This is a CI environment issue, not a code issue - run a simplified test instead - print(f"⚠️ Server startup failed in CI environment, running simplified test...") + print("⚠️ Server startup failed in CI environment, running simplified test...") print(f"Server stdout: {stdout.decode()[:200]}") print(f"Server stderr: {stderr.decode()[:200]}") @@ -376,4 +376,4 @@ async def test_mcp_resource_type_compatibility(): print(f" - Seed 42 map: {map1}") print(f" - Seed 123 map: {map2}") print(f" - Seed 999 map: {map3}") - print(f" - JSON serialization: ✅") + print(" - JSON serialization: ✅") diff --git a/tests/test_platform_api.py b/tests/test_platform_api.py index 84f1dc0d..13451779 100644 --- a/tests/test_platform_api.py +++ b/tests/test_platform_api.py @@ -15,7 +15,6 @@ class TestPlatformAPI(unittest.TestCase): - def setUp(self): # Patch auth functions for isolation self.mock_api_key = "test_api_key" diff --git a/tests/test_readiness.py b/tests/test_readiness.py index 5fef142a..8712aeb0 100644 --- a/tests/test_readiness.py +++ b/tests/test_readiness.py @@ -43,7 +43,6 @@ def mock_requests_post(): # --- End-to-End Script Tests for Math 
Example --- class TestMathExampleEndToEndScripts: - BASE_MATH_EXAMPLE_PATH = os.path.join(os.path.dirname(__file__), "../examples/math_example") def run_script( @@ -201,7 +200,6 @@ def run_script( # --- End-to-End Script Tests for Math Example (OpenR1) --- class TestMathExampleOpenR1EndToEndScripts: - BASE_MATH_EXAMPLE_OPENR1_PATH = os.path.join(os.path.dirname(__file__), "../examples/math_example_openr1") def run_script( diff --git a/tests/test_retry_mechanism.py b/tests/test_retry_mechanism.py index f00be4fc..8b55869f 100644 --- a/tests/test_retry_mechanism.py +++ b/tests/test_retry_mechanism.py @@ -109,7 +109,7 @@ def test_retry_mechanism_mock_verification(): # Get our mock tracker mock_tracker = shared_processor.mock_tracker - print(f"\n🔄 MOCK CALL ANALYSIS:") + print("\n🔄 MOCK CALL ANALYSIS:") print(f" Batch calls made: {mock_tracker.batch_call.call_count}") print(f" Total row processing calls: {mock_tracker.process_row_call.call_count}") @@ -125,7 +125,7 @@ def test_retry_mechanism_mock_verification(): call_counts = Counter(rollout_ids) print(f" Call counts per rollout_id: {dict(call_counts)}") - print(f" Individual calls:") + print(" Individual calls:") for i, call_arg in enumerate(call_args, 1): rollout_id = call_arg[0][0] attempt_num = rollout_ids[:i].count(rollout_id) @@ -133,9 +133,9 @@ def test_retry_mechanism_mock_verification(): # ASSERTIONS USING MOCK DATA # Should have exactly 6 total row processing calls (5 initial + 1 retry) - assert ( - mock_tracker.process_row_call.call_count == 6 - ), f"Expected 6 total calls, got {mock_tracker.process_row_call.call_count}" + assert mock_tracker.process_row_call.call_count == 6, ( + f"Expected 6 total calls, got {mock_tracker.process_row_call.call_count}" + ) # Should have exactly 2 batch calls (initial batch + retry batch) assert mock_tracker.batch_call.call_count == 2, f"Expected 2 batch calls, got {mock_tracker.batch_call.call_count}" @@ -147,11 +147,11 @@ def test_retry_mechanism_mock_verification(): # 
Exactly one rollout_id should be called twice, others called once call_count_values = list(call_counts.values()) - assert ( - call_count_values.count(2) == 1 - ), f"Expected exactly 1 rollout_id to be called twice, got counts: {dict(call_counts)}" - assert ( - call_count_values.count(1) == 4 - ), f"Expected exactly 4 rollout_ids to be called once, got counts: {dict(call_counts)}" + assert call_count_values.count(2) == 1, ( + f"Expected exactly 1 rollout_id to be called twice, got counts: {dict(call_counts)}" + ) + assert call_count_values.count(1) == 4, ( + f"Expected exactly 4 rollout_ids to be called once, got counts: {dict(call_counts)}" + ) print("✅ All mock-based assertions passed! Retry mechanism is working correctly.") diff --git a/tests/test_reward_protocol_import.py b/tests/test_reward_protocol_import.py index 7466ed93..d643c483 100644 --- a/tests/test_reward_protocol_import.py +++ b/tests/test_reward_protocol_import.py @@ -36,14 +36,16 @@ def test_all_exports_consistency(self): def test_core_classes_available(self): """Test that core classes are available through both imports.""" - from eval_protocol import EvaluateResult - from eval_protocol import EvaluateResult as RPEvaluateResult - from eval_protocol import Message - from eval_protocol import Message as RPMessage - from eval_protocol import MetricResult - from eval_protocol import MetricResult as RPMetricResult - from eval_protocol import RewardFunction - from eval_protocol import RewardFunction as RPRewardFunction + from eval_protocol import ( + EvaluateResult, + EvaluateResult as RPEvaluateResult, + Message, + Message as RPMessage, + MetricResult, + MetricResult as RPMetricResult, + RewardFunction, + RewardFunction as RPRewardFunction, + ) # Classes should be the same assert RewardFunction is RPRewardFunction @@ -53,16 +55,18 @@ def test_core_classes_available(self): def test_functions_available(self): """Test that core functions are available through both imports.""" - from eval_protocol import 
load_jsonl - from eval_protocol import load_jsonl as rp_load_jsonl - from eval_protocol import make - from eval_protocol import make as rp_make - from eval_protocol import reward_function - from eval_protocol import reward_function as rp_reward_function - from eval_protocol import rollout - from eval_protocol import rollout as rp_rollout - from eval_protocol import test_mcp - from eval_protocol import test_mcp as rp_test_mcp + from eval_protocol import ( + load_jsonl, + load_jsonl as rp_load_jsonl, + make, + make as rp_make, + reward_function, + reward_function as rp_reward_function, + rollout, + rollout as rp_rollout, + test_mcp, + test_mcp as rp_test_mcp, + ) # Functions should be the same assert reward_function is rp_reward_function @@ -110,9 +114,11 @@ def test_star_import_works(self): def test_reward_function_decorator_works(self): """Test that the @reward_function decorator works through both imports.""" - from eval_protocol import EvaluateResult - from eval_protocol import reward_function as rk_reward_function - from eval_protocol import reward_function as rp_reward_function + from eval_protocol import ( + EvaluateResult, + reward_function as rk_reward_function, + reward_function as rp_reward_function, + ) # Create a simple reward function using eval_protocol @rk_reward_function @@ -147,8 +153,7 @@ def test_reward_rp(response: str, **kwargs) -> EvaluateResult: def test_message_class_works(self): """Test that Message class works through both imports.""" - from eval_protocol import Message as RKMessage - from eval_protocol import Message as RPMessage + from eval_protocol import Message as RKMessage, Message as RPMessage # They should be the same class assert RKMessage is RPMessage @@ -195,29 +200,28 @@ def test_deep_import_consistency(self): """Test that deep imports work consistently.""" try: # Test importing from submodules - from eval_protocol.models import Message as RKMessage - from eval_protocol.models import Message as RPMessage + from 
eval_protocol.models import Message as RKMessage, Message as RPMessage # Should be the same class assert RKMessage is RPMessage except ImportError: # If submodule imports don't work, that's expected in some install scenarios # Just verify the star import works - from eval_protocol import Message as RKMessage - from eval_protocol import Message as RPMessage + from eval_protocol import Message as RKMessage, Message as RPMessage assert RKMessage is RPMessage try: # Test another submodule - use a function that actually exists - from eval_protocol.auth import get_fireworks_account_id - from eval_protocol.auth import get_fireworks_account_id as rp_get_fireworks_account_id + from eval_protocol.auth import ( + get_fireworks_account_id, + get_fireworks_account_id as rp_get_fireworks_account_id, + ) assert get_fireworks_account_id is rp_get_fireworks_account_id except ImportError: # If submodule imports don't work, verify through star import - from eval_protocol import auth as rk_auth - from eval_protocol import auth as rp_auth + from eval_protocol import auth as rk_auth, auth as rp_auth assert rk_auth is rp_auth diff --git a/tests/test_rl_processing.py b/tests/test_rl_processing.py index 1086732c..93f9e7fc 100644 --- a/tests/test_rl_processing.py +++ b/tests/test_rl_processing.py @@ -9,14 +9,11 @@ import pytest from eval_protocol.agent.models import StepData -from eval_protocol.models import EvaluateResult -from eval_protocol.models import Message as RewardKitMessage -from eval_protocol.models import StepOutput +from eval_protocol.models import EvaluateResult, Message as RewardKitMessage, StepOutput from eval_protocol.rl_processing import RLDataAligner class TestRLDataAligner: - def create_mock_step_data( self, system_step_index: int, diff --git a/tests/test_rollout_control_plane_integration.py b/tests/test_rollout_control_plane_integration.py index 8d176780..e97769c7 100644 --- a/tests/test_rollout_control_plane_integration.py +++ 
b/tests/test_rollout_control_plane_integration.py @@ -89,7 +89,7 @@ def add_tool_response( { "role": "tool", "content": response, - "tool_call_id": tool_call.tool_call_id or f"call_{len(conversation_history)-1}", + "tool_call_id": tool_call.tool_call_id or f"call_{len(conversation_history) - 1}", "control_plane_step": { "step": env_index, "reward": reward, @@ -149,7 +149,6 @@ async def test_rollout_with_control_plane_separation(self): patch.object(GeneralMCPVectorEnv, "close") as mock_close, patch.object(GeneralMCPVectorEnv, "format_user_prompt") as mock_format_user_prompt, ): - # Setup mock vector environment mock_env = GeneralMCPVectorEnv(sessions, dataset_rows) mock_env.sessions = sessions @@ -269,9 +268,9 @@ def mock_step_side_effect(env_index, tool_call): # Tool responses should only contain data plane information content = msg.content or "" # The content should not directly contain rewards or termination (they're in control_plane_step) - assert ( - "reward" not in content.lower() or "reward_source" in content.lower() - ), "Tool response should not directly contain reward" + assert "reward" not in content.lower() or "reward_source" in content.lower(), ( + "Tool response should not directly contain reward" + ) # Validate control plane information from messages rewards = [msg.control_plane_step["reward"] for msg in messages_with_control_plane] @@ -433,7 +432,6 @@ async def test_rollout_handles_control_plane_failure_gracefully(self): patch.object(GeneralMCPVectorEnv, "close") as mock_close, patch.object(GeneralMCPVectorEnv, "format_user_prompt") as mock_format_user_prompt, ): - mock_env = GeneralMCPVectorEnv(sessions, dataset_rows) mock_env.sessions = sessions mock_env.dataset_rows = dataset_rows diff --git a/tests/test_typed_interface_rl.py b/tests/test_typed_interface_rl.py index 3b1bf975..5455f9f9 100644 --- a/tests/test_typed_interface_rl.py +++ b/tests/test_typed_interface_rl.py @@ -69,7 +69,6 @@ def batch_invalid_output_func(rollouts_messages: 
List[List[Message]], ground_tru class TestTypedInterfaceRL: - def test_pointwise_rl_rewards_valid_input(self): """Test pointwise RL reward function with valid dict messages.""" raw_messages = [ diff --git a/tests/test_url_handling.py b/tests/test_url_handling.py index 542b0f05..6c5d9f4f 100644 --- a/tests/test_url_handling.py +++ b/tests/test_url_handling.py @@ -1,4 +1,5 @@ from unittest.mock import AsyncMock, MagicMock, patch + import httpx import pytest from werkzeug.wrappers import Response diff --git a/uv.lock b/uv.lock index 0c27bafe..e1e74645 100644 --- a/uv.lock +++ b/uv.lock @@ -338,19 +338,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/58/cc6a08053f822f98f334d38a27687b69c6655fb05cd74a7a5e70a2aeed95/authlib-1.6.1-py2.py3-none-any.whl", hash = "sha256:e9d2031c34c6309373ab845afc24168fe9e93dc52d252631f52642f21f5ed06e", size = 239299, upload-time = "2025-07-20T07:38:39.259Z" }, ] -[[package]] -name = "autopep8" -version = "2.3.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pycodestyle" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/50/d8/30873d2b7b57dee9263e53d142da044c4600a46f2d28374b3e38b023df16/autopep8-2.3.2.tar.gz", hash = "sha256:89440a4f969197b69a995e4ce0661b031f455a9f776d2c5ba3dbd83466931758", size = 92210, upload-time = "2025-01-14T14:46:18.454Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9e/43/53afb8ba17218f19b77c7834128566c5bbb100a0ad9ba2e8e89d089d7079/autopep8-2.3.2-py2.py3-none-any.whl", hash = "sha256:ce8ad498672c845a0c3de2629c15b635ec2b05ef8177a6e7c91c74f3e9b51128", size = 45807, upload-time = "2025-01-14T14:46:15.466Z" }, -] - [[package]] name = "babel" version = "2.17.0" @@ -420,40 +407,6 @@ compiler = [ { name = "ruff" }, ] -[[package]] -name = "black" -version = "25.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "click" }, - { name = 
"mypy-extensions" }, - { name = "packaging" }, - { name = "pathspec" }, - { name = "platformdirs" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/94/49/26a7b0f3f35da4b5a65f081943b7bcd22d7002f5f0fb8098ec1ff21cb6ef/black-25.1.0.tar.gz", hash = "sha256:33496d5cd1222ad73391352b4ae8da15253c5de89b93a80b3e2c8d9a19ec2666", size = 649449, upload-time = "2025-01-29T04:15:40.373Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/3b/4ba3f93ac8d90410423fdd31d7541ada9bcee1df32fb90d26de41ed40e1d/black-25.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:759e7ec1e050a15f89b770cefbf91ebee8917aac5c20483bc2d80a6c3a04df32", size = 1629419, upload-time = "2025-01-29T05:37:06.642Z" }, - { url = "https://files.pythonhosted.org/packages/b4/02/0bde0485146a8a5e694daed47561785e8b77a0466ccc1f3e485d5ef2925e/black-25.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0e519ecf93120f34243e6b0054db49c00a35f84f195d5bce7e9f5cfc578fc2da", size = 1461080, upload-time = "2025-01-29T05:37:09.321Z" }, - { url = "https://files.pythonhosted.org/packages/52/0e/abdf75183c830eaca7589144ff96d49bce73d7ec6ad12ef62185cc0f79a2/black-25.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:055e59b198df7ac0b7efca5ad7ff2516bca343276c466be72eb04a3bcc1f82d7", size = 1766886, upload-time = "2025-01-29T04:18:24.432Z" }, - { url = "https://files.pythonhosted.org/packages/dc/a6/97d8bb65b1d8a41f8a6736222ba0a334db7b7b77b8023ab4568288f23973/black-25.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:db8ea9917d6f8fc62abd90d944920d95e73c83a5ee3383493e35d271aca872e9", size = 1419404, upload-time = "2025-01-29T04:19:04.296Z" }, - { url = "https://files.pythonhosted.org/packages/7e/4f/87f596aca05c3ce5b94b8663dbfe242a12843caaa82dd3f85f1ffdc3f177/black-25.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:a39337598244de4bae26475f77dda852ea00a93bd4c728e09eacd827ec929df0", size = 1614372, upload-time = "2025-01-29T05:37:11.71Z" }, - { url = "https://files.pythonhosted.org/packages/e7/d0/2c34c36190b741c59c901e56ab7f6e54dad8df05a6272a9747ecef7c6036/black-25.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:96c1c7cd856bba8e20094e36e0f948718dc688dba4a9d78c3adde52b9e6c2299", size = 1442865, upload-time = "2025-01-29T05:37:14.309Z" }, - { url = "https://files.pythonhosted.org/packages/21/d4/7518c72262468430ead45cf22bd86c883a6448b9eb43672765d69a8f1248/black-25.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bce2e264d59c91e52d8000d507eb20a9aca4a778731a08cfff7e5ac4a4bb7096", size = 1749699, upload-time = "2025-01-29T04:18:17.688Z" }, - { url = "https://files.pythonhosted.org/packages/58/db/4f5beb989b547f79096e035c4981ceb36ac2b552d0ac5f2620e941501c99/black-25.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:172b1dbff09f86ce6f4eb8edf9dede08b1fce58ba194c87d7a4f1a5aa2f5b3c2", size = 1428028, upload-time = "2025-01-29T04:18:51.711Z" }, - { url = "https://files.pythonhosted.org/packages/83/71/3fe4741df7adf015ad8dfa082dd36c94ca86bb21f25608eb247b4afb15b2/black-25.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4b60580e829091e6f9238c848ea6750efed72140b91b048770b64e74fe04908b", size = 1650988, upload-time = "2025-01-29T05:37:16.707Z" }, - { url = "https://files.pythonhosted.org/packages/13/f3/89aac8a83d73937ccd39bbe8fc6ac8860c11cfa0af5b1c96d081facac844/black-25.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e2978f6df243b155ef5fa7e558a43037c3079093ed5d10fd84c43900f2d8ecc", size = 1453985, upload-time = "2025-01-29T05:37:18.273Z" }, - { url = "https://files.pythonhosted.org/packages/6f/22/b99efca33f1f3a1d2552c714b1e1b5ae92efac6c43e790ad539a163d1754/black-25.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:3b48735872ec535027d979e8dcb20bf4f70b5ac75a8ea99f127c106a7d7aba9f", size = 1783816, upload-time = "2025-01-29T04:18:33.823Z" }, - { url = "https://files.pythonhosted.org/packages/18/7e/a27c3ad3822b6f2e0e00d63d58ff6299a99a5b3aee69fa77cd4b0076b261/black-25.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea0213189960bda9cf99be5b8c8ce66bb054af5e9e861249cd23471bd7b0b3ba", size = 1440860, upload-time = "2025-01-29T04:19:12.944Z" }, - { url = "https://files.pythonhosted.org/packages/98/87/0edf98916640efa5d0696e1abb0a8357b52e69e82322628f25bf14d263d1/black-25.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f0b18a02996a836cc9c9c78e5babec10930862827b1b724ddfe98ccf2f2fe4f", size = 1650673, upload-time = "2025-01-29T05:37:20.574Z" }, - { url = "https://files.pythonhosted.org/packages/52/e5/f7bf17207cf87fa6e9b676576749c6b6ed0d70f179a3d812c997870291c3/black-25.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:afebb7098bfbc70037a053b91ae8437c3857482d3a690fefc03e9ff7aa9a5fd3", size = 1453190, upload-time = "2025-01-29T05:37:22.106Z" }, - { url = "https://files.pythonhosted.org/packages/e3/ee/adda3d46d4a9120772fae6de454c8495603c37c4c3b9c60f25b1ab6401fe/black-25.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:030b9759066a4ee5e5aca28c3c77f9c64789cdd4de8ac1df642c40b708be6171", size = 1782926, upload-time = "2025-01-29T04:18:58.564Z" }, - { url = "https://files.pythonhosted.org/packages/cc/64/94eb5f45dcb997d2082f097a3944cfc7fe87e071907f677e80788a2d7b7a/black-25.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:a22f402b410566e2d1c950708c77ebf5ebd5d0d88a6a2e87c86d9fb48afa0d18", size = 1442613, upload-time = "2025-01-29T04:19:27.63Z" }, - { url = "https://files.pythonhosted.org/packages/09/71/54e999902aed72baf26bca0d50781b01838251a462612966e9fc4891eadd/black-25.1.0-py3-none-any.whl", hash = "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717", size = 207646, upload-time = "2025-01-29T04:15:38.082Z" 
}, -] - [[package]] name = "bleach" version = "6.2.0" @@ -585,15 +538,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950, upload-time = "2024-10-06T17:22:23.299Z" }, ] -[[package]] -name = "cachetools" -version = "5.5.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload-time = "2025-02-20T21:01:19.524Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, -] - [[package]] name = "certifi" version = "2025.7.14" @@ -1167,41 +1111,30 @@ dependencies = [ [package.optional-dependencies] adapters = [ { name = "datasets" }, - { name = "google-auth" }, - { name = "google-auth-oauthlib" }, - { name = "google-cloud-bigquery" }, { name = "langfuse" }, { name = "transformers" }, ] -bigquery = [ - { name = "google-auth" }, - { name = "google-auth-oauthlib" }, - { name = "google-cloud-bigquery" }, -] box2d = [ { name = "gymnasium", extra = ["box2d"] }, { name = "pillow" }, { name = "swig" }, ] dev = [ - { name = "autopep8" }, - { name = "black" }, { name = "build" }, { name = "docker" }, { name = "e2b" }, - { name = "flake8" }, { name = "haikus" }, { name = "ipykernel" }, - { name = "isort" }, { name = "jupyter" }, - { name = "mypy" }, { name = "openai" }, { name = "pip" }, { name = "pre-commit" }, + { name = "pyright" }, { name = "pytest-asyncio" }, { name = 
"pytest-cov" }, { name = "pytest-httpserver" }, { name = "pytest-xdist" }, + { name = "ruff" }, { name = "transformers" }, { name = "twine" }, { name = "types-docker" }, @@ -1250,8 +1183,6 @@ requires-dist = [ { name = "aiohttp" }, { name = "aiosqlite" }, { name = "anthropic", specifier = ">=0.59.0" }, - { name = "autopep8", marker = "extra == 'dev'", specifier = ">=1.5.0" }, - { name = "black", marker = "extra == 'dev'", specifier = ">=21.5b2" }, { name = "build", marker = "extra == 'dev'" }, { name = "dataclasses-json", specifier = ">=0.5.7" }, { name = "datasets" }, @@ -1263,14 +1194,7 @@ requires-dist = [ { name = "e2b", marker = "extra == 'dev'" }, { name = "fastapi", specifier = ">=0.116.1" }, { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" }, - { name = "flake8", marker = "extra == 'dev'", specifier = ">=3.9.2" }, { name = "fsspec" }, - { name = "google-auth", marker = "extra == 'adapters'", specifier = ">=2.0.0" }, - { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, - { name = "google-auth-oauthlib", marker = "extra == 'adapters'", specifier = ">=1.0.0" }, - { name = "google-auth-oauthlib", marker = "extra == 'bigquery'", specifier = ">=1.0.0" }, - { name = "google-cloud-bigquery", marker = "extra == 'adapters'", specifier = ">=3.0.0" }, - { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, { name = "gymnasium", specifier = ">=0.29.0" }, { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, @@ -1278,7 +1202,6 @@ requires-dist = [ { name = "hydra-core", specifier = ">=1.3.2" }, { name = "ipykernel", specifier = ">=6.30.0" }, { name = "ipykernel", marker = "extra == 'dev'", specifier = ">=6.30.0" }, - { name = "isort", marker = "extra == 'dev'", specifier = ">=5.0.0" }, { name = "jupyter", specifier = ">=1.1.1" }, { name = "jupyter", marker = 
"extra == 'dev'", specifier = ">=1.1.1" }, { name = "langfuse", marker = "extra == 'adapters'", specifier = ">=2.0.0" }, @@ -1286,7 +1209,6 @@ requires-dist = [ { name = "litellm", specifier = ">=1.0.0" }, { name = "loguru", specifier = ">=0.6.0" }, { name = "mcp", specifier = ">=1.9.2" }, - { name = "mypy", marker = "extra == 'dev'", specifier = ">=0.812" }, { name = "omegaconf", specifier = ">=2.3.0" }, { name = "openai", specifier = "==1.78.1" }, { name = "openai", marker = "extra == 'dev'", specifier = "==1.78.1" }, @@ -1299,6 +1221,7 @@ requires-dist = [ { name = "pre-commit", marker = "extra == 'dev'" }, { name = "psutil", specifier = ">=5.8.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pyright", marker = "extra == 'dev'", specifier = ">=1.1.365" }, { name = "pytest", specifier = ">=6.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'" }, { name = "pytest-cov", marker = "extra == 'dev'" }, @@ -1308,6 +1231,7 @@ requires-dist = [ { name = "pyyaml", specifier = ">=5.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "rich", specifier = ">=12.0.0" }, + { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.5.0" }, { name = "selenium", marker = "extra == 'svgbench'", specifier = ">=4.0.0" }, { name = "swig", marker = "extra == 'box2d'" }, { name = "toml", specifier = ">=0.10.0" }, @@ -1327,7 +1251,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "bigquery", "adapters", "svgbench"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "svgbench"] [package.metadata.requires-dev] dev = [ @@ -1504,20 +1428,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/83/1d44379a2d60009e31d157673998e46be144fc842c9c9389523e42cdfa65/fireworks_ai-0.19.12-py3-none-any.whl", hash 
= "sha256:3a50d807a7ca274a62e37be3a0b0d93ce76c412fb58e24b3d4cbadf8b9be0b74", size = 584665, upload-time = "2025-07-28T16:09:48.132Z" }, ] -[[package]] -name = "flake8" -version = "7.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mccabe" }, - { name = "pycodestyle" }, - { name = "pyflakes" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9b/af/fbfe3c4b5a657d79e5c47a2827a362f9e1b763336a52f926126aa6dc7123/flake8-7.3.0.tar.gz", hash = "sha256:fe044858146b9fc69b551a4b490d69cf960fcb78ad1edcb84e7fbb1b4a8e3872", size = 48326, upload-time = "2025-06-20T19:31:35.838Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/9f/56/13ab06b4f93ca7cac71078fbe37fcea175d3216f31f85c3168a6bbd0bb9a/flake8-7.3.0-py2.py3-none-any.whl", hash = "sha256:b9696257b9ce8beb888cdbe31cf885c90d31928fe202be0889a7cdafad32f01e", size = 57922, upload-time = "2025-06-20T19:31:34.425Z" }, -] - [[package]] name = "fqdn" version = "1.5.1" @@ -1635,133 +1545,6 @@ http = [ { name = "aiohttp" }, ] -[[package]] -name = "google-api-core" -version = "2.25.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "googleapis-common-protos" }, - { name = "proto-plus" }, - { name = "protobuf" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = "2025-06-12T20:52:20.439Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" }, -] - -[package.optional-dependencies] -grpc = [ - { name = "grpcio" }, - { name 
= "grpcio-status" }, -] - -[[package]] -name = "google-auth" -version = "2.40.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, -] - -[[package]] -name = "google-auth-oauthlib" -version = "1.2.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "requests-oauthlib" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fb/87/e10bf24f7bcffc1421b84d6f9c3377c30ec305d082cd737ddaa6d8f77f7c/google_auth_oauthlib-1.2.2.tar.gz", hash = "sha256:11046fb8d3348b296302dd939ace8af0a724042e8029c1b872d87fabc9f41684", size = 20955, upload-time = "2025-04-22T16:40:29.172Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ac/84/40ee070be95771acd2f4418981edb834979424565c3eec3cd88b6aa09d24/google_auth_oauthlib-1.2.2-py3-none-any.whl", hash = "sha256:fd619506f4b3908b5df17b65f39ca8d66ea56986e5472eb5978fd8f3786f00a2", size = 19072, upload-time = "2025-04-22T16:40:28.174Z" }, -] - -[[package]] -name = "google-cloud-bigquery" -version = "3.35.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core", extra = ["grpc"] }, - { name = "google-auth" }, - { name = "google-cloud-core" }, - { name = "google-resumable-media" }, - { name = "packaging" }, - { name = "python-dateutil" 
}, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/44/e4/9cf03fa81fefd1b9811a7cd6e398804ae0de3b6a4edef810e2acd45cabbc/google_cloud_bigquery-3.35.1.tar.gz", hash = "sha256:599f26cacf190acfe88000f6cc5f4bc9e6baac7899e4f406ca054f1906f71960", size = 496433, upload-time = "2025-07-24T15:09:04.108Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/63/50/96fe9bc5b83d3a421e91ed8edc2535de45957e9af398273e3ecb5c3a1094/google_cloud_bigquery-3.35.1-py3-none-any.whl", hash = "sha256:6739a6ba63c6d80735ca2b34b1df2090ff473b80c1a62354caa2debe6dbbd961", size = 256877, upload-time = "2025-07-24T15:09:02.443Z" }, -] - -[[package]] -name = "google-cloud-core" -version = "2.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d6/b8/2b53838d2acd6ec6168fd284a990c76695e84c65deee79c9f3a4276f6b4f/google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53", size = 35861, upload-time = "2025-03-10T21:05:38.948Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e", size = 29348, upload-time = "2025-03-10T21:05:37.785Z" }, -] - -[[package]] -name = "google-crc32c" -version = "1.7.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/eb/69/b1b05cf415df0d86691d6a8b4b7e60ab3a6fb6efb783ee5cd3ed1382bfd3/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76", size = 30467, upload-time = "2025-03-26T14:31:11.92Z" }, - { url = "https://files.pythonhosted.org/packages/44/3d/92f8928ecd671bd5b071756596971c79d252d09b835cdca5a44177fa87aa/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d", size = 30311, upload-time = "2025-03-26T14:53:14.161Z" }, - { url = "https://files.pythonhosted.org/packages/33/42/c2d15a73df79d45ed6b430b9e801d0bd8e28ac139a9012d7d58af50a385d/google_crc32c-1.7.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c67ca0a1f5b56162951a9dae987988679a7db682d6f97ce0f6381ebf0fbea4c", size = 37889, upload-time = "2025-03-26T14:41:27.83Z" }, - { url = "https://files.pythonhosted.org/packages/57/ea/ac59c86a3c694afd117bb669bde32aaf17d0de4305d01d706495f09cbf19/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc5319db92daa516b653600794d5b9f9439a9a121f3e162f94b0e1891c7933cb", size = 33028, upload-time = "2025-03-26T14:41:29.141Z" }, - { url = "https://files.pythonhosted.org/packages/60/44/87e77e8476767a4a93f6cf271157c6d948eacec63688c093580af13b04be/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcdf5a64adb747610140572ed18d011896e3b9ae5195f2514b7ff678c80f1603", size = 38026, upload-time = "2025-03-26T14:41:29.921Z" }, - { url = "https://files.pythonhosted.org/packages/c8/bf/21ac7bb305cd7c1a6de9c52f71db0868e104a5b573a4977cd9d0ff830f82/google_crc32c-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:754561c6c66e89d55754106739e22fdaa93fafa8da7221b29c8b8e8270c6ec8a", size = 33476, upload-time = "2025-03-26T14:29:09.086Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/94/220139ea87822b6fdfdab4fb9ba81b3fff7ea2c82e2af34adc726085bffc/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6fbab4b935989e2c3610371963ba1b86afb09537fd0c633049be82afe153ac06", size = 30468, upload-time = "2025-03-26T14:32:52.215Z" }, - { url = "https://files.pythonhosted.org/packages/94/97/789b23bdeeb9d15dc2904660463ad539d0318286d7633fe2760c10ed0c1c/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ed66cbe1ed9cbaaad9392b5259b3eba4a9e565420d734e6238813c428c3336c9", size = 30313, upload-time = "2025-03-26T14:57:38.758Z" }, - { url = "https://files.pythonhosted.org/packages/81/b8/976a2b843610c211e7ccb3e248996a61e87dbb2c09b1499847e295080aec/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee6547b657621b6cbed3562ea7826c3e11cab01cd33b74e1f677690652883e77", size = 33048, upload-time = "2025-03-26T14:41:30.679Z" }, - { url = "https://files.pythonhosted.org/packages/c9/16/a3842c2cf591093b111d4a5e2bfb478ac6692d02f1b386d2a33283a19dc9/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d68e17bad8f7dd9a49181a1f5a8f4b251c6dbc8cc96fb79f1d321dfd57d66f53", size = 32669, upload-time = "2025-03-26T14:41:31.432Z" }, - { url = "https://files.pythonhosted.org/packages/04/17/ed9aba495916fcf5fe4ecb2267ceb851fc5f273c4e4625ae453350cfd564/google_crc32c-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:6335de12921f06e1f774d0dd1fbea6bf610abe0887a1638f64d694013138be5d", size = 33476, upload-time = "2025-03-26T14:29:10.211Z" }, - { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, - { url = 
"https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, - { url = "https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, - { url = "https://files.pythonhosted.org/packages/3b/a5/7279055cf004561894ed3a7bfdf5bf90a53f28fadd01af7cd166e88ddf16/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d86616faaea68101195c6bdc40c494e4d76f41e07a37ffdef270879c15fb65", size = 32794, upload-time = "2025-03-26T14:41:33.264Z" }, - { url = "https://files.pythonhosted.org/packages/0f/d6/77060dbd140c624e42ae3ece3df53b9d811000729a5c821b9fd671ceaac6/google_crc32c-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:b7491bdc0c7564fcf48c0179d2048ab2f7c7ba36b84ccd3a3e1c3f7a72d3bba6", size = 33477, upload-time = "2025-03-26T14:29:10.94Z" }, - { url = "https://files.pythonhosted.org/packages/8b/72/b8d785e9184ba6297a8620c8a37cf6e39b81a8ca01bb0796d7cbb28b3386/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:df8b38bdaf1629d62d51be8bdd04888f37c451564c2042d36e5812da9eff3c35", size = 30467, upload-time = "2025-03-26T14:36:06.909Z" }, - { url = "https://files.pythonhosted.org/packages/34/25/5f18076968212067c4e8ea95bf3b69669f9fc698476e5f5eb97d5b37999f/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:e42e20a83a29aa2709a0cf271c7f8aefaa23b7ab52e53b322585297bb94d4638", size = 30309, upload-time = "2025-03-26T15:06:15.318Z" }, - { url = 
"https://files.pythonhosted.org/packages/92/83/9228fe65bf70e93e419f38bdf6c5ca5083fc6d32886ee79b450ceefd1dbd/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905a385140bf492ac300026717af339790921f411c0dfd9aa5a9e69a08ed32eb", size = 33133, upload-time = "2025-03-26T14:41:34.388Z" }, - { url = "https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6", size = 32773, upload-time = "2025-03-26T14:41:35.19Z" }, - { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, - { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, - { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, - { url = "https://files.pythonhosted.org/packages/0b/43/31e57ce04530794917dfe25243860ec141de9fadf4aa9783dffe7dac7c39/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8e9afc74168b0b2232fb32dd202c93e46b7d5e4bf03e66ba5dc273bb3559589", size = 28242, upload-time = 
"2025-03-26T14:41:42.858Z" }, - { url = "https://files.pythonhosted.org/packages/eb/f3/8b84cd4e0ad111e63e30eb89453f8dd308e3ad36f42305cf8c202461cdf0/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa8136cc14dd27f34a3221c0f16fd42d8a40e4778273e61a3c19aedaa44daf6b", size = 28049, upload-time = "2025-03-26T14:41:44.651Z" }, - { url = "https://files.pythonhosted.org/packages/16/1b/1693372bf423ada422f80fd88260dbfd140754adb15cbc4d7e9a68b1cb8e/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85fef7fae11494e747c9fd1359a527e5970fc9603c90764843caabd3a16a0a48", size = 28241, upload-time = "2025-03-26T14:41:45.898Z" }, - { url = "https://files.pythonhosted.org/packages/fd/3c/2a19a60a473de48717b4efb19398c3f914795b64a96cf3fbe82588044f78/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efb97eb4369d52593ad6f75e7e10d053cf00c48983f7a973105bc70b0ac4d82", size = 28048, upload-time = "2025-03-26T14:41:46.696Z" }, -] - -[[package]] -name = "google-resumable-media" -version = "2.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-crc32c" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099, upload-time = "2024-08-07T22:20:38.555Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251, upload-time = "2024-08-07T22:20:36.409Z" }, -] - [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -1873,20 +1656,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, ] -[[package]] -name = "grpcio-status" -version = "1.71.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "googleapis-common-protos" }, - { name = "grpcio" }, - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fd/d1/b6e9877fedae3add1afdeae1f89d1927d296da9cf977eca0eb08fb8a460e/grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50", size = 13677, upload-time = "2025-06-28T04:24:05.426Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/67/58/317b0134129b556a93a3b0afe00ee675b5657f0155509e22fcb853bafe2d/grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3", size = 14424, upload-time = "2025-06-28T04:23:42.136Z" }, -] - [[package]] name = "grpclib" version = "0.4.8" @@ -2280,15 +2049,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/55/e5326141505c5d5e34c5e0935d2908a74e4561eca44108fbfb9c13d2911a/isoduration-20.11.0-py3-none-any.whl", hash = "sha256:b2904c2a4228c3d44f409c8ae8e2370eb21a26f7ac2ec5446df141dde3452042", size = 11321, upload-time = "2020-11-01T10:59:58.02Z" }, ] -[[package]] -name = "isort" -version = "6.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b8/21/1e2a441f74a653a144224d7d21afe8f4169e6c7c20bb13aec3a2dc3815e0/isort-6.0.1.tar.gz", hash = "sha256:1cb5df28dfbc742e490c5e41bad6da41b805b0a8be7bc93cd0fb2a8a890ac450", size = 821955, upload-time = "2025-02-26T21:13:16.955Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/c1/11/114d0a5f4dabbdcedc1125dee0888514c3c3b16d3e9facad87ed96fad97c/isort-6.0.1-py3-none-any.whl", hash = "sha256:2dc5d7f65c9678d94c88dfc29161a320eec67328bc97aad576874cb4be1e9615", size = 94186, upload-time = "2025-02-26T21:13:14.911Z" }, -] - [[package]] name = "jaconv" version = "0.4.0" @@ -2965,15 +2725,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, ] -[[package]] -name = "mccabe" -version = "0.7.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e7/ff/0ffefdcac38932a54d2b5eed4e0ba8a408f215002cd178ad1df0f2806ff8/mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", size = 9658, upload-time = "2022-01-24T01:14:51.113Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/27/1a/1f68f9ba0c207934b35b86a8ca3aad8395a3d6dd7921c0686e23853ff5a9/mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e", size = 7350, upload-time = "2022-01-24T01:14:49.62Z" }, -] - [[package]] name = "mcp" version = "1.12.2" @@ -3227,45 +2978,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/da/d9/f7f9379981e39b8c2511c9e0326d212accacb82f12fbfdc1aa2ce2a7b2b6/multiprocess-0.70.16-py39-none-any.whl", hash = "sha256:a0bafd3ae1b732eac64be2e72038231c1ba97724b60b09400d68f229fcc2fbf3", size = 133351, upload-time = "2024-01-28T18:52:31.981Z" }, ] -[[package]] -name = "mypy" -version = "1.17.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mypy-extensions" }, - { name = "pathspec" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = 
"typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1e/e3/034322d5a779685218ed69286c32faa505247f1f096251ef66c8fd203b08/mypy-1.17.0.tar.gz", hash = "sha256:e5d7ccc08ba089c06e2f5629c660388ef1fee708444f1dee0b9203fa031dee03", size = 3352114, upload-time = "2025-07-14T20:34:30.181Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/31/e762baa3b73905c856d45ab77b4af850e8159dffffd86a52879539a08c6b/mypy-1.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f8e08de6138043108b3b18f09d3f817a4783912e48828ab397ecf183135d84d6", size = 10998313, upload-time = "2025-07-14T20:33:24.519Z" }, - { url = "https://files.pythonhosted.org/packages/1c/c1/25b2f0d46fb7e0b5e2bee61ec3a47fe13eff9e3c2f2234f144858bbe6485/mypy-1.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce4a17920ec144647d448fc43725b5873548b1aae6c603225626747ededf582d", size = 10128922, upload-time = "2025-07-14T20:34:06.414Z" }, - { url = "https://files.pythonhosted.org/packages/02/78/6d646603a57aa8a2886df1b8881fe777ea60f28098790c1089230cd9c61d/mypy-1.17.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ff25d151cc057fdddb1cb1881ef36e9c41fa2a5e78d8dd71bee6e4dcd2bc05b", size = 11913524, upload-time = "2025-07-14T20:33:19.109Z" }, - { url = "https://files.pythonhosted.org/packages/4f/19/dae6c55e87ee426fb76980f7e78484450cad1c01c55a1dc4e91c930bea01/mypy-1.17.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93468cf29aa9a132bceb103bd8475f78cacde2b1b9a94fd978d50d4bdf616c9a", size = 12650527, upload-time = "2025-07-14T20:32:44.095Z" }, - { url = "https://files.pythonhosted.org/packages/86/e1/f916845a235235a6c1e4d4d065a3930113767001d491b8b2e1b61ca56647/mypy-1.17.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:98189382b310f16343151f65dd7e6867386d3e35f7878c45cfa11383d175d91f", size = 12897284, upload-time = "2025-07-14T20:33:38.168Z" }, - { url = 
"https://files.pythonhosted.org/packages/ae/dc/414760708a4ea1b096bd214d26a24e30ac5e917ef293bc33cdb6fe22d2da/mypy-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:c004135a300ab06a045c1c0d8e3f10215e71d7b4f5bb9a42ab80236364429937", size = 9506493, upload-time = "2025-07-14T20:34:01.093Z" }, - { url = "https://files.pythonhosted.org/packages/d4/24/82efb502b0b0f661c49aa21cfe3e1999ddf64bf5500fc03b5a1536a39d39/mypy-1.17.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9d4fe5c72fd262d9c2c91c1117d16aac555e05f5beb2bae6a755274c6eec42be", size = 10914150, upload-time = "2025-07-14T20:31:51.985Z" }, - { url = "https://files.pythonhosted.org/packages/03/96/8ef9a6ff8cedadff4400e2254689ca1dc4b420b92c55255b44573de10c54/mypy-1.17.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d96b196e5c16f41b4f7736840e8455958e832871990c7ba26bf58175e357ed61", size = 10039845, upload-time = "2025-07-14T20:32:30.527Z" }, - { url = "https://files.pythonhosted.org/packages/df/32/7ce359a56be779d38021d07941cfbb099b41411d72d827230a36203dbb81/mypy-1.17.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:73a0ff2dd10337ceb521c080d4147755ee302dcde6e1a913babd59473904615f", size = 11837246, upload-time = "2025-07-14T20:32:01.28Z" }, - { url = "https://files.pythonhosted.org/packages/82/16/b775047054de4d8dbd668df9137707e54b07fe18c7923839cd1e524bf756/mypy-1.17.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24cfcc1179c4447854e9e406d3af0f77736d631ec87d31c6281ecd5025df625d", size = 12571106, upload-time = "2025-07-14T20:34:26.942Z" }, - { url = "https://files.pythonhosted.org/packages/a1/cf/fa33eaf29a606102c8d9ffa45a386a04c2203d9ad18bf4eef3e20c43ebc8/mypy-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3c56f180ff6430e6373db7a1d569317675b0a451caf5fef6ce4ab365f5f2f6c3", size = 12759960, upload-time = "2025-07-14T20:33:42.882Z" }, - { url = 
"https://files.pythonhosted.org/packages/94/75/3f5a29209f27e739ca57e6350bc6b783a38c7621bdf9cac3ab8a08665801/mypy-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:eafaf8b9252734400f9b77df98b4eee3d2eecab16104680d51341c75702cad70", size = 9503888, upload-time = "2025-07-14T20:32:34.392Z" }, - { url = "https://files.pythonhosted.org/packages/12/e9/e6824ed620bbf51d3bf4d6cbbe4953e83eaf31a448d1b3cfb3620ccb641c/mypy-1.17.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f986f1cab8dbec39ba6e0eaa42d4d3ac6686516a5d3dccd64be095db05ebc6bb", size = 11086395, upload-time = "2025-07-14T20:34:11.452Z" }, - { url = "https://files.pythonhosted.org/packages/ba/51/a4afd1ae279707953be175d303f04a5a7bd7e28dc62463ad29c1c857927e/mypy-1.17.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:51e455a54d199dd6e931cd7ea987d061c2afbaf0960f7f66deef47c90d1b304d", size = 10120052, upload-time = "2025-07-14T20:33:09.897Z" }, - { url = "https://files.pythonhosted.org/packages/8a/71/19adfeac926ba8205f1d1466d0d360d07b46486bf64360c54cb5a2bd86a8/mypy-1.17.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3204d773bab5ff4ebbd1f8efa11b498027cd57017c003ae970f310e5b96be8d8", size = 11861806, upload-time = "2025-07-14T20:32:16.028Z" }, - { url = "https://files.pythonhosted.org/packages/0b/64/d6120eca3835baf7179e6797a0b61d6c47e0bc2324b1f6819d8428d5b9ba/mypy-1.17.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1051df7ec0886fa246a530ae917c473491e9a0ba6938cfd0ec2abc1076495c3e", size = 12744371, upload-time = "2025-07-14T20:33:33.503Z" }, - { url = "https://files.pythonhosted.org/packages/1f/dc/56f53b5255a166f5bd0f137eed960e5065f2744509dfe69474ff0ba772a5/mypy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f773c6d14dcc108a5b141b4456b0871df638eb411a89cd1c0c001fc4a9d08fc8", size = 12914558, upload-time = "2025-07-14T20:33:56.961Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/ac/070bad311171badc9add2910e7f89271695a25c136de24bbafc7eded56d5/mypy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:1619a485fd0e9c959b943c7b519ed26b712de3002d7de43154a489a2d0fd817d", size = 9585447, upload-time = "2025-07-14T20:32:20.594Z" }, - { url = "https://files.pythonhosted.org/packages/be/7b/5f8ab461369b9e62157072156935cec9d272196556bdc7c2ff5f4c7c0f9b/mypy-1.17.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2c41aa59211e49d717d92b3bb1238c06d387c9325d3122085113c79118bebb06", size = 11070019, upload-time = "2025-07-14T20:32:07.99Z" }, - { url = "https://files.pythonhosted.org/packages/9c/f8/c49c9e5a2ac0badcc54beb24e774d2499748302c9568f7f09e8730e953fa/mypy-1.17.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0e69db1fb65b3114f98c753e3930a00514f5b68794ba80590eb02090d54a5d4a", size = 10114457, upload-time = "2025-07-14T20:33:47.285Z" }, - { url = "https://files.pythonhosted.org/packages/89/0c/fb3f9c939ad9beed3e328008b3fb90b20fda2cddc0f7e4c20dbefefc3b33/mypy-1.17.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03ba330b76710f83d6ac500053f7727270b6b8553b0423348ffb3af6f2f7b889", size = 11857838, upload-time = "2025-07-14T20:33:14.462Z" }, - { url = "https://files.pythonhosted.org/packages/4c/66/85607ab5137d65e4f54d9797b77d5a038ef34f714929cf8ad30b03f628df/mypy-1.17.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:037bc0f0b124ce46bfde955c647f3e395c6174476a968c0f22c95a8d2f589bba", size = 12731358, upload-time = "2025-07-14T20:32:25.579Z" }, - { url = "https://files.pythonhosted.org/packages/73/d0/341dbbfb35ce53d01f8f2969facbb66486cee9804048bf6c01b048127501/mypy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:c38876106cb6132259683632b287238858bd58de267d80defb6f418e9ee50658", size = 12917480, upload-time = "2025-07-14T20:34:21.868Z" }, - { url = 
"https://files.pythonhosted.org/packages/64/63/70c8b7dbfc520089ac48d01367a97e8acd734f65bd07813081f508a8c94c/mypy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:d30ba01c0f151998f367506fab31c2ac4527e6a7b2690107c7a7f9e3cb419a9c", size = 9589666, upload-time = "2025-07-14T20:34:16.841Z" }, - { url = "https://files.pythonhosted.org/packages/e3/fc/ee058cc4316f219078464555873e99d170bde1d9569abd833300dbeb484a/mypy-1.17.0-py3-none-any.whl", hash = "sha256:15d9d0018237ab058e5de3d8fce61b6fa72cc59cc78fd91f1b474bce12abf496", size = 2283195, upload-time = "2025-07-14T20:31:54.753Z" }, -] - [[package]] name = "mypy-extensions" version = "1.1.0" @@ -3719,15 +3431,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265, upload-time = "2024-10-01T17:00:38.172Z" }, ] -[[package]] -name = "oauthlib" -version = "3.3.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" }, -] - [[package]] name = "omegaconf" version = "2.3.0" @@ -4072,15 +3775,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = 
"sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650, upload-time = "2024-04-05T09:43:53.299Z" }, ] -[[package]] -name = "pathspec" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, -] - [[package]] name = "peewee" version = "3.18.2" @@ -4376,18 +4070,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] -[[package]] -name = "proto-plus" -version = "1.26.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, -] - [[package]] name = "protobuf" version 
= "5.29.3" @@ -4478,27 +4160,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, ] -[[package]] -name = "pyasn1" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, -] - [[package]] name = "pycares" version = "4.9.0" @@ -4570,15 +4231,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/f8/b4d4bf71ae92727a0b3a9b9092c2e722833c1ca50ebd0414824843cb84fd/pycares-4.9.0-cp313-cp313-win_arm64.whl", hash = "sha256:faa9de8e647ed06757a2c117b70a7645a755561def814da6aca0d766cf71a402", size = 115646, upload-time = "2025-06-13T00:37:33.251Z" }, ] -[[package]] -name = "pycodestyle" -version = "2.14.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/11/e0/abfd2a0d2efe47670df87f3e3a0e2edda42f055053c85361f19c0e2c1ca8/pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783", size = 39472, upload-time = "2025-06-20T18:49:48.75Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d7/27/a58ddaf8c588a3ef080db9d0b7e0b97215cee3a45df74f3a94dbbf5c893a/pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d", size = 31594, upload-time = "2025-06-20T18:49:47.491Z" }, -] - [[package]] name = "pycparser" version = "2.22" @@ -4709,15 +4361,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/58/f0/427018098906416f580e3cf1366d3b1abfb408a0652e9f31600c24a1903c/pydantic_settings-2.10.1-py3-none-any.whl", hash = "sha256:a60952460b99cf661dc25c29c0ef171721f98bfcb52ef8d9ea4c943d7c8cc796", size = 45235, upload-time = "2025-06-24T13:26:45.485Z" }, ] -[[package]] -name = "pyflakes" -version = "3.4.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/45/dc/fd034dc20b4b264b3d015808458391acbf9df40b1e54750ef175d39180b1/pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58", size = 64669, upload-time = "2025-06-20T18:45:27.834Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c2/2f/81d580a0fb83baeb066698975cb14a618bdbed7720678566f1b046a95fe8/pyflakes-3.4.0-py2.py3-none-any.whl", hash = 
"sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f", size = 63551, upload-time = "2025-06-20T18:45:26.937Z" }, -] - [[package]] name = "pygame" version = "2.6.1" @@ -4791,6 +4434,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/24/12818598c362d7f300f18e74db45963dbcb85150324092410c8b49405e42/pyproject_hooks-1.2.0-py3-none-any.whl", hash = "sha256:9e5c6bfa8dcc30091c74b0cf803c81fdd29d94f01992a7707bc97babb1141913", size = 10216, upload-time = "2024-09-29T09:24:11.978Z" }, ] +[[package]] +name = "pyright" +version = "1.1.403" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nodeenv" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fe/f6/35f885264ff08c960b23d1542038d8da86971c5d8c955cfab195a4f672d7/pyright-1.1.403.tar.gz", hash = "sha256:3ab69b9f41c67fb5bbb4d7a36243256f0d549ed3608678d381d5f51863921104", size = 3913526, upload-time = "2025-07-09T07:15:52.882Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/b6/b04e5c2f41a5ccad74a1a4759da41adb20b4bc9d59a5e08d29ba60084d07/pyright-1.1.403-py3-none-any.whl", hash = "sha256:c0eeca5aa76cbef3fcc271259bbd785753c7ad7bcac99a9162b4c4c7daed23b3", size = 5684504, upload-time = "2025-07-09T07:15:50.958Z" }, +] + [[package]] name = "pysocks" version = "1.7.1" @@ -5178,19 +4834,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7c/e4/56027c4a6b4ae70ca9de302488c5ca95ad4a39e190093d6c1a8ace08341b/requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c", size = 64847, upload-time = "2025-06-09T16:43:05.728Z" }, ] -[[package]] -name = "requests-oauthlib" -version = "2.0.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "oauthlib" }, - { name = "requests" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" }, -] - [[package]] name = "requests-toolbelt" version = "1.0.0" @@ -5535,18 +5178,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, ] -[[package]] -name = "rsa" -version = "4.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, -] - [[package]] name = "ruff" version = "0.9.10" diff --git a/vendor/tau2/__init__.py b/vendor/tau2/__init__.py index 8b137891..e69de29b 100644 --- a/vendor/tau2/__init__.py +++ b/vendor/tau2/__init__.py @@ -1 +0,0 @@ - diff --git a/vendor/tau2/agent/README.md 
b/vendor/tau2/agent/README.md index fa201945..ee77cfb6 100644 --- a/vendor/tau2/agent/README.md +++ b/vendor/tau2/agent/README.md @@ -32,4 +32,4 @@ tau2 run \ --agent-llm \ --user-llm \ ... -``` \ No newline at end of file +``` diff --git a/vendor/tau2/agent/base.py b/vendor/tau2/agent/base.py index 7a345432..4d6d9dbd 100644 --- a/vendor/tau2/agent/base.py +++ b/vendor/tau2/agent/base.py @@ -73,9 +73,7 @@ def set_seed(self, seed: int): """ Set the seed for the agent. [Optional] """ - logger.warning( - f"Setting seed for agent is not implemented for class {self.__class__.__name__}" - ) + logger.warning(f"Setting seed for agent is not implemented for class {self.__class__.__name__}") class LocalAgent(BaseAgent[AgentState]): diff --git a/vendor/tau2/agent/llm_agent.py b/vendor/tau2/agent/llm_agent.py index b2fdee99..dffc106d 100644 --- a/vendor/tau2/agent/llm_agent.py +++ b/vendor/tau2/agent/llm_agent.py @@ -82,9 +82,9 @@ def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLM """ if message_history is None: message_history = [] - assert all( - is_valid_agent_history_message(m) for m in message_history - ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." + assert all(is_valid_agent_history_message(m) for m in message_history), ( + "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." + ) return LLMAgentState( system_messages=[SystemMessage(role="system", content=self.system_prompt)], messages=message_history, @@ -206,9 +206,9 @@ def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLM """ if message_history is None: message_history = [] - assert all( - is_valid_agent_history_message(m) for m in message_history - ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." 
+ assert all(is_valid_agent_history_message(m) for m in message_history), ( + "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." + ) return LLMAgentState( system_messages=[SystemMessage(role="system", content=self.system_prompt)], messages=message_history, @@ -416,9 +416,9 @@ def get_init_state(self, message_history: Optional[list[Message]] = None) -> LLM """ if message_history is None: message_history = [] - assert all( - is_valid_agent_history_message(m) for m in message_history - ), "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." + assert all(is_valid_agent_history_message(m) for m in message_history), ( + "Message history must contain only AssistantMessage, UserMessage, or ToolMessage to Agent." + ) return LLMAgentState( system_messages=[SystemMessage(role="system", content=self.system_prompt)], messages=message_history, diff --git a/vendor/tau2/cli.py b/vendor/tau2/cli.py index 65b2d115..89056109 100644 --- a/vendor/tau2/cli.py +++ b/vendor/tau2/cli.py @@ -193,9 +193,7 @@ def main(): start_parser.set_defaults(func=lambda args: run_start_servers()) # Check data command - check_data_parser = subparsers.add_parser( - "check-data", help="Check if data directory is properly configured" - ) + check_data_parser = subparsers.add_parser("check-data", help="Check if data directory is properly configured") check_data_parser.set_defaults(func=lambda args: run_check_data()) args = parser.parse_args() diff --git a/vendor/tau2/data/user_simulator/simulation_guidelines.md b/vendor/tau2/data/user_simulator/simulation_guidelines.md index 8bf34059..f7a559fd 100644 --- a/vendor/tau2/data/user_simulator/simulation_guidelines.md +++ b/vendor/tau2/data/user_simulator/simulation_guidelines.md @@ -1,5 +1,5 @@ # User Simulation Guidelines -You are playing the role of a customer contacting a customer service representative. 
+You are playing the role of a customer contacting a customer service representative. Your goal is to simulate realistic customer interactions while following specific scenario instructions. ## Core Principles @@ -15,4 +15,4 @@ Your goal is to simulate realistic customer interactions while following specifi - If you are transferred to another agent, generate the '###TRANSFER###' token to indicate the transfer. - If you find yourself in a situation in which the scenario does not provide enough information for you to continue the conversation, generate the '###OUT-OF-SCOPE###' token to end the conversation. -Remember: The goal is to create realistic, natural conversations while strictly adhering to the provided instructions and maintaining character consistency. \ No newline at end of file +Remember: The goal is to create realistic, natural conversations while strictly adhering to the provided instructions and maintaining character consistency. diff --git a/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md b/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md index 09f85a50..33908510 100644 --- a/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md +++ b/vendor/tau2/data/user_simulator/simulation_guidelines_tools.md @@ -1,6 +1,6 @@ # User Simulation Guidelines -You are playing the role of a customer contacting a customer service representative agent. +You are playing the role of a customer contacting a customer service representative agent. Your goal is to simulate realistic customer interactions while following specific scenario instructions. You have some tools to perform the actions on your end that might be requested by the agent to diagnose and resolve your issue. @@ -27,4 +27,4 @@ You have some tools to perform the actions on your end that might be requested b - If you have been transferred to another agent, generate the '###TRANSFER###' token to indicate the transfer. 
Only do this after the agent has clearly indicated that you are being transferred. - If you find yourself in a situation in which the scenario does not provide enough information for you to continue the conversation, generate the '###OUT-OF-SCOPE###' token to end the conversation. -Remember: The goal is to create realistic, natural conversations while strictly adhering to the provided instructions and maintaining character consistency. \ No newline at end of file +Remember: The goal is to create realistic, natural conversations while strictly adhering to the provided instructions and maintaining character consistency. diff --git a/vendor/tau2/data_model/__init__.py b/vendor/tau2/data_model/__init__.py index 8b137891..e69de29b 100644 --- a/vendor/tau2/data_model/__init__.py +++ b/vendor/tau2/data_model/__init__.py @@ -1 +0,0 @@ - diff --git a/vendor/tau2/data_model/message.py b/vendor/tau2/data_model/message.py index ef5f1f7b..077c176d 100644 --- a/vendor/tau2/data_model/message.py +++ b/vendor/tau2/data_model/message.py @@ -18,15 +18,9 @@ class SystemMessage(BaseModel): """ role: SystemRole = Field(description="The role of the message sender.") - content: Optional[str] = Field( - description="The content of the message.", default=None - ) - turn_idx: Optional[int] = Field( - description="The index of the turn in the conversation.", default=None - ) - timestamp: Optional[str] = Field( - description="The timestamp of the message.", default_factory=get_now - ) + content: Optional[str] = Field(description="The content of the message.", default=None) + turn_idx: Optional[int] = Field(description="The index of the turn in the conversation.", default=None) + timestamp: Optional[str] = Field(description="The timestamp of the message.", default_factory=get_now) def __str__(self) -> str: lines = [ @@ -87,35 +81,21 @@ class ParticipantMessageBase(BaseModel): role: str = Field(description="The role of the message sender.") - content: Optional[str] = Field( - description="The 
content of the message.", default=None - ) - tool_calls: Optional[list[ToolCall]] = Field( - description="The tool calls made in the message.", default=None - ) - turn_idx: Optional[int] = Field( - description="The index of the turn in the conversation.", default=None - ) - timestamp: Optional[str] = Field( - description="The timestamp of the message.", default_factory=get_now - ) + content: Optional[str] = Field(description="The content of the message.", default=None) + tool_calls: Optional[list[ToolCall]] = Field(description="The tool calls made in the message.", default=None) + turn_idx: Optional[int] = Field(description="The index of the turn in the conversation.", default=None) + timestamp: Optional[str] = Field(description="The timestamp of the message.", default_factory=get_now) cost: Optional[float] = Field(description="The cost of the message.", default=None) - usage: Optional[dict] = Field( - description="The token usage of the message.", default=None - ) - raw_data: Optional[dict] = Field( - description="The raw data of the message.", default=None - ) + usage: Optional[dict] = Field(description="The token usage of the message.", default=None) + raw_data: Optional[dict] = Field(description="The raw data of the message.", default=None) def validate(self): # NOTE: It would be better to do this in the Pydantic model """ Validate the message. """ if not (self.has_text_content() or self.is_tool_call()): - raise ValueError( - f"AssistantMessage must have either content or tool calls. Got {self}" - ) + raise ValueError(f"AssistantMessage must have either content or tool calls. 
Got {self}") def has_text_content(self) -> bool: """ @@ -151,11 +131,7 @@ def __str__(self) -> str: def __eq__(self, other: object) -> bool: if type(other) is not type(self): return False - return ( - self.role == other.role - and self.content == other.content - and self.tool_calls == other.tool_calls - ) + return self.role == other.role and self.content == other.content and self.tool_calls == other.tool_calls class AssistantMessage(ParticipantMessageBase): @@ -187,12 +163,8 @@ class ToolMessage(BaseModel): description="The requestor of the tool call.", ) error: bool = Field(description="Whether the tool call failed.", default=False) - turn_idx: Optional[int] = Field( - description="The index of the turn in the conversation.", default=None - ) - timestamp: Optional[str] = Field( - description="The timestamp of the message.", default_factory=get_now - ) + turn_idx: Optional[int] = Field(description="The index of the turn in the conversation.", default=None) + timestamp: Optional[str] = Field(description="The timestamp of the message.", default_factory=get_now) def __str__(self) -> str: lines = [f"ToolMessage (responding to {self.requestor})"] @@ -228,6 +200,4 @@ class MultiToolMessage(BaseModel): APICompatibleMessage = SystemMessage | AssistantMessage | UserMessage | ToolMessage -Message = ( - SystemMessage | AssistantMessage | UserMessage | ToolMessage | MultiToolMessage -) +Message = SystemMessage | AssistantMessage | UserMessage | ToolMessage | MultiToolMessage diff --git a/vendor/tau2/data_model/simulation.py b/vendor/tau2/data_model/simulation.py index 41de1d72..ee630cf7 100644 --- a/vendor/tau2/data_model/simulation.py +++ b/vendor/tau2/data_model/simulation.py @@ -216,9 +216,7 @@ class RewardInfo(BaseModel): """ reward: Annotated[float, Field(description="The reward received by the agent.")] - db_check: Annotated[ - Optional[DBCheck], Field(description="The database check.", default=None) - ] + db_check: Annotated[Optional[DBCheck], Field(description="The 
database check.", default=None)] env_assertions: Annotated[ Optional[list[EnvAssertionCheck]], Field(description="The environment assertions.", default=None), @@ -265,9 +263,7 @@ class AgentInfo(BaseModel): implementation: str = Field(description="The type of agent.") llm: Optional[str] = Field(description="The LLM used by the agent.", default=None) - llm_args: Optional[dict] = Field( - description="The arguments to pass to the LLM for the agent.", default=None - ) + llm_args: Optional[dict] = Field(description="The arguments to pass to the LLM for the agent.", default=None) class UserInfo(BaseModel): @@ -277,9 +273,7 @@ class UserInfo(BaseModel): implementation: str = Field(description="The type of user.") llm: Optional[str] = Field(description="The LLM used by the user.", default=None) - llm_args: Optional[dict] = Field( - description="The arguments to pass to the LLM for the user.", default=None - ) + llm_args: Optional[dict] = Field(description="The arguments to pass to the LLM for the user.", default=None) global_simulation_guidelines: Optional[str] = Field( description="The global simulation guidelines for the user.", default=None ) @@ -295,9 +289,7 @@ class Info(BaseModel): user_info: UserInfo = Field(description="User information.") agent_info: AgentInfo = Field(description="Agent information.") environment_info: EnvironmentInfo = Field(description="Environment information.") - seed: Optional[int] = Field( - description="The seed used for the simulation.", default=None - ) + seed: Optional[int] = Field(description="The seed used for the simulation.", default=None) class TerminationReason(str, Enum): @@ -314,31 +306,17 @@ class SimulationRun(BaseModel): id: str = Field(description="The unique identifier for the simulation run.") task_id: str = Field(description="The unique identifier for the task.") - timestamp: str = Field( - description="The timestamp of the simulation.", default_factory=get_now - ) + timestamp: str = Field(description="The timestamp of 
the simulation.", default_factory=get_now) start_time: str = Field(description="The start time of the simulation.") end_time: str = Field(description="The end time of the simulation.") duration: float = Field(description="The duration of the simulation.") - termination_reason: TerminationReason = Field( - description="The reason for the termination of the simulation." - ) - agent_cost: Optional[float] = Field( - description="The cost of the agent.", default=None - ) - user_cost: Optional[float] = Field( - description="The cost of the user.", default=None - ) - reward_info: Optional[RewardInfo] = Field( - description="The reward received by the agent.", default=None - ) - messages: list[Message] = Field( - description="The messages exchanged between the user, agent and environment." - ) + termination_reason: TerminationReason = Field(description="The reason for the termination of the simulation.") + agent_cost: Optional[float] = Field(description="The cost of the agent.", default=None) + user_cost: Optional[float] = Field(description="The cost of the user.", default=None) + reward_info: Optional[RewardInfo] = Field(description="The reward received by the agent.", default=None) + messages: list[Message] = Field(description="The messages exchanged between the user, agent and environment.") trial: Optional[int] = Field(description="Trial number", default=None) - seed: Optional[int] = Field( - description="Seed used for the simulation.", default=None - ) + seed: Optional[int] = Field(description="Seed used for the simulation.", default=None) class Results(BaseModel): @@ -346,9 +324,7 @@ class Results(BaseModel): Run results """ - timestamp: Optional[str] = Field( - description="The timestamp of the simulation.", default_factory=get_now - ) + timestamp: Optional[str] = Field(description="The timestamp of the simulation.", default_factory=get_now) info: Info = Field(description="Information.") tasks: list[Task] = Field(description="The list of tasks.") simulations: 
list[SimulationRun] = Field(description="The list of simulations.") @@ -387,14 +363,8 @@ def transfer_only(task: Task) -> bool: return False def get_task_metrics(task: Task) -> dict: - eval_metrics = ( - task.evaluation_criteria.info() - if task.evaluation_criteria is not None - else {} - ) - num_actions = ( - eval_metrics["num_agent_actions"] + eval_metrics["num_user_actions"] - ) + eval_metrics = task.evaluation_criteria.info() if task.evaluation_criteria is not None else {} + num_actions = eval_metrics["num_agent_actions"] + eval_metrics["num_user_actions"] if transfer_only(task): num_actions = -1 info = { diff --git a/vendor/tau2/data_model/tasks.py b/vendor/tau2/data_model/tasks.py index ef17dc3c..c3105557 100644 --- a/vendor/tau2/data_model/tasks.py +++ b/vendor/tau2/data_model/tasks.py @@ -18,9 +18,7 @@ class StructuredUserInstructions(BaseModel): """ domain: Annotated[str, Field(description="The domain of the task.")] - reason_for_call: Annotated[ - str, Field(description="The reason for the user to call the agent.") - ] + reason_for_call: Annotated[str, Field(description="The reason for the user to call the agent.")] known_info: Annotated[ Optional[str], Field(description="Known information about the user.", default=None), @@ -40,9 +38,7 @@ def __str__(self) -> str: lines.append(f"Known info:\n{textwrap.indent(self.known_info, tab)}") if self.unknown_info is not None: lines.append(f"Unknown info:\n{textwrap.indent(self.unknown_info, tab)}") - lines.append( - f"Task instructions:\n{textwrap.indent(self.task_instructions, tab)}" - ) + lines.append(f"Task instructions:\n{textwrap.indent(self.task_instructions, tab)}") return "\n".join(lines) @@ -128,18 +124,14 @@ class Action(BaseModel): If compare_args is None, will check all the arguments. """ - action_id: str = Field( - description="The unique identifier for the action within a scenario." 
- ) + action_id: str = Field(description="The unique identifier for the action within a scenario.") requestor: ToolRequestor = Field( description="The requestor of the action.", default="assistant", ) name: str = Field(description="The name of the action.") arguments: dict = Field(description="The arguments for the action.") - info: Optional[str] = Field( - description="Information about the action.", default=None - ) + info: Optional[str] = Field(description="Information about the action.", default=None) compare_args: Optional[list[str]] = Field( description="The arguments to check in tool call. If None, will check all the arguments.", default=None, @@ -159,9 +151,7 @@ def get_func_format(self) -> str: """ Get the function format of the action. """ - return ( - f"{self.name}({', '.join([f'{k}={v}' for k, v in self.arguments.items()])})" - ) + return f"{self.name}({', '.join([f'{k}={v}' for k, v in self.arguments.items()])})" def compare_with_tool_call(self, tool_call: ToolCall) -> bool: """ @@ -193,9 +183,7 @@ class EnvFunctionCall(BaseModel): Field(description="The type of environment to call the function on."), ] func_name: Annotated[str, Field(description="The name of the function to call.")] - arguments: Annotated[ - dict, Field(description="The arguments to pass to the function.") - ] + arguments: Annotated[dict, Field(description="The arguments to pass to the function.")] def __str__(self) -> str: lines = [] @@ -210,9 +198,7 @@ class EnvAssertion(EnvFunctionCall): An assertion on the agent or user environment. 
""" - assert_value: Annotated[ - bool, Field(default=True, description="The value to assert on.") - ] + assert_value: Annotated[bool, Field(default=True, description="The value to assert on.")] message: Annotated[ Optional[str], Field( @@ -279,27 +265,16 @@ def __str__(self) -> str: lines = [] if self.actions is not None: lines.append("Actions:") - lines.extend( - [textwrap.indent(str(action), "\t") for action in self.actions] - ) + lines.extend([textwrap.indent(str(action), "\t") for action in self.actions]) if self.env_assertions is not None: lines.append("Env Assertions:") - lines.extend( - [ - textwrap.indent(str(assertion), "\t") - for assertion in self.env_assertions - ] - ) + lines.extend([textwrap.indent(str(assertion), "\t") for assertion in self.env_assertions]) if self.communicate_info is not None: lines.append("Communicate Info:") - lines.extend( - [textwrap.indent(info, "\t") for info in self.communicate_info] - ) + lines.extend([textwrap.indent(info, "\t") for info in self.communicate_info]) if self.nl_assertions is not None: lines.append("NL Assertions:") - lines.extend( - [textwrap.indent(assertion, "\t") for assertion in self.nl_assertions] - ) + lines.extend([textwrap.indent(assertion, "\t") for assertion in self.nl_assertions]) return "\n".join(lines) def info(self) -> dict: @@ -309,16 +284,10 @@ def info(self) -> dict: else 0 ) num_user_actions = ( - len([action for action in self.actions if action.requestor == "user"]) - if self.actions is not None - else 0 - ) - num_env_assertions = ( - len(self.env_assertions) if self.env_assertions is not None else 0 - ) - num_nl_assertions = ( - len(self.nl_assertions) if self.nl_assertions is not None else 0 + len([action for action in self.actions if action.requestor == "user"]) if self.actions is not None else 0 ) + num_env_assertions = len(self.env_assertions) if self.env_assertions is not None else 0 + num_nl_assertions = len(self.nl_assertions) if self.nl_assertions is not None else 0 return { 
"num_agent_actions": num_agent_actions, "num_user_actions": num_user_actions, @@ -354,9 +323,7 @@ class InitialState(BaseModel): ] initialization_actions: Annotated[ Optional[list[EnvFunctionCall]], - Field( - description="Initial actions to be taken on the environment.", default=None - ), + Field(description="Initial actions to be taken on the environment.", default=None), ] message_history: Annotated[ Optional[list[Message]], @@ -370,29 +337,13 @@ def __str__(self) -> str: lines = [] if self.initialization_data is not None: lines.append("Initialization Data:") - lines.extend( - [ - textwrap.indent( - self.initialization_data.model_dump_json(indent=2), "\t" - ) - ] - ) + lines.extend([textwrap.indent(self.initialization_data.model_dump_json(indent=2), "\t")]) if self.initialization_actions is not None: lines.append("Initialization Actions:") - lines.extend( - [ - textwrap.indent(str(action), "\t") - for action in self.initialization_actions - ] - ) + lines.extend([textwrap.indent(str(action), "\t") for action in self.initialization_actions]) if self.message_history is not None: lines.append("Message History:") - lines.extend( - [ - textwrap.indent(str(message), "\t") - for message in self.message_history - ] - ) + lines.extend([textwrap.indent(str(message), "\t") for message in self.message_history]) return "\n".join(lines) @@ -411,9 +362,7 @@ class Task(BaseModel): ] user_scenario: Annotated[ UserScenario, - Field( - description="User scenario. This information will be sent to the user simulator." - ), + Field(description="User scenario. This information will be sent to the user simulator."), ] ticket: Annotated[ Optional[str], @@ -478,11 +427,7 @@ def make_task( if message_history is not None: # Patch to consider empty list of tool calls as None. 
for message in message_history: - if ( - message.role == "assistant" - and isinstance(message.tool_calls, list) - and len(message.tool_calls) == 0 - ): + if message.role == "assistant" and isinstance(message.tool_calls, list) and len(message.tool_calls) == 0: message.tool_calls = None initial_state = InitialState( diff --git a/vendor/tau2/domains/airline/data_model.py b/vendor/tau2/domains/airline/data_model.py index f2733727..c046d228 100644 --- a/vendor/tau2/domains/airline/data_model.py +++ b/vendor/tau2/domains/airline/data_model.py @@ -10,9 +10,7 @@ Insurance = Literal["yes", "no"] -MembershipLevel = Annotated[ - Literal["gold", "silver", "regular"], Field(description="Membership level") -] +MembershipLevel = Annotated[Literal["gold", "silver", "regular"], Field(description="Membership level")] class AirportCode(BaseModel): @@ -30,9 +28,7 @@ class Name(BaseModel): class Address(BaseModel): address1: str = Field(description="Primary address line") - address2: Optional[str] = Field( - None, description="Secondary address line (optional)" - ) + address2: Optional[str] = Field(None, description="Secondary address line (optional)") city: str = Field(description="City name") country: str = Field(description="Country name") state: str = Field(description="State or province name") @@ -51,25 +47,19 @@ class PaymentMethodBase(BaseModel): class CreditCard(PaymentMethodBase): - source: Literal["credit_card"] = Field( - description="Indicates this is a credit card payment method" - ) + source: Literal["credit_card"] = Field(description="Indicates this is a credit card payment method") brand: str = Field(description="Credit card brand (e.g., visa, mastercard)") last_four: str = Field(description="Last four digits of the credit card") class GiftCard(PaymentMethodBase): - source: Literal["gift_card"] = Field( - description="Indicates this is a gift card payment method" - ) + source: Literal["gift_card"] = Field(description="Indicates this is a gift card payment method") 
amount: float = Field(description="Gift card value amount") id: str = Field(description="Unique identifier for the gift card") class Certificate(PaymentMethodBase): - source: Literal["certificate"] = Field( - description="Indicates this is a certificate payment method" - ) + source: Literal["certificate"] = Field(description="Indicates this is a certificate payment method") amount: float = Field(description="Certificate value amount") @@ -82,9 +72,7 @@ class Passenger(BaseModel): dob: str = Field(description="Date of birth in YYYY-MM-DD format") -SeatPrices = Annotated[ - dict[CabinClass, int], Field(description="Prices for different cabin classes") -] +SeatPrices = Annotated[dict[CabinClass, int], Field(description="Prices for different cabin classes")] AvailableSeats = Annotated[ dict[CabinClass, int], Field(description="Available seats for different cabin classes"), @@ -92,9 +80,7 @@ class Passenger(BaseModel): class FlightDateStatusAvailable(BaseModel): - status: Literal["available"] = Field( - description="Indicates flight is available for booking" - ) + status: Literal["available"] = Field(description="Indicates flight is available for booking") available_seats: AvailableSeats = Field(description="Available seats by class") prices: SeatPrices = Field(description="Current prices by class") @@ -166,24 +152,18 @@ class Flight(FlightBase): scheduled_arrival_time_est: str = Field( description="Scheduled arrival time in EST in the format HH:MM:SS, e.g 07:00:00" ) - dates: Dict[str, FlightDateStatus] = Field( - description="Flight status by date (YYYY-MM-DD)" - ) + dates: Dict[str, FlightDateStatus] = Field(description="Flight status by date (YYYY-MM-DD)") class DirectFlight(FlightBase): - status: Literal["available"] = Field( - description="Indicates flight is available for booking" - ) + status: Literal["available"] = Field(description="Indicates flight is available for booking") scheduled_departure_time_est: str = Field( description="Scheduled departure time in 
EST in the format HH:MM:SS, e.g 06:00:00" ) scheduled_arrival_time_est: str = Field( description="Scheduled arrival time in EST in the format HH:MM:SS, e.g 07:00:00" ) - date: Optional[str] = Field( - description="Flight date in YYYY-MM-DD format", default=None - ) + date: Optional[str] = Field(description="Flight date in YYYY-MM-DD format", default=None) available_seats: AvailableSeats = Field(description="Available seats by class") prices: SeatPrices = Field(description="Current prices by class") @@ -195,9 +175,7 @@ class ReservationFlight(FlightBase): class FlightInfo(BaseModel): flight_number: str = Field(description="Flight number, such as 'HAT001'.") - date: str = Field( - description="The date for the flight in the format 'YYYY-MM-DD', such as '2024-05-01'." - ) + date: str = Field(description="The date for the flight in the format 'YYYY-MM-DD', such as '2024-05-01'.") class User(BaseModel): @@ -205,15 +183,9 @@ class User(BaseModel): name: Name = Field(description="User's full name") address: Address = Field(description="User's address information") email: str = Field(description="User's email address") - dob: str = Field( - description="User's date of birth in the format YYYY-MM-DD, e.g 1990-04-05" - ) - payment_methods: Dict[str, PaymentMethod] = Field( - description="User's saved payment methods" - ) - saved_passengers: List[Passenger] = Field( - description="User's saved passenger information" - ) + dob: str = Field(description="User's date of birth in the format YYYY-MM-DD, e.g 1990-04-05") + payment_methods: Dict[str, PaymentMethod] = Field(description="User's saved payment methods") + saved_passengers: List[Passenger] = Field(description="User's saved passenger information") membership: MembershipLevel = Field(description="User's membership level") reservations: List[str] = Field(description="List of user's reservation IDs") @@ -226,35 +198,21 @@ class Reservation(BaseModel): destination: str = Field(description="IATA code for trip destination") 
flight_type: FlightType = Field(description="Type of trip") cabin: CabinClass = Field(description="Selected cabin class") - flights: List[ReservationFlight] = Field( - description="List of flights in the reservation" - ) - passengers: List[Passenger] = Field( - description="List of passengers on the reservation" - ) - payment_history: List[Payment] = Field( - description="History of payments for this reservation" - ) - created_at: str = Field( - description="Timestamp when reservation was created in the format YYYY-MM-DDTHH:MM:SS" - ) + flights: List[ReservationFlight] = Field(description="List of flights in the reservation") + passengers: List[Passenger] = Field(description="List of passengers on the reservation") + payment_history: List[Payment] = Field(description="History of payments for this reservation") + created_at: str = Field(description="Timestamp when reservation was created in the format YYYY-MM-DDTHH:MM:SS") total_baggages: int = Field(description="Total number of bags in reservation") nonfree_baggages: int = Field(description="Number of paid bags in reservation") insurance: Insurance = Field(description="Whether travel insurance was purchased") - status: Optional[Literal["cancelled"]] = Field( - description="Status of the reservation", default=None - ) + status: Optional[Literal["cancelled"]] = Field(description="Status of the reservation", default=None) class FlightDB(DB): """Database of all flights, users, and reservations.""" - flights: Dict[str, Flight] = Field( - description="Dictionary of all flights indexed by flight number" - ) - users: Dict[str, User] = Field( - description="Dictionary of all users indexed by user ID" - ) + flights: Dict[str, Flight] = Field(description="Dictionary of all flights indexed by flight number") + users: Dict[str, User] = Field(description="Dictionary of all users indexed by user ID") reservations: Dict[str, Reservation] = Field( description="Dictionary of all reservations indexed by reservation ID" ) @@ -262,9 
+220,7 @@ class FlightDB(DB): def get_statistics(self) -> dict[str, Any]: """Get the statistics of the database.""" num_flights = len(self.flights) - num_flights_instances = sum( - len(flight.dates) for flight in self.flights.values() - ) + num_flights_instances = sum(len(flight.dates) for flight in self.flights.values()) num_users = len(self.users) num_reservations = len(self.reservations) return { diff --git a/vendor/tau2/domains/airline/tools.py b/vendor/tau2/domains/airline/tools.py index d4f45694..d854f725 100644 --- a/vendor/tau2/domains/airline/tools.py +++ b/vendor/tau2/domains/airline/tools.py @@ -62,15 +62,11 @@ def _get_flight_instance(self, flight_number: str, date: str) -> FlightDateStatu raise ValueError(f"Flight {flight_number} not found on date {date}") return flight.dates[date] - def _get_flights_from_flight_infos( - self, flight_infos: List[FlightInfo] - ) -> list[FlightDateStatus]: + def _get_flights_from_flight_infos(self, flight_infos: List[FlightInfo]) -> list[FlightDateStatus]: """Get the flight from the reservation.""" flights = [] for flight_info in flight_infos: - flights.append( - self._get_flight_instance(flight_info.flight_number, flight_info.date) - ) + flights.append(self._get_flight_instance(flight_info.flight_number, flight_info.date)) return flights def _get_new_reservation_id(self) -> str: @@ -123,10 +119,7 @@ def _search_direct_flight( and (destination is None or flight.destination == destination) and (date in flight.dates) and (flight.dates[date].status == "available") - and ( - leave_after is None - or flight.scheduled_departure_time_est >= leave_after - ) + and (leave_after is None or flight.scheduled_departure_time_est >= leave_after) ) if check: direct_flight = DirectFlight( @@ -142,9 +135,7 @@ def _search_direct_flight( results.append(direct_flight) return results - def _payment_for_update( - self, user: User, payment_id: str, total_price: int - ) -> Optional[Payment]: + def _payment_for_update(self, user: User, payment_id: 
str, total_price: int) -> Optional[Payment]: """ Process payment for update reservation @@ -165,9 +156,7 @@ def _payment_for_update( payment_method = user.payment_methods[payment_id] if payment_method.source == "certificate": raise ValueError("Certificate cannot be used to update reservation") - elif ( - payment_method.source == "gift_card" and payment_method.amount < total_price - ): + elif payment_method.source == "gift_card" and payment_method.amount < total_price: raise ValueError("Gift card balance is not enough") # Deduct payment @@ -219,9 +208,7 @@ def book_reservation( if all(isinstance(passenger, dict) for passenger in passengers): passengers = [Passenger(**passenger) for passenger in passengers] if all(isinstance(payment_method, dict) for payment_method in payment_methods): - payment_methods = [ - Payment(**payment_method) for payment_method in payment_methods - ] + payment_methods = [Payment(**payment_method) for payment_method in payment_methods] user = self._get_user(user_id) reservation_id = self._get_new_reservation_id() @@ -248,14 +235,10 @@ def book_reservation( for flight_info in flights: flight_number = flight_info.flight_number flight = self._get_flight(flight_number) - flight_date_data = self._get_flight_instance( - flight_number=flight_number, date=flight_info.date - ) + flight_date_data = self._get_flight_instance(flight_number=flight_number, date=flight_info.date) # Checking flight availability if not isinstance(flight_date_data, FlightDateStatusAvailable): - raise ValueError( - f"Flight {flight_number} not available on date {flight_info.date}" - ) + raise ValueError(f"Flight {flight_number} not available on date {flight_info.date}") # Checking seat availability if flight_date_data.available_seats[cabin] < len(passengers): raise ValueError(f"Not enough seats on flight {flight_number}") @@ -290,15 +273,11 @@ def book_reservation( user_payment_method = user.payment_methods[payment_id] if user_payment_method.source in {"gift_card", 
"certificate"}: if user_payment_method.amount < amount: - raise ValueError( - f"Not enough balance in payment method {payment_id}" - ) + raise ValueError(f"Not enough balance in payment method {payment_id}") total_payment = sum(payment.amount for payment in payment_methods) if total_payment != total_price: - raise ValueError( - f"Payment amount does not add up, total price is {total_price}, but paid {total_payment}" - ) + raise ValueError(f"Payment amount does not add up, total price is {total_price}, but paid {total_payment}") # if checks pass, deduct payment for payment_method in payment_methods: @@ -430,9 +409,7 @@ def list_all_airports(self) -> AirportInfo: # DONE ] @is_tool(ToolType.READ) - def search_direct_flight( - self, origin: str, destination: str, date: str - ) -> list[DirectFlight]: + def search_direct_flight(self, origin: str, destination: str, date: str) -> list[DirectFlight]: """ Search for direct flights between two cities on a specific date. @@ -444,9 +421,7 @@ def search_direct_flight( Returns: The direct flights between the two cities on the specific date. """ - return self._search_direct_flight( - date=date, origin=origin, destination=destination - ) + return self._search_direct_flight(date=date, origin=origin, destination=destination) @is_tool(ToolType.READ) def search_onestop_flight( @@ -464,15 +439,9 @@ def search_onestop_flight( A list of pairs of DirectFlight objects. """ results = [] - for result1 in self._search_direct_flight( - date=date, origin=origin, destination=None - ): + for result1 in self._search_direct_flight(date=date, origin=origin, destination=None): result1.date = date - date2 = ( - f"2024-05-{int(date[-2:]) + 1}" - if "+1" in result1.scheduled_arrival_time_est - else date - ) + date2 = f"2024-05-{int(date[-2:]) + 1}" if "+1" in result1.scheduled_arrival_time_est else date # TODO: flight1.scheduled_arrival_time_est could have a +1? 
for result2 in self._search_direct_flight( date=date2, @@ -637,9 +606,7 @@ def update_reservation_flights( None, ) if matching_reservation_flight: - total_price += matching_reservation_flight.price * len( - reservation.passengers - ) + total_price += matching_reservation_flight.price * len(reservation.passengers) reservation_flights.append(matching_reservation_flight) continue @@ -651,15 +618,11 @@ def update_reservation_flights( date=flight_info.date, ) if not isinstance(flight_date_data, FlightDateStatusAvailable): - raise ValueError( - f"Flight {flight_info.flight_number} not available on date {flight_info.date}" - ) + raise ValueError(f"Flight {flight_info.flight_number} not available on date {flight_info.date}") # Check seat availability if flight_date_data.available_seats[cabin] < len(reservation.passengers): - raise ValueError( - f"Not enough seats on flight {flight_info.flight_number}" - ) + raise ValueError(f"Not enough seats on flight {flight_info.flight_number}") # Calculate price and add to reservation reservation_flight = ReservationFlight( @@ -673,9 +636,7 @@ def update_reservation_flights( reservation_flights.append(reservation_flight) # Deduct amount already paid for reservation - total_price -= sum(flight.price for flight in reservation.flights) * len( - reservation.passengers - ) + total_price -= sum(flight.price for flight in reservation.flights) * len(reservation.passengers) # Create payment payment = self._payment_for_update(user, payment_id, total_price) @@ -690,9 +651,7 @@ def update_reservation_flights( return reservation @is_tool(ToolType.WRITE) - def update_reservation_passengers( - self, reservation_id: str, passengers: List[Passenger | dict] - ) -> Reservation: + def update_reservation_passengers(self, reservation_id: str, passengers: List[Passenger | dict]) -> Reservation: """ Update the passenger information of a reservation. 
diff --git a/vendor/tau2/domains/mock/data_model.py b/vendor/tau2/domains/mock/data_model.py index f643d3e0..bff026df 100644 --- a/vendor/tau2/domains/mock/data_model.py +++ b/vendor/tau2/domains/mock/data_model.py @@ -24,12 +24,8 @@ class User(BaseModel): class MockDB(DB): """Simple database with users and their tasks.""" - tasks: Dict[str, Task] = Field( - description="Dictionary of all tasks indexed by task ID" - ) - users: Dict[str, User] = Field( - description="Dictionary of all users indexed by user ID" - ) + tasks: Dict[str, Task] = Field(description="Dictionary of all tasks indexed by task ID") + users: Dict[str, User] = Field(description="Dictionary of all users indexed by user ID") def get_db(): diff --git a/vendor/tau2/domains/mock/environment.py b/vendor/tau2/domains/mock/environment.py index d7063315..925fd297 100644 --- a/vendor/tau2/domains/mock/environment.py +++ b/vendor/tau2/domains/mock/environment.py @@ -13,9 +13,7 @@ from vendor.tau2.environment.environment import Environment -def get_environment( - db: Optional[MockDB] = None, solo_mode: bool = False -) -> Environment: +def get_environment(db: Optional[MockDB] = None, solo_mode: bool = False) -> Environment: if db is None: db = MockDB.load(MOCK_DB_PATH) tools = MockTools(db) diff --git a/vendor/tau2/domains/mock/tools.py b/vendor/tau2/domains/mock/tools.py index b36f46af..7c2ab361 100644 --- a/vendor/tau2/domains/mock/tools.py +++ b/vendor/tau2/domains/mock/tools.py @@ -30,9 +30,7 @@ def create_task(self, user_id: str, title: str, description: str = None) -> Task raise ValueError(f"User {user_id} not found") task_id = f"task_{len(self.db.tasks) + 1}" - task = Task( - task_id=task_id, title=title, description=description, status="pending" - ) + task = Task(task_id=task_id, title=title, description=description, status="pending") self.db.tasks[task_id] = task self.db.users[user_id].tasks.append(task_id) diff --git a/vendor/tau2/domains/retail/data_model.py 
b/vendor/tau2/domains/retail/data_model.py index ddb45e3d..d0415856 100644 --- a/vendor/tau2/domains/retail/data_model.py +++ b/vendor/tau2/domains/retail/data_model.py @@ -22,9 +22,7 @@ class Product(BaseModel): name: str = Field(description="Name of the product") product_id: str = Field(description="Unique identifier for the product") - variants: Dict[str, Variant] = Field( - description="Dictionary of variants indexed by variant ID" - ) + variants: Dict[str, Variant] = Field(description="Dictionary of variants indexed by variant ID") class UserName(BaseModel): @@ -51,23 +49,17 @@ class PaymentMethodBase(BaseModel): class CreditCard(PaymentMethodBase): - source: Literal["credit_card"] = Field( - description="Indicates this is a credit card payment method" - ) + source: Literal["credit_card"] = Field(description="Indicates this is a credit card payment method") brand: str = Field(description="Credit card brand (e.g., visa, mastercard)") last_four: str = Field(description="Last four digits of the credit card") class Paypal(PaymentMethodBase): - source: Literal["paypal"] = Field( - description="Indicates this is a paypal payment method" - ) + source: Literal["paypal"] = Field(description="Indicates this is a paypal payment method") class GiftCard(PaymentMethodBase): - source: Literal["gift_card"] = Field( - description="Indicates this is a gift card payment method" - ) + source: Literal["gift_card"] = Field(description="Indicates this is a gift card payment method") balance: float = Field(description="Gift card value amount") id: str = Field(description="Unique identifier for the gift card") @@ -92,9 +84,7 @@ class OrderFullfilment(BaseModel): """Represents the fulfillment details for items in an order""" tracking_id: list[str] = Field(description="List of tracking IDs for shipments") - item_ids: list[str] = Field( - description="List of item IDs included in this fulfillment" - ) + item_ids: list[str] = Field(description="List of item IDs included in this 
fulfillment") class OrderItem(BaseModel): @@ -113,9 +103,7 @@ class OrderItem(BaseModel): class OrderPayment(BaseModel): """Represents a payment or refund transaction for an order""" - transaction_type: OrderPaymentType = Field( - description="Type of transaction (payment or refund)" - ) + transaction_type: OrderPaymentType = Field(description="Type of transaction (payment or refund)") amount: float = Field(description="Amount of the transaction") payment_method_id: str = Field(description="ID of the payment method used") @@ -141,32 +129,18 @@ class BaseOrder(BaseModel): address: UserAddress = Field(description="Address of the user") items: List[OrderItem] = Field(description="Items in the order") status: OrderStatus = Field(description="Status of the order") - fulfillments: List[OrderFullfilment] = Field( - description="Fulfillments of the order" - ) + fulfillments: List[OrderFullfilment] = Field(description="Fulfillments of the order") payment_history: List[OrderPayment] = Field(description="Payments of the order") cancel_reason: Optional[CancelReason] = Field( description="Reason for cancelling the order. 
Can'no longer needed' or 'ordered by mistake'", default=None, ) - exchange_items: Optional[List[str]] = Field( - description="Items to be exchanged", default=None - ) - exchange_new_items: Optional[List[str]] = Field( - description="Items exchanged for", default=None - ) - exchange_payment_method_id: Optional[str] = Field( - description="Payment method ID for the exchange", default=None - ) - exchange_price_difference: Optional[float] = Field( - description="Price difference for the exchange", default=None - ) - return_items: Optional[List[str]] = Field( - description="Items to be returned", default=None - ) - return_payment_method_id: Optional[str] = Field( - description="Payment method ID for the return", default=None - ) + exchange_items: Optional[List[str]] = Field(description="Items to be exchanged", default=None) + exchange_new_items: Optional[List[str]] = Field(description="Items exchanged for", default=None) + exchange_payment_method_id: Optional[str] = Field(description="Payment method ID for the exchange", default=None) + exchange_price_difference: Optional[float] = Field(description="Price difference for the exchange", default=None) + return_items: Optional[List[str]] = Field(description="Items to be returned", default=None) + return_payment_method_id: Optional[str] = Field(description="Payment method ID for the return", default=None) class Order(BaseModel): @@ -177,55 +151,33 @@ class Order(BaseModel): address: UserAddress = Field(description="Address of the user") items: List[OrderItem] = Field(description="Items in the order") status: OrderStatus = Field(description="Status of the order") - fulfillments: List[OrderFullfilment] = Field( - description="Fulfillments of the order" - ) + fulfillments: List[OrderFullfilment] = Field(description="Fulfillments of the order") payment_history: List[OrderPayment] = Field(description="Payments of the order") cancel_reason: Optional[CancelReason] = Field( description="Reason for cancelling the order. 
Should be 'no longer needed' or 'ordered by mistake'", default=None, ) - exchange_items: Optional[List[str]] = Field( - description="Items to be exchanged", default=None - ) - exchange_new_items: Optional[List[str]] = Field( - description="Items exchanged for", default=None - ) - exchange_payment_method_id: Optional[str] = Field( - description="Payment method ID for the exchange", default=None - ) - exchange_price_difference: Optional[float] = Field( - description="Price difference for the exchange", default=None - ) - return_items: Optional[List[str]] = Field( - description="Items to be returned", default=None - ) - return_payment_method_id: Optional[str] = Field( - description="Payment method ID for the return", default=None - ) + exchange_items: Optional[List[str]] = Field(description="Items to be exchanged", default=None) + exchange_new_items: Optional[List[str]] = Field(description="Items exchanged for", default=None) + exchange_payment_method_id: Optional[str] = Field(description="Payment method ID for the exchange", default=None) + exchange_price_difference: Optional[float] = Field(description="Price difference for the exchange", default=None) + return_items: Optional[List[str]] = Field(description="Items to be returned", default=None) + return_payment_method_id: Optional[str] = Field(description="Payment method ID for the return", default=None) class RetailDB(DB): """Database containing all retail-related data including products, users and orders""" - products: Dict[str, Product] = Field( - description="Dictionary of all products indexed by product ID" - ) - users: Dict[str, User] = Field( - description="Dictionary of all users indexed by user ID" - ) - orders: Dict[str, Order] = Field( - description="Dictionary of all orders indexed by order ID" - ) + products: Dict[str, Product] = Field(description="Dictionary of all products indexed by product ID") + users: Dict[str, User] = Field(description="Dictionary of all users indexed by user ID") + orders: 
Dict[str, Order] = Field(description="Dictionary of all orders indexed by order ID") def get_statistics(self) -> dict[str, Any]: """Get the statistics of the database.""" num_products = len(self.products) num_users = len(self.users) num_orders = len(self.orders) - total_num_items = sum( - len(product.variants) for product in self.products.values() - ) + total_num_items = sum(len(product.variants) for product in self.products.values()) return { "num_products": num_products, "num_users": num_users, diff --git a/vendor/tau2/domains/retail/tools.py b/vendor/tau2/domains/retail/tools.py index 944e206c..6fc91e15 100644 --- a/vendor/tau2/domains/retail/tools.py +++ b/vendor/tau2/domains/retail/tools.py @@ -92,9 +92,7 @@ def _get_variant(self, product_id: str, variant_id: str) -> Variant: raise ValueError("Variant not found") return product.variants[variant_id] - def _get_payment_method( - self, user_id: str, payment_method_id: str - ) -> PaymentMethod: + def _get_payment_method(self, user_id: str, payment_method_id: str) -> PaymentMethod: """Get the payment method from the database. Args: @@ -252,9 +250,7 @@ def exchange_delivered_order_items( payment_method = self._get_payment_method(order.user_id, payment_method_id) if isinstance(payment_method, GiftCard) and payment_method.balance < diff_price: - raise ValueError( - "Insufficient gift card balance to pay for the price difference" - ) + raise ValueError("Insufficient gift card balance to pay for the price difference") # modify the order order.status = "exchange requested" @@ -266,9 +262,7 @@ def exchange_delivered_order_items( return order @is_tool(ToolType.READ) - def find_user_id_by_name_zip( - self, first_name: str, last_name: str, zip: str - ) -> str: + def find_user_id_by_name_zip(self, first_name: str, last_name: str, zip: str) -> str: """Find user id by first name, last name, and zip code. If the user is not found, the function will return an error message. 
By default, find user id by email, and only call this function if the user is not found by email or cannot remember email. @@ -368,9 +362,7 @@ def list_all_product_types(self) -> str: Returns: str: A JSON string mapping product names to their product IDs, sorted alphabetically by name. """ - product_dict = { - product.name: product.product_id for product in self.db.products.values() - } + product_dict = {product.name: product.product_id for product in self.db.products.values()} return json.dumps(product_dict, sort_keys=True) @is_tool(ToolType.WRITE) @@ -461,9 +453,7 @@ def modify_pending_order_items( diff_price = 0 for item_id, new_item_id in zip(item_ids, new_item_ids): if item_id == new_item_id: - raise ValueError( - "The new item id should be different from the old item id" - ) + raise ValueError("The new item id should be different from the old item id") item = next((item for item in order.items if item.item_id == item_id), None) if item is None: raise ValueError(f"Item {item_id} not found") @@ -538,17 +528,12 @@ def modify_pending_order_payment( payment_method = self._get_payment_method(order.user_id, payment_method_id) # Check that the payment history should only have one payment - if ( - len(order.payment_history) != 1 - or order.payment_history[0].transaction_type != "payment" - ): + if len(order.payment_history) != 1 or order.payment_history[0].transaction_type != "payment": raise ValueError("There should be exactly one payment for a pending order") # Check that the payment method is different if order.payment_history[0].payment_method_id == payment_method_id: - raise ValueError( - "The new payment method should be different from the current one" - ) + raise ValueError("The new payment method should be different from the current one") amount = order.payment_history[0].amount @@ -578,9 +563,7 @@ def modify_pending_order_payment( payment_method.balance = round(payment_method.balance, 2) # If refund is made to a gift card, update the balance - 
old_payment_method = self._get_payment_method( - order.user_id, order.payment_history[0].payment_method_id - ) + old_payment_method = self._get_payment_method(order.user_id, order.payment_history[0].payment_method_id) if isinstance(old_payment_method, GiftCard): old_payment_method.balance += amount old_payment_method.balance = round(old_payment_method.balance, 2) diff --git a/vendor/tau2/domains/telecom/data_model.py b/vendor/tau2/domains/telecom/data_model.py index b5dea830..ffc8b1a0 100644 --- a/vendor/tau2/domains/telecom/data_model.py +++ b/vendor/tau2/domains/telecom/data_model.py @@ -23,9 +23,7 @@ class Plan(BaseModelNoExtra): name: str = Field(description="Display name of the plan") data_limit_gb: float = Field(description="Monthly data allowance in gigabytes (GB)") price_per_month: float = Field(description="Monthly price of the plan in USD") - data_refueling_price_per_gb: float = Field( - description="Price per gigabyte for data refueling" - ) + data_refueling_price_per_gb: float = Field(description="Price per gigabyte for data refueling") class DeviceType(str, Enum): @@ -40,15 +38,9 @@ class Device(BaseModelNoExtra): device_id: str = Field(description="Unique identifier for the device") device_type: DeviceType = Field(description="Type/category of the device") model: str = Field(description="Model name/number of the device") - imei: Optional[str] = Field( - None, description="International Mobile Equipment Identity number" - ) - is_esim_capable: bool = Field( - description="Whether the device supports eSIM technology" - ) - activated: bool = Field( - False, description="Whether the device has been activated on the network" - ) + imei: Optional[str] = Field(None, description="International Mobile Equipment Identity number") + is_esim_capable: bool = Field(description="Whether the device supports eSIM technology") + activated: bool = Field(False, description="Whether the device has been activated on the network") activation_date: Optional[datetime.datetime] 
= Field( None, description="Date and time when the device was activated (format: YYYY-MM-DDTHH:MM:SS, timezone: EST)", @@ -69,22 +61,12 @@ class LineStatus(str, Enum): class Line(BaseModelNoExtra): line_id: str = Field(description="Unique identifier for the line") phone_number: str = Field(description="Phone number associated with the line") - status: LineStatus = Field( - LineStatus.PENDING_ACTIVATION, description="Current status of the line" - ) + status: LineStatus = Field(LineStatus.PENDING_ACTIVATION, description="Current status of the line") plan_id: str = Field(description="Plan associated with this line") - device_id: Optional[str] = Field( - None, description="Device associated with this line" - ) - data_used_gb: float = Field( - 0.0, description="Data used in the current billing cycle in gigabytes (GB)" - ) - data_refueling_gb: float = Field( - 0.0, description="Data refueled in the current billing cycle in gigabytes (GB)" - ) - roaming_enabled: bool = Field( - False, description="Whether international roaming is enabled for this line" - ) + device_id: Optional[str] = Field(None, description="Device associated with this line") + data_used_gb: float = Field(0.0, description="Data used in the current billing cycle in gigabytes (GB)") + data_refueling_gb: float = Field(0.0, description="Data refueled in the current billing cycle in gigabytes (GB)") + roaming_enabled: bool = Field(False, description="Whether international roaming is enabled for this line") contract_end_date: Optional[datetime.date] = Field( None, description="End date of the current contract, if applicable (format: YYYY-MM-DD, timezone: EST)", @@ -105,15 +87,9 @@ class Line(BaseModelNoExtra): class LineItem(BaseModelNoExtra): description: str = Field(description="Descriptive text for the line item") - amount: float = Field( - description="Monetary amount in USD (positive for charges, negative for credits)" - ) - date: datetime.date = Field( - description="Date the line item was applied 
(format: YYYY-MM-DD, timezone: EST)" - ) - item_type: str = Field( - description="Category of the line item (e.g., Plan Charge, Overage, Fee, Credit, Payment)" - ) + amount: float = Field(description="Monetary amount in USD (positive for charges, negative for credits)") + date: datetime.date = Field(description="Date the line item was applied (format: YYYY-MM-DD, timezone: EST)") + item_type: str = Field(description="Category of the line item (e.g., Plan Charge, Overage, Fee, Credit, Payment)") class BillStatus(str, Enum): @@ -131,23 +107,17 @@ class Bill(BaseModelNoExtra): period_start: datetime.date = Field( description="Start date of the billing period (format: YYYY-MM-DD, timezone: EST)" ) - period_end: datetime.date = Field( - description="End date of the billing period (format: YYYY-MM-DD, timezone: EST)" - ) + period_end: datetime.date = Field(description="End date of the billing period (format: YYYY-MM-DD, timezone: EST)") issue_date: datetime.date = Field( description="Date the bill was issued/generated (format: YYYY-MM-DD, timezone: EST)" ) total_due: float = Field(description="Total amount due in USD") - due_date: datetime.date = Field( - description="Date by which payment is due (format: YYYY-MM-DD, timezone: EST)" - ) + due_date: datetime.date = Field(description="Date by which payment is due (format: YYYY-MM-DD, timezone: EST)") line_items: List[LineItem] = Field( default_factory=list, description="Individual charges, credits, and payments on this bill", ) - status: BillStatus = Field( - BillStatus.DRAFT, description="Current status of the bill" - ) + status: BillStatus = Field(BillStatus.DRAFT, description="Current status of the bill") class AccountStatus(str, Enum): @@ -165,20 +135,14 @@ class PaymentMethodType(str, Enum): class PaymentMethod(BaseModelNoExtra): method_type: PaymentMethodType = Field(description="Type of payment method") - account_number_last_4: str = Field( - description="Last 4 digits of the account number" - ) - expiration_date: 
str = Field( - description="The expiration date of the payment method in the format MM/YYYY" - ) + account_number_last_4: str = Field(description="Last 4 digits of the account number") + expiration_date: str = Field(description="The expiration date of the payment method in the format MM/YYYY") class Customer(BaseModelNoExtra): customer_id: str = Field(description="Unique identifier for the customer") full_name: str = Field(description="Customer's full name") - date_of_birth: str = Field( - description="Customer's date of birth for identity verification (format: YYYY-MM-DD)" - ) + date_of_birth: str = Field(description="Customer's date of birth for identity verification (format: YYYY-MM-DD)") email: str = Field(description="Customer's email address") phone_number: str = Field(description="Customer's primary contact phone number") address: Address = Field(description="Customer's billing address") @@ -189,12 +153,8 @@ class Customer(BaseModelNoExtra): payment_methods: List[PaymentMethod] = Field( default_factory=list, description="Stored payment methods for this customer" ) - line_ids: List[str] = Field( - default_factory=list, description="Phone/data lines owned by this customer" - ) - bill_ids: List[str] = Field( - default_factory=list, description="Bills associated with this customer" - ) + line_ids: List[str] = Field(default_factory=list, description="Phone/data lines owned by this customer") + bill_ids: List[str] = Field(default_factory=list, description="Bills associated with this customer") created_at: datetime.datetime = Field( DEFAULT_START_DATE, description="Date and time when the customer account was created (format: YYYY-MM-DDTHH:MM:SS, timezone: EST)", @@ -211,21 +171,11 @@ class Customer(BaseModelNoExtra): class TelecomDB(DB): """Database interface for telecom domain.""" - plans: List[Plan] = Field( - default_factory=list, description="Available service plans" - ) - customers: List[Customer] = Field( - default_factory=list, description="All customers in 
the system" - ) - lines: List[Line] = Field( - default_factory=list, description="All lines in the system" - ) - bills: List[Bill] = Field( - default_factory=list, description="All bills in the system" - ) - devices: List[Device] = Field( - default_factory=list, description="All devices in the system" - ) + plans: List[Plan] = Field(default_factory=list, description="Available service plans") + customers: List[Customer] = Field(default_factory=list, description="All customers in the system") + lines: List[Line] = Field(default_factory=list, description="All lines in the system") + bills: List[Bill] = Field(default_factory=list, description="All bills in the system") + devices: List[Device] = Field(default_factory=list, description="All devices in the system") def get_statistics(self) -> Dict[str, Any]: """Get the statistics of the database.""" @@ -234,9 +184,7 @@ def get_statistics(self) -> Dict[str, Any]: num_lines = len(self.lines) num_bills = len(self.bills) num_devices = len(self.devices) - num_payment_methods = sum( - len(customer.payment_methods) for customer in self.customers - ) + num_payment_methods = sum(len(customer.payment_methods) for customer in self.customers) return { "num_plans": num_plans, diff --git a/vendor/tau2/domains/telecom/environment.py b/vendor/tau2/domains/telecom/environment.py index 00ab1b9e..5dcbaa79 100644 --- a/vendor/tau2/domains/telecom/environment.py +++ b/vendor/tau2/domains/telecom/environment.py @@ -47,9 +47,7 @@ def sync_tools(self): phone_number = self.user_tools.db.surroundings.phone_number line = self.tools._get_line_by_phone(phone_number) if line is None: - raise ValueError( - f"Wrong scenario, line not found for phone number: {phone_number}" - ) + raise ValueError(f"Wrong scenario, line not found for phone number: {phone_number}") # Check if the line is active if line.status == LineStatus.ACTIVE: self.user_tools.db.surroundings.line_active = True @@ -65,9 +63,7 @@ def sync_tools(self): # Check if the user has exceeded 
their data usage limit plan = self.tools._get_plan_by_id(line.plan_id) if plan is None: - raise ValueError( - f"Wrong scenario, invalid plan id ({line.plan_id}) for the phone number {phone_number}" - ) + raise ValueError(f"Wrong scenario, invalid plan id ({line.plan_id}) for the phone number {phone_number}") if line.data_used_gb >= plan.data_limit_gb + line.data_refueling_gb: self.user_tools.db.surroundings.mobile_data_usage_exceeded = True else: @@ -82,9 +78,7 @@ def sync_tools(self): # Check if the user has a payment request current_payment_request = self.user_tools.db.surroundings.payment_request - if ( - current_payment_request is None - ): # If there already is a payment request, do nothing + if current_payment_request is None: # If there already is a payment request, do nothing customer = self.tools.get_customer_by_phone(phone_number) bills = self.tools._get_bills_awaiting_payment(customer) if len(bills) != 0: diff --git a/vendor/tau2/domains/telecom/tasks/const.py b/vendor/tau2/domains/telecom/tasks/const.py index 8c41d7dc..555d935d 100644 --- a/vendor/tau2/domains/telecom/tasks/const.py +++ b/vendor/tau2/domains/telecom/tasks/const.py @@ -1,6 +1,6 @@ TOOL_CALL_INFO_CHECK = "If the tool call does not return updated status information, you might need to perform another tool call to get the updated status." TOOL_CALL_GROUNDING = """ -Whenever the agent asks you about your device, always ground your responses on the results of tool calls. +Whenever the agent asks you about your device, always ground your responses on the results of tool calls. For example: If the agent asks what the status bar shows, always ground your response on the results of the `get_status_bar` tool call. If the agent asks if you are able to send an MMS message, always ground your response on the results of the `can_send_mms` tool call. Never make up the results of tool calls, always ground your responses on the results of tool calls. 
If you are unsure about whether an action is necessary, always ask the agent for clarification. diff --git a/vendor/tau2/domains/telecom/tasks/create_tasks.py b/vendor/tau2/domains/telecom/tasks/create_tasks.py index aeaf6f40..e6a297f8 100644 --- a/vendor/tau2/domains/telecom/tasks/create_tasks.py +++ b/vendor/tau2/domains/telecom/tasks/create_tasks.py @@ -27,7 +27,7 @@ def create_tasks(save_tasks: bool = True, max_count_per_bin: int = 3) -> list[Ta print(f"Number of tasks: {len(tasks)}") - file = DATA_DIR / "domains" / "telecom" / f"tasks_full.json" + file = DATA_DIR / "domains" / "telecom" / "tasks_full.json" if save_tasks: with open(file, "w") as f: json.dump([t.model_dump() for t in tasks], f, indent=2) @@ -50,14 +50,14 @@ def create_tasks(save_tasks: bool = True, max_count_per_bin: int = 3) -> list[Ta } ) - file_small = DATA_DIR / "domains" / "telecom" / f"tasks_small.json" + file_small = DATA_DIR / "domains" / "telecom" / "tasks_small.json" small_tasks = [t["task"] for t in tasks_with_attrs if t["num_subtasks"] == 1] print(f"Number of tasks in small set: {len(small_tasks)}") if save_tasks: with open(file_small, "w") as f: json.dump([t.model_dump() for t in small_tasks], f, indent=2) - file_sampled = DATA_DIR / "domains" / "telecom" / f"tasks.json" + file_sampled = DATA_DIR / "domains" / "telecom" / "tasks.json" tasks_by_bins = defaultdict(list) for task in tasks_with_attrs: if task["num_subtasks"] < 2: # We only keep tasks with at least 2 subtasks diff --git a/vendor/tau2/domains/telecom/tasks/mms_issues.py b/vendor/tau2/domains/telecom/tasks/mms_issues.py index 13319476..313ce272 100644 --- a/vendor/tau2/domains/telecom/tasks/mms_issues.py +++ b/vendor/tau2/domains/telecom/tasks/mms_issues.py @@ -92,9 +92,7 @@ def break_apn_mms_setting(*args, **kwargs) -> list[EnvFunctionCall]: ] -def _get_remove_app_permission_actions( - app_name: str = "messaging", permission: str = "sms" -): +def _get_remove_app_permission_actions(app_name: str = "messaging", permission: 
str = "sms"): """ Get the remove app permission actions for the mms issue task. """ @@ -116,9 +114,7 @@ def break_app_storage_permission(*args, **kwargs) -> list[EnvFunctionCall]: """ Break the app storage permission for the mms issue task. """ - return [ - _get_remove_app_permission_actions(app_name="messaging", permission="storage") - ] + return [_get_remove_app_permission_actions(app_name="messaging", permission="storage")] def break_app_both_permissions(*args, **kwargs) -> list[EnvFunctionCall]: @@ -163,9 +159,7 @@ def fix_break_apn_mms_setting(*args, **kwargs) -> list[ToolCall]: ] -def _get_grant_app_permission_actions( - app_name: str = "messaging", permission: str = "sms" -) -> ToolCall: +def _get_grant_app_permission_actions(app_name: str = "messaging", permission: str = "sms") -> ToolCall: """ Get the grant app permission actions for the mms issue task. """ @@ -187,9 +181,7 @@ def fix_break_app_storage_permission(*args, **kwargs) -> list[ToolCall]: """ Fix the break app storage permission issue. 
""" - return [ - _get_grant_app_permission_actions(app_name="messaging", permission="storage") - ] + return [_get_grant_app_permission_actions(app_name="messaging", permission="storage")] def fix_break_app_both_permissions(*args, **kwargs) -> list[ToolCall]: @@ -277,11 +269,7 @@ def fix_break_app_both_permissions(*args, **kwargs) -> list[ToolCall]: app_permission_issues, # Step3.5 ] -selection_sets = ( - service_issues_sample_sets - + mobile_data_issues_sample_sets - + mms_issues_selection_sets -) +selection_sets = service_issues_sample_sets + mobile_data_issues_sample_sets + mms_issues_selection_sets def task_validator(tasks: list[Optional[BaseTask]]): @@ -304,9 +292,7 @@ def task_validator(tasks: list[Optional[BaseTask]]): num_tasks_mms_issues = len( [ task - for task in tasks[ - len(service_issues_sample_sets) + len(mobile_data_issues_sample_sets) : - ] + for task in tasks[len(service_issues_sample_sets) + len(mobile_data_issues_sample_sets) :] if task is not None ] ) diff --git a/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py b/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py index 8e1caa70..b5405d44 100644 --- a/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py +++ b/vendor/tau2/domains/telecom/tasks/mobile_data_issues.py @@ -485,9 +485,7 @@ def assert_data_refueling_amount(env: TelecomEnvironment) -> list[EnvAssertion]: # Path 2.2: Slow Mobile Data # Requires workflow Step 2.2.1 -data_usage_exceeded_issues = SelectionSet( - tasks=[data_usage_exceeded_task, data_usage_exceeded_no_refuel_task] -) +data_usage_exceeded_issues = SelectionSet(tasks=[data_usage_exceeded_task, data_usage_exceeded_no_refuel_task]) # Requires workflow Step 2.2.2 data_saver_mode_issues = SelectionSet(tasks=[data_saver_mode_on_task]) @@ -518,9 +516,7 @@ def task_validator(tasks: list[Optional[BaseTask]]): # num_tasks_service_issues = len( # [task for task in tasks[: len(service_issues_sample_sets)] if task is not None] # ) - num_tasks_mobile_data_issues = len( - 
[task for task in tasks[len(service_issues_sample_sets) :] if task is not None] - ) + num_tasks_mobile_data_issues = len([task for task in tasks[len(service_issues_sample_sets) :] if task is not None]) return num_tasks_mobile_data_issues > 0 diff --git a/vendor/tau2/domains/telecom/tasks/utils.py b/vendor/tau2/domains/telecom/tasks/utils.py index 73c22b37..360660e5 100644 --- a/vendor/tau2/domains/telecom/tasks/utils.py +++ b/vendor/tau2/domains/telecom/tasks/utils.py @@ -66,9 +66,7 @@ def compose_tasks( Return all the combinations of selecting 0 or more tasks from the selection sets """ - product_tasks = list( - product(*[selection_set.tasks + [None] for selection_set in selection_sets]) - ) + product_tasks = list(product(*[selection_set.tasks + [None] for selection_set in selection_sets])) composed_tasks = [] for tasks in product_tasks: if task_validator is not None: diff --git a/vendor/tau2/domains/telecom/tools.py b/vendor/tau2/domains/telecom/tools.py index 0a7fd05d..c092d35e 100644 --- a/vendor/tau2/domains/telecom/tools.py +++ b/vendor/tau2/domains/telecom/tools.py @@ -102,10 +102,7 @@ def get_customer_by_name(self, full_name: str, dob: str) -> List[Customer]: matching_customers = [] for customer in self.db.customers: - if ( - customer.full_name.lower() == full_name.lower() - and customer.date_of_birth == dob - ): + if customer.full_name.lower() == full_name.lower() and customer.date_of_birth == dob: matching_customers.append(customer) return matching_customers @@ -259,9 +256,7 @@ def get_details_by_id(self, id: str) -> Dict[str, Any]: raise ValueError(f"Unknown ID format or type: {id}") @is_tool(ToolType.WRITE) - def suspend_line( - self, customer_id: str, line_id: str, reason: str - ) -> Dict[str, Any]: + def suspend_line(self, customer_id: str, line_id: str, reason: str) -> Dict[str, Any]: """ Suspends a specific line (max 6 months). Checks: Line status must be Active. 
@@ -411,9 +406,7 @@ def _set_bill_to_paid(self, bill_id: str) -> None: bill.status = BillStatus.PAID return f"Bill {bill_id} set to paid" - def _apply_one_time_charge( - self, customer_id: str, amount: float, description: str - ) -> None: + def _apply_one_time_charge(self, customer_id: str, amount: float, description: str) -> None: """ Internal function to add a specific charge LineItem to the customer's next bill. Creates a pending bill if none exists. @@ -453,11 +446,7 @@ def _apply_one_time_charge( period_start=next_month, period_end=next_month.replace( month=next_month.month + 1 if next_month.month < 12 else 1, - year=( - next_month.year - if next_month.month < 12 - else next_month.year + 1 - ), + year=(next_month.year if next_month.month < 12 else next_month.year + 1), ) - timedelta(days=1), issue_date=next_month, @@ -501,9 +490,7 @@ def get_data_usage(self, customer_id: str, line_id: str) -> Dict[str, Any]: plan = self._get_plan_by_id(target_line.plan_id) today = get_today() - cycle_end_date = date( - today.year, today.month + 1 if today.month < 12 else 1, 1 - ) - timedelta(days=1) + cycle_end_date = date(today.year, today.month + 1 if today.month < 12 else 1, 1) - timedelta(days=1) return { "line_id": line_id, @@ -513,9 +500,7 @@ def get_data_usage(self, customer_id: str, line_id: str) -> Dict[str, Any]: "cycle_end_date": cycle_end_date, } - def set_data_usage( - self, customer_id: str, line_id: str, data_used_gb: float - ) -> str: + def set_data_usage(self, customer_id: str, line_id: str, data_used_gb: float) -> str: """ Sets the data usage for a line. Note: This method is not decorated as a tool but follows similar error handling. 
@@ -605,9 +590,7 @@ def transfer_to_human_agents(self, summary: str) -> str: return "Transfer successful" @is_tool(ToolType.WRITE) - def refuel_data( - self, customer_id: str, line_id: str, gb_amount: float - ) -> Dict[str, Any]: + def refuel_data(self, customer_id: str, line_id: str, gb_amount: float) -> Dict[str, Any]: """ Refuels data for a specific line, adding to the customer's bill. Checks: Line status must be Active, Customer owns the line. @@ -646,9 +629,7 @@ def refuel_data( f"Data refueling: {gb_amount} GB at ${plan.data_refueling_price_per_gb}/GB", ) - logger.info( - f"Data refueled for line {line_id}: {gb_amount} GB added, charge: ${charge_amount:.2f}" - ) + logger.info(f"Data refueled for line {line_id}: {gb_amount} GB added, charge: ${charge_amount:.2f}") return { "message": f"Successfully added {gb_amount} GB of data for line {line_id} for ${charge_amount:.2f}", @@ -721,27 +702,21 @@ def suspend_line_for_overdue_bill( return f"Line {line_id} suspended for unpaid bill {new_bill_id}. Contract ended: {contract_ended}" ### Assertions - def assert_data_refueling_amount( - self, customer_id: str, line_id: str, expected_amount: float - ) -> bool: + def assert_data_refueling_amount(self, customer_id: str, line_id: str, expected_amount: float) -> bool: """ Assert that the data refueling amount is as expected. """ target_line = self._get_target_line(customer_id, line_id) return abs(target_line.data_refueling_gb - expected_amount) < 1e-6 - def assert_line_status( - self, customer_id: str, line_id: str, expected_status: LineStatus - ) -> bool: + def assert_line_status(self, customer_id: str, line_id: str, expected_status: LineStatus) -> bool: """ Assert that the line status is as expected. 
""" target_line = self._get_target_line(customer_id, line_id) return target_line.status == expected_status - def assert_overdue_bill_exists( - self, customer_id: str, overdue_bill_id: str - ) -> bool: + def assert_overdue_bill_exists(self, customer_id: str, overdue_bill_id: str) -> bool: """ Assert that the overdue bill exists. """ diff --git a/vendor/tau2/domains/telecom/user_data_model.py b/vendor/tau2/domains/telecom/user_data_model.py index 77d838f9..218bbf93 100644 --- a/vendor/tau2/domains/telecom/user_data_model.py +++ b/vendor/tau2/domains/telecom/user_data_model.py @@ -99,12 +99,8 @@ def is_mms_basic_configured(self) -> bool: class VpnDetails(BaseModelNoExtra): """Holds details about the VPN connection if active.""" - server_address: Optional[str] = Field( - None, description="Address of the connected VPN server." - ) - protocol: Optional[str] = Field( - None, description="VPN protocol being used (e.g., WireGuard, OpenVPN)." - ) + server_address: Optional[str] = Field(None, description="Address of the connected VPN server.") + protocol: Optional[str] = Field(None, description="VPN protocol being used (e.g., WireGuard, OpenVPN).") server_performance: PerformanceLevel = Field( default=PerformanceLevel.UNKNOWN, validate_default=True, @@ -118,9 +114,7 @@ class AppPermissions(BaseModelNoExtra): sms: bool = Field(False, description="Permission to send/read SMS/MMS.") storage: bool = Field(False, description="Permission to access device storage.") phone: bool = Field(False, description="Permission to make/manage phone calls.") - network: bool = Field( - False, description="Permission to access network state/internet." - ) + network: bool = Field(False, description="Permission to access network state/internet.") class AppStatus(BaseModelNoExtra): @@ -146,22 +140,14 @@ class StatusBar(BaseModelNoExtra): validate_default=True, description="The network technology (2G, 3G, 4G, etc.) 
shown in the status bar.", ) - wifi_connected: bool = Field( - False, description="Whether WiFi is connected and shown in the status bar." - ) - airplane_mode: bool = Field( - False, description="Whether airplane mode is on and shown in the status bar." - ) - vpn_active: bool = Field( - False, description="Whether a VPN is active and shown in the status bar." - ) + wifi_connected: bool = Field(False, description="Whether WiFi is connected and shown in the status bar.") + airplane_mode: bool = Field(False, description="Whether airplane mode is on and shown in the status bar.") + vpn_active: bool = Field(False, description="Whether a VPN is active and shown in the status bar.") data_saver_active: bool = Field( False, description="Whether data saver mode is active and shown in the status bar.", ) - battery_level: int = Field( - 100, description="The battery level (0-100) shown in the status bar." - ) + battery_level: int = Field(100, description="The battery level (0-100) shown in the status bar.") # --- Main Device State Model --- @@ -201,9 +187,7 @@ class MockPhoneAttributes(BaseModelNoExtra): ) # --- Battery --- - battery_level: int = Field( - 80, description="The current battery level, from 0 to 100 percent." - ) + battery_level: int = Field(80, description="The current battery level, from 0 to 100 percent.") # --- Mobile Data --- data_enabled: bool = Field( @@ -230,9 +214,7 @@ class MockPhoneAttributes(BaseModelNoExtra): False, description="Whether the device is currently connected to a Wi-Fi network.", ) - wifi_ssid: Optional[str] = Field( - None, description="The name (SSID) of the connected Wi-Fi network, if any." 
- ) + wifi_ssid: Optional[str] = Field(None, description="The name (SSID) of the connected Wi-Fi network, if any.") wifi_signal_strength: SignalStrength = Field( default=SignalStrength.NONE, validate_default=True, @@ -240,9 +222,7 @@ class MockPhoneAttributes(BaseModelNoExtra): ) # --- Calling Features --- - wifi_calling_enabled: bool = Field( - False, description="Whether the Wi-Fi Calling feature is enabled." - ) + wifi_calling_enabled: bool = Field(False, description="Whether the Wi-Fi Calling feature is enabled.") wifi_calling_mms_over_wifi: bool = Field( False, description="Preference/capability to send/receive MMS over Wi-Fi (depends on carrier and device support).", @@ -259,9 +239,7 @@ class MockPhoneAttributes(BaseModelNoExtra): False, description="Whether a VPN profile is configured and potentially set to be 'always on' or manually enabled in settings.", ) - vpn_connected: bool = Field( - False, description="Whether there currently is an active VPN connection tunnel." - ) + vpn_connected: bool = Field(False, description="Whether there currently is an active VPN connection tunnel.") vpn_details: Optional[VpnDetails] = Field( None, description="Details about the active VPN connection, if connected." ) @@ -321,13 +299,9 @@ class UserSurroundings(BaseModelNoExtra): """Represents the physical surroundings of the user.""" name: Optional[str] = Field(None, description="The name of the user.") - phone_number: Optional[str] = Field( - None, description="The phone number of the user." - ) + phone_number: Optional[str] = Field(None, description="The phone number of the user.") is_abroad: bool = Field(False, description="Whether the user is currently abroad.") - roaming_allowed: bool = Field( - False, description="Whether the user is allowed to roam." 
- ) + roaming_allowed: bool = Field(False, description="Whether the user is allowed to roam.") signal_strength: dict[NetworkTechnology, SignalStrength] = Field( default_factory=lambda: { NetworkTechnology.TWO_G: SignalStrength.POOR, @@ -341,17 +315,13 @@ class UserSurroundings(BaseModelNoExtra): False, description="Whether the user has exceeded their data usage limit." ) line_active: bool = Field(True, description="Whether the user has an active line.") - payment_request: Optional[PaymentRequest] = Field( - None, description="The payment that the agent has requested." - ) + payment_request: Optional[PaymentRequest] = Field(None, description="The payment that the agent has requested.") class TelecomUserDB(DB): """Database interface for telecom domain.""" - device: MockPhoneAttributes = Field( - default_factory=MockPhoneAttributes, description="Mock phone device" - ) + device: MockPhoneAttributes = Field(default_factory=MockPhoneAttributes, description="Mock phone device") surroundings: UserSurroundings = Field( default_factory=UserSurroundings, description="User's physical surroundings" ) @@ -381,24 +351,16 @@ def main(): print("\n--- State after enabling Airplane Mode ---") print(f"Airplane Mode: {db.device.airplane_mode}") print(f"Network Status: {db.device.network_connection_status}") - print( - f"Helper - Potentially Online Mobile: {db.device.is_potentially_online_mobile()}" - ) + print(f"Helper - Potentially Online Mobile: {db.device.is_potentially_online_mobile()}") # 3. 
Simulate another problem: User disables Mobile Data and has wrong APN MMS URL # Start from default state again for clarity db = TelecomUserDB() update_2 = { "data_enabled": False, - "active_apn_settings": { # Update nested model - "mmsc_url": None # Simulate missing MMS config - }, + "active_apn_settings": {"mmsc_url": None}, # Update nested model # Simulate missing MMS config "app_statuses": { # Update nested dictionary/model - "messaging": { - "permissions": { - "storage": False - } # Update nested AppPermissions model field - } + "messaging": {"permissions": {"storage": False}} # Update nested AppPermissions model field }, } db.update_device(update_2) diff --git a/vendor/tau2/domains/telecom/user_tools.py b/vendor/tau2/domains/telecom/user_tools.py index aedf9fc4..aca28745 100644 --- a/vendor/tau2/domains/telecom/user_tools.py +++ b/vendor/tau2/domains/telecom/user_tools.py @@ -28,9 +28,7 @@ class TelecomUserTools(ToolKitBase): db: TelecomUserDB - network_mode_preference: NetworkModePreference = ( - NetworkModePreference.FOUR_G_5G_PREFERRED - ) + network_mode_preference: NetworkModePreference = NetworkModePreference.FOUR_G_5G_PREFERRED default_vpn_details: VpnDetails = VpnDetails( server_address="192.168.1.1", @@ -100,19 +98,14 @@ def _check_status_bar(self) -> str: SignalStrength.GOOD: "📶³ Good", SignalStrength.EXCELLENT: "📶⁴ Excellent", } - indicators.append( - signal_map.get(device.network_signal_strength, "📵 No Signal") - ) + indicators.append(signal_map.get(device.network_signal_strength, "📵 No Signal")) # Network technology if device.network_technology_connected != NetworkTechnology.NONE: indicators.append(device.network_technology_connected.value) # Data enabled indicator - if ( - device.data_enabled - and device.network_technology_connected != NetworkTechnology.NONE - ): + if device.data_enabled and device.network_technology_connected != NetworkTechnology.NONE: indicators.append("📱 Data Enabled") if device.data_saver_mode: indicators.append("🔽 Data 
Saver") @@ -186,9 +179,7 @@ def _check_network_mode_preference(self) -> NetworkModePreference: return self.device.network_mode_preference @is_tool(ToolType.WRITE) - def set_network_mode_preference( - self, mode: Union[NetworkModePreference, str] - ) -> str: + def set_network_mode_preference(self, mode: Union[NetworkModePreference, str]) -> str: """Changes the type of cellular network your phone prefers to connect to (e.g., 5G, LTE/4G, 3G). Higher-speed networks (LTE/5G) provide faster data but may use more battery.""" valid_mode = self._set_network_mode_preference(mode) if valid_mode is None: @@ -196,9 +187,7 @@ def set_network_mode_preference( status_update = f"Preferred Network Mode set to: {valid_mode.value}" return f"{status_update}\nStatus Bar: {self._check_status_bar()}" - def _set_network_mode_preference( - self, mode: Union[NetworkModePreference, str] - ) -> Optional[NetworkModePreference]: + def _set_network_mode_preference(self, mode: Union[NetworkModePreference, str]) -> Optional[NetworkModePreference]: """Sets the preferred network mode. This will trigger a network search. """ @@ -222,10 +211,7 @@ def _get_mobile_data_working(self) -> bool: - Data is not enabled - Data usage is exceeded """ - if ( - self.device.airplane_mode - or self.device.network_signal_strength == SignalStrength.NONE - ): + if self.device.airplane_mode or self.device.network_signal_strength == SignalStrength.NONE: return False if self.device.network_connection_status == NetworkStatus.NO_SERVICE: @@ -255,9 +241,7 @@ def run_speed_test(self) -> str: if description == "Very Poor": advice = "Connection is very slow. Basic web browsing might be difficult." elif description == "Poor": - advice = ( - "Connection is slow. Web browsing may be sluggish, streaming difficult." - ) + advice = "Connection is slow. Web browsing may be sluggish, streaming difficult." elif description == "Fair": advice = "Connection is okay for web browsing and some standard definition streaming." 
elif description == "Good": @@ -328,9 +312,7 @@ def _run_speed_test(self) -> Tuple[Optional[float], Optional[str]]: NetworkTechnology.FIVE_G: (50.0, 500.0), NetworkTechnology.NONE: (0.0, 0.0), } - min_speed, max_speed = tech_speed_map.get( - self.device.network_technology_connected, (0.0, 0.0) - ) + min_speed, max_speed = tech_speed_map.get(self.device.network_technology_connected, (0.0, 0.0)) # Adjust speed based on signal strength signal_factor_map = { @@ -343,9 +325,7 @@ def _run_speed_test(self) -> Tuple[Optional[float], Optional[str]]: signal_factor = signal_factor_map.get(self.device.network_signal_strength, 0.0) # Calculate simulated speed - simulated_speed = ( - (min_speed + max_speed) / 2.0 * signal_factor * base_speed_factor - ) + simulated_speed = (min_speed + max_speed) / 2.0 * signal_factor * base_speed_factor simulated_speed = round(simulated_speed, 2) # Determine description @@ -611,7 +591,7 @@ def reset_apn_settings(self) -> str: def _reset_apn_settings(self): """Resets your APN settings to the default settings. This will be applied at the next reboot.""" self.device.active_apn_settings.reset_at_reboot = True - return f"APN settings will reset at reboot." + return "APN settings will reset at reboot." def break_apn_settings(self) -> str: """Breaks the APN settings. This is fixed by calling reset_apn_settings().""" @@ -634,7 +614,9 @@ def check_wifi_status(self) -> str: if not status["enabled"]: return "Wi-Fi is turned OFF." if status["connected"]: - return f"Wi-Fi is ON and connected to '{status['ssid']}'. Signal strength: {status['signal_strength'].value}." + return ( + f"Wi-Fi is ON and connected to '{status['ssid']}'. Signal strength: {status['signal_strength'].value}." + ) else: return "Wi-Fi is ON but not connected to any network." 
@@ -702,9 +684,7 @@ def _toggle_wifi_calling(self) -> bool: self.device.wifi_calling_enabled = new_state return new_state - def set_wifi_calling( - self, enabled: bool, mms_over_wifi: Optional[bool] = None - ) -> str: + def set_wifi_calling(self, enabled: bool, mms_over_wifi: Optional[bool] = None) -> str: """Set the Wi-Fi Calling setting. Set MMS over WIFI accordingly if provided.""" if self.device.wifi_calling_enabled != enabled: self._toggle_wifi_calling() @@ -736,9 +716,7 @@ def _check_vpn_status(self) -> Dict[str, Any]: "enabled_setting": self.device.vpn_enabled_setting, "connected": self.device.vpn_connected, "details": ( - self.device.vpn_details.model_dump() - if self.device.vpn_details and self.device.vpn_connected - else None + self.device.vpn_details.model_dump() if self.device.vpn_details and self.device.vpn_connected else None ), } @@ -748,11 +726,7 @@ def connect_vpn(self) -> str: connected = self._connect_vpn() if connected is None: return "VPN already connected." - status_update = ( - "VPN connected successfully." - if connected - else "No VPN connection to connect." - ) + status_update = "VPN connected successfully." if connected else "No VPN connection to connect." return f"{status_update}\nStatus Bar: {self._check_status_bar()}" def _connect_vpn(self) -> Optional[bool]: @@ -769,11 +743,7 @@ def _connect_vpn(self) -> Optional[bool]: def disconnect_vpn(self) -> str: """Disconnects any active VPN (Virtual Private Network) connection. Stops routing your internet traffic through a VPN server, which might affect connection speed or access to content.""" disconnected = self._disconnect_vpn() - status_update = ( - "VPN disconnected successfully." - if disconnected - else "No active VPN connection to disconnect." - ) + status_update = "VPN disconnected successfully." if disconnected else "No active VPN connection to disconnect." 
return f"{status_update}\nStatus Bar: {self._check_status_bar()}" def _disconnect_vpn(self) -> bool: @@ -975,46 +945,34 @@ def simulate_network_search(self): self.device.network_connection_status = NetworkStatus.CONNECTED pref = self.device.network_mode_preference if pref == NetworkModePreference.FOUR_G_5G_PREFERRED: - five_g_signal = self.surroundings.signal_strength.get( - NetworkTechnology.FIVE_G, SignalStrength.NONE - ) + five_g_signal = self.surroundings.signal_strength.get(NetworkTechnology.FIVE_G, SignalStrength.NONE) if five_g_signal == SignalStrength.NONE: self.device.network_technology_connected = NetworkTechnology.FOUR_G - self.device.network_signal_strength = ( - self.surroundings.signal_strength.get( - NetworkTechnology.FOUR_G, SignalStrength.NONE - ) + self.device.network_signal_strength = self.surroundings.signal_strength.get( + NetworkTechnology.FOUR_G, SignalStrength.NONE ) else: self.device.network_technology_connected = NetworkTechnology.FIVE_G self.device.network_signal_strength = five_g_signal elif pref == NetworkModePreference.FOUR_G_ONLY: self.device.network_technology_connected = NetworkTechnology.FOUR_G - self.device.network_signal_strength = ( - self.surroundings.signal_strength.get( - NetworkTechnology.FOUR_G, SignalStrength.NONE - ) + self.device.network_signal_strength = self.surroundings.signal_strength.get( + NetworkTechnology.FOUR_G, SignalStrength.NONE ) elif pref == NetworkModePreference.THREE_G_ONLY: self.device.network_technology_connected = NetworkTechnology.THREE_G - self.device.network_signal_strength = ( - self.surroundings.signal_strength.get( - NetworkTechnology.THREE_G, SignalStrength.NONE - ) + self.device.network_signal_strength = self.surroundings.signal_strength.get( + NetworkTechnology.THREE_G, SignalStrength.NONE ) elif pref == NetworkModePreference.TWO_G_ONLY: self.device.network_technology_connected = NetworkTechnology.TWO_G - self.device.network_signal_strength = ( - self.surroundings.signal_strength.get( - 
NetworkTechnology.TWO_G, SignalStrength.NONE - ) + self.device.network_signal_strength = self.surroundings.signal_strength.get( + NetworkTechnology.TWO_G, SignalStrength.NONE ) else: # Default fallback self.device.network_technology_connected = NetworkTechnology.FOUR_G - self.device.network_signal_strength = ( - self.surroundings.signal_strength.get( - NetworkTechnology.FOUR_G, SignalStrength.NONE - ) + self.device.network_signal_strength = self.surroundings.signal_strength.get( + NetworkTechnology.FOUR_G, SignalStrength.NONE ) elif sim_status in [SimStatus.MISSING]: @@ -1120,9 +1078,7 @@ def assert_mobile_data_saver_mode_status(self, expected_status: bool) -> bool: """ return self.device.data_saver_mode == expected_status - def assert_internet_speed( - self, expected_speed: float, expected_desc: Optional[str] = None - ) -> bool: + def assert_internet_speed(self, expected_speed: float, expected_desc: Optional[str] = None) -> bool: """ Assert that the internet speed is as expected. """ diff --git a/vendor/tau2/environment/environment.py b/vendor/tau2/environment/environment.py index 07799bb7..2a69c176 100644 --- a/vendor/tau2/environment/environment.py +++ b/vendor/tau2/environment/environment.py @@ -258,9 +258,9 @@ def set_state( Set the state of the environment given initialization data and a list of messages. 
""" if self.solo_mode: - assert all( - [not isinstance(message, UserMessage) for message in message_history] - ), "User messages are not allowed in solo mode" + assert all([not isinstance(message, UserMessage) for message in message_history]), ( + "User messages are not allowed in solo mode" + ) def get_actions_from_messages( messages: list[Message], diff --git a/vendor/tau2/environment/server.py b/vendor/tau2/environment/server.py index 8faddd13..cce3fb2b 100644 --- a/vendor/tau2/environment/server.py +++ b/vendor/tau2/environment/server.py @@ -85,7 +85,8 @@ def _format_description(self, policy: str) -> str: description.append(content) # Add the tools section - description.append(""" + description.append( + """ ## Tools @@ -98,7 +99,8 @@ def _format_description(self, policy: str) -> str: ### Response Format All successful responses will return the tool's output directly. Errors will return a 400 status code with an error message. -""") +""" + ) return "\n".join(description) @@ -161,20 +163,14 @@ async def tool_endpoint( ) -> Any: try: if route_prefix == "user_tools": - result = self.environment.use_user_tool( - tool_name=tool_name, **request.model_dump() - ) + result = self.environment.use_user_tool(tool_name=tool_name, **request.model_dump()) else: - result = self.environment.use_tool( - tool_name=tool_name, **request.model_dump() - ) + result = self.environment.use_tool(tool_name=tool_name, **request.model_dump()) return result except Exception as e: raise HTTPException(status_code=400, detail=str(e)) - def _format_tool_description( - self, doc: str, returns: Optional[dict] = None, is_user_tool: bool = False - ) -> str: + def _format_tool_description(self, doc: str, returns: Optional[dict] = None, is_user_tool: bool = False) -> str: """Format tool documentation for better ReDoc rendering""" import re diff --git a/vendor/tau2/environment/tool.py b/vendor/tau2/environment/tool.py index cf55463c..fa63bdcb 100644 --- a/vendor/tau2/environment/tool.py +++ 
b/vendor/tau2/environment/tool.py @@ -49,9 +49,7 @@ class Tool(BaseTool): """The parameters of the Tool.""" returns: type[BaseModel] = Field(..., description="The return of the Tool") """The return of the Tool.""" - raises: List[Dict[str, Optional[str]]] = Field( - [], description="The exceptions raised by the Tool" - ) + raises: List[Dict[str, Optional[str]]] = Field([], description="The exceptions raised by the Tool") """The exceptions raised by the Tool.""" examples: List[str] = Field([], description="The examples of the Tool") """The examples of the Tool.""" @@ -79,9 +77,7 @@ def __init__(self, func: Callable, use_short_desc: bool = False, **predefined: A self.__doc__ = doc # overwrite the doc string @classmethod - def parse_data( - cls, sig: Signature, docstring: Optional[str], predefined: Dict[str, Any] - ) -> Dict[str, Any]: + def parse_data(cls, sig: Signature, docstring: Optional[str], predefined: Dict[str, Any]) -> Dict[str, Any]: """Parse data from the signature and docstring of a function.""" doc = parse(docstring or "") data: Dict[str, Any] = { @@ -127,9 +123,7 @@ def parse_data( data["returns"] = create_model("returns", returns=(anno, default)) # build raises - data["raises"] = [ - {"type": exc.type_name, "desc": exc.description} for exc in doc.raises - ] + data["raises"] = [{"type": exc.type_name, "desc": exc.description} for exc in doc.raises] # build examples data["examples"] = doc.examples diff --git a/vendor/tau2/environment/toolkit.py b/vendor/tau2/environment/toolkit.py index fff04d60..d21923e2 100644 --- a/vendor/tau2/environment/toolkit.py +++ b/vendor/tau2/environment/toolkit.py @@ -102,18 +102,10 @@ def tool_type(self, tool_name: str) -> ToolType: def get_statistics(self) -> dict[str, Any]: """Get the statistics of the ToolKit.""" num_tools = len(self.tools) - num_read_tools = sum( - self.tool_type(name) == ToolType.READ for name in self.tools - ) - num_write_tools = sum( - self.tool_type(name) == ToolType.WRITE for name in self.tools - ) - 
num_think_tools = sum( - self.tool_type(name) == ToolType.THINK for name in self.tools - ) - num_generic_tools = sum( - self.tool_type(name) == ToolType.GENERIC for name in self.tools - ) + num_read_tools = sum(self.tool_type(name) == ToolType.READ for name in self.tools) + num_write_tools = sum(self.tool_type(name) == ToolType.WRITE for name in self.tools) + num_think_tools = sum(self.tool_type(name) == ToolType.THINK for name in self.tools) + num_generic_tools = sum(self.tool_type(name) == ToolType.GENERIC for name in self.tools) return { "num_tools": num_tools, "num_read_tools": num_read_tools, diff --git a/vendor/tau2/environment/utils/interface_agent.py b/vendor/tau2/environment/utils/interface_agent.py index 015b9cd2..773bbfb9 100644 --- a/vendor/tau2/environment/utils/interface_agent.py +++ b/vendor/tau2/environment/utils/interface_agent.py @@ -216,15 +216,11 @@ def get_prompt_text() -> str: if message == ":n": console.print("[info]Starting new session...[/]") - interface_agent, message_history = init_session( - current_domain - ) + interface_agent, message_history = init_session(current_domain) continue with console.status("[info]Processing query...[/]"): - response, message_history = interface_agent.respond( - message, message_history - ) + response, message_history = interface_agent.respond(message, message_history) # Try to parse response as markdown for better formatting try: @@ -232,9 +228,7 @@ def get_prompt_text() -> str: console.print("\n[bold]Response:[/]") console.print(md) except Exception as e: - console.print( - f"\n[error]Error parsing response:[/] {str(e)}" - ) + console.print(f"\n[error]Error parsing response:[/] {str(e)}") console.print("\n[bold]Response:[/]", response.content) except KeyboardInterrupt: @@ -244,9 +238,7 @@ def get_prompt_text() -> str: console.print(f"\n[error]Error processing message:[/] {str(e)}") except Exception as e: - console.print( - f"\n[error]Error initializing domain '{current_domain}':[/] {str(e)}" - ) + 
console.print(f"\n[error]Error initializing domain '{current_domain}':[/] {str(e)}") new_domain = change_domain(console) if new_domain is None: return diff --git a/vendor/tau2/evaluator/__init__.py b/vendor/tau2/evaluator/__init__.py index 8b137891..e69de29b 100644 --- a/vendor/tau2/evaluator/__init__.py +++ b/vendor/tau2/evaluator/__init__.py @@ -1 +0,0 @@ - diff --git a/vendor/tau2/evaluator/evaluator.py b/vendor/tau2/evaluator/evaluator.py index fa3b4791..f206c3dc 100644 --- a/vendor/tau2/evaluator/evaluator.py +++ b/vendor/tau2/evaluator/evaluator.py @@ -33,9 +33,7 @@ def evaluate_simulation( }: return RewardInfo( reward=0.0, - info={ - "note": f"Simulation terminated prematurely. Termination reason: {simulation.termination_reason}" - }, + info={"note": f"Simulation terminated prematurely. Termination reason: {simulation.termination_reason}"}, ) if task.evaluation_criteria is None: return RewardInfo( diff --git a/vendor/tau2/evaluator/evaluator_action.py b/vendor/tau2/evaluator/evaluator_action.py index 518475ba..3576ce2e 100644 --- a/vendor/tau2/evaluator/evaluator_action.py +++ b/vendor/tau2/evaluator/evaluator_action.py @@ -59,10 +59,7 @@ def evaluate_actions( predicted_tool_calls: list[ToolCall] = [] for message in full_trajectory: - if ( - isinstance(message, AssistantMessage) - or isinstance(message, UserMessage) - ) and message.is_tool_call(): + if (isinstance(message, AssistantMessage) or isinstance(message, UserMessage)) and message.is_tool_call(): predicted_tool_calls.extend(message.tool_calls) # Check if all the gold actions are in the predicted actions diff --git a/vendor/tau2/evaluator/evaluator_communicate.py b/vendor/tau2/evaluator/evaluator_communicate.py index 43eecebf..8eada207 100644 --- a/vendor/tau2/evaluator/evaluator_communicate.py +++ b/vendor/tau2/evaluator/evaluator_communicate.py @@ -32,9 +32,7 @@ def calculate_reward( reward_breakdown={RewardType.COMMUNICATE: 1.0}, ) - communicate_info_checks = cls.evaluate_communicate_info( - 
full_trajectory, communicate_info - ) + communicate_info_checks = cls.evaluate_communicate_info(full_trajectory, communicate_info) # Calculate reward: 1 if all expectations are met, 0 otherwise all_expectations_met = all(result.met for result in communicate_info_checks) @@ -66,9 +64,7 @@ def evaluate_communicate_info( continue if not message.has_text_content(): continue - if info_str.lower() in message.content.lower().replace( - ",", "" - ): # TODO: This could be improved! + if info_str.lower() in message.content.lower().replace(",", ""): # TODO: This could be improved! found = True break if found: diff --git a/vendor/tau2/evaluator/evaluator_env.py b/vendor/tau2/evaluator/evaluator_env.py index 46c3e296..d2b724a4 100644 --- a/vendor/tau2/evaluator/evaluator_env.py +++ b/vendor/tau2/evaluator/evaluator_env.py @@ -49,24 +49,15 @@ def calculate_reward( ) initialization_data = None - if ( - task.initial_state is not None - and task.initial_state.initialization_data is not None - ): + if task.initial_state is not None and task.initial_state.initialization_data is not None: initialization_data = task.initial_state.initialization_data initialization_actions = None - if ( - task.initial_state is not None - and task.initial_state.initialization_actions is not None - ): + if task.initial_state is not None and task.initial_state.initialization_actions is not None: initialization_actions = task.initial_state.initialization_actions message_history = [] - if ( - task.initial_state is not None - and task.initial_state.message_history is not None - ): + if task.initial_state is not None and task.initial_state.message_history is not None: message_history = task.initial_state.message_history predicted_environment = environment_constructor(solo_mode=solo_mode) @@ -77,10 +68,7 @@ def calculate_reward( ) predicted_tool_calls: list[ToolCall] = [] for message in full_trajectory: - if ( - isinstance(message, AssistantMessage) - or isinstance(message, UserMessage) - ) and 
message.is_tool_call(): + if (isinstance(message, AssistantMessage) or isinstance(message, UserMessage)) and message.is_tool_call(): predicted_tool_calls.extend(message.tool_calls) # Setting up gold environment @@ -99,9 +87,7 @@ def calculate_reward( **action.arguments, ) except Exception as e: - logger.warning( - f"Error in golden actions {action.name}({action.arguments}): {e}" - ) + logger.warning(f"Error in golden actions {action.name}({action.arguments}): {e}") # Comparing the environments agent_db_hash = gold_environment.get_db_hash() diff --git a/vendor/tau2/metrics/agent_metrics.py b/vendor/tau2/metrics/agent_metrics.py index 3192e698..8f8c30b8 100644 --- a/vendor/tau2/metrics/agent_metrics.py +++ b/vendor/tau2/metrics/agent_metrics.py @@ -55,9 +55,7 @@ def get_metrics_df(results: Results) -> tuple[pd.DataFrame, int]: df = results.to_df() df["success"] = df.reward.apply(is_successful) if len(df.info_num_trials.unique()) > 1: - logger.warning( - f"All simulations must have the same number of trials. Found {df.info_num_trials.unique()}" - ) + logger.warning(f"All simulations must have the same number of trials. 
Found {df.info_num_trials.unique()}") max_k = df.info_num_trials.max() task_ids_counts = [(tid, count) for tid, count in df.task_id.value_counts().items()] @@ -78,9 +76,7 @@ def get_tasks_pass_hat_k(results: Results) -> pd.DataFrame: df, max_k = get_metrics_df(results) dfs = [] for k in range(1, max_k + 1): - res = df.groupby("task_id")["success"].apply( - lambda df: pass_hat_k(len(df), df.sum(), k) - ) + res = df.groupby("task_id")["success"].apply(lambda df: pass_hat_k(len(df), df.sum(), k)) res.name = f"pass^{k}" dfs.append(res) df_pass_hat_k = pd.concat(dfs, axis=1) diff --git a/vendor/tau2/metrics/break_down_metrics.py b/vendor/tau2/metrics/break_down_metrics.py index 3b6e9571..3ecf4e35 100644 --- a/vendor/tau2/metrics/break_down_metrics.py +++ b/vendor/tau2/metrics/break_down_metrics.py @@ -24,9 +24,7 @@ def get_write_tools(domain): return set(agent_write_tools), set(user_write_tools) -def analyze_reward( - reward_info: RewardInfo, agent_write_tools: set[str], user_write_tools: set[str] -): +def analyze_reward(reward_info: RewardInfo, agent_write_tools: set[str], user_write_tools: set[str]): """ Analyze the reward breakdown. 
""" @@ -34,26 +32,18 @@ def analyze_reward( try: if RewardType.COMMUNICATE in reward_info.reward_basis: communicate_success = ( - is_successful(reward_breakdown[RewardType.COMMUNICATE]) - if reward_breakdown is not None - else 0 + is_successful(reward_breakdown[RewardType.COMMUNICATE]) if reward_breakdown is not None else 0 ) else: communicate_success = None if RewardType.ENV_ASSERTION in reward_info.reward_basis: env_success = ( - is_successful(reward_breakdown[RewardType.ENV_ASSERTION]) - if reward_breakdown is not None - else 0 + is_successful(reward_breakdown[RewardType.ENV_ASSERTION]) if reward_breakdown is not None else 0 ) else: env_success = None if RewardType.DB in reward_info.reward_basis: - db_success = ( - is_successful(reward_breakdown[RewardType.DB]) - if reward_breakdown is not None - else 0 - ) + db_success = is_successful(reward_breakdown[RewardType.DB]) if reward_breakdown is not None else 0 else: db_success = None except Exception as e: @@ -110,13 +100,9 @@ def result_reward_analysis(results: Results): Analyze the reward breakdown. 
""" rows = [] - agent_write_tools, user_write_tools = get_write_tools( - results.info.environment_info.domain_name - ) + agent_write_tools, user_write_tools = get_write_tools(results.info.environment_info.domain_name) for simulation in results.simulations: - reward_analysis = analyze_reward( - simulation.reward_info, agent_write_tools, user_write_tools - ) + reward_analysis = analyze_reward(simulation.reward_info, agent_write_tools, user_write_tools) reward_analysis["task_id"] = simulation.task_id reward_analysis["trial"] = simulation.trial rows.append(reward_analysis) diff --git a/vendor/tau2/orchestrator/environment_manager.py b/vendor/tau2/orchestrator/environment_manager.py index 1b5b7bc9..f34d0758 100644 --- a/vendor/tau2/orchestrator/environment_manager.py +++ b/vendor/tau2/orchestrator/environment_manager.py @@ -145,9 +145,7 @@ async def status(): @self.app.post("/start_environment") async def start_env(request: StartEnvironmentRequest) -> EnvironmentResponse: - env_id = self.start_environment( - domain=request.domain, env_id=request.env_id - ) + env_id = self.start_environment(domain=request.domain, env_id=request.env_id) return EnvironmentResponse(env_id=env_id) @self.app.post("/{env_id}/set_state") @@ -169,9 +167,7 @@ async def get_info(env_id: str) -> EnvironmentInfo: return self.get_environment_info(env_id) @self.app.post("/{env_id}/tools/{tool_name}") - async def execute_tool( - env_id: str, tool_name: str, request: ToolCall - ) -> ToolMessage: + async def execute_tool(env_id: str, tool_name: str, request: ToolCall) -> ToolMessage: return self.execute_tool(env_id=env_id, tool_call=request) def get_environment_id(self) -> str: @@ -210,12 +206,8 @@ def set_environment_state( Set the state of an environment. 
""" - self.environments[env_id].set_state( - initialization_data, initialization_actions, message_history - ) - self.trajectories[env_id] = [ - msg for msg in message_history if is_valid_environment_message(msg) - ] + self.environments[env_id].set_state(initialization_data, initialization_actions, message_history) + self.trajectories[env_id] = [msg for msg in message_history if is_valid_environment_message(msg)] def stop_environment(self, env_id: str): """ @@ -225,9 +217,7 @@ def stop_environment(self, env_id: str): # Get the router instance router = self.app.router # Filter out the routes we want to remove - router.routes = [ - route for route in router.routes if route not in self.routes[env_id] - ] + router.routes = [route for route in router.routes if route not in self.routes[env_id]] del self.routes[env_id] if env_id in self.environments: diff --git a/vendor/tau2/orchestrator/orchestrator.py b/vendor/tau2/orchestrator/orchestrator.py index 172519c1..ebd4252a 100644 --- a/vendor/tau2/orchestrator/orchestrator.py +++ b/vendor/tau2/orchestrator/orchestrator.py @@ -31,9 +31,7 @@ class Role(str, Enum): ENV = "env" -DEFAULT_FIRST_AGENT_MESSAGE = AssistantMessage( - role="assistant", content="Hi! How can I help you today?", cost=0.0 -) +DEFAULT_FIRST_AGENT_MESSAGE = AssistantMessage(role="assistant", content="Hi! How can I help you today?", cost=0.0) class Orchestrator: @@ -82,12 +80,8 @@ def initialize(self): - Send the first message (default message from the agent to the user). 
""" initial_state = self.task.initial_state - initialization_data = ( - initial_state.initialization_data if initial_state is not None else None - ) - initialization_actions = ( - initial_state.initialization_actions if initial_state is not None else None - ) + initialization_data = initial_state.initialization_data if initial_state is not None else None + initialization_actions = initial_state.initialization_actions if initial_state is not None else None message_history = ( deepcopy(initial_state.message_history) if initial_state is not None and initial_state.message_history is not None @@ -101,12 +95,8 @@ def initialize(self): if self.solo_mode: assert self.environment.solo_mode, "Environment should be in solo mode" - assert isinstance(self.agent, LLMSoloAgent), ( - "Agent must be a LLMSoloAgent in solo mode" - ) - assert isinstance(self.user, DummyUser), ( - "User must be a DummyUser in solo mode" - ) + assert isinstance(self.agent, LLMSoloAgent), "Agent must be a LLMSoloAgent in solo mode" + assert isinstance(self.user, DummyUser), "User must be a DummyUser in solo mode" # Initialize Environment state self._initialize_environment( @@ -133,18 +123,10 @@ def initialize(self): else: # Last message is for the environment self.to_role = Role.ENV self.agent_state = self.agent.get_init_state( - message_history=[ - msg - for msg in message_history - if is_valid_agent_history_message(msg) - ] + message_history=[msg for msg in message_history if is_valid_agent_history_message(msg)] ) self.user_state = self.user.get_init_state( - message_history=[ - msg - for msg in message_history[:-1] - if is_valid_user_history_message(msg) - ] + message_history=[msg for msg in message_history[:-1] if is_valid_user_history_message(msg)] ) self.message = last_message if self.agent.is_stop(last_message): @@ -158,18 +140,10 @@ def initialize(self): else: # Last message is for the environment self.to_role = Role.ENV self.user_state = self.user.get_init_state( - message_history=[ - msg - for 
msg in message_history - if is_valid_user_history_message(msg) - ] + message_history=[msg for msg in message_history if is_valid_user_history_message(msg)] ) self.agent_state = self.agent.get_init_state( - message_history=[ - msg - for msg in message_history[:-1] - if is_valid_agent_history_message(msg) - ] + message_history=[msg for msg in message_history[:-1] if is_valid_agent_history_message(msg)] ) self.message = last_message self.done = UserSimulator.is_stop(last_message) @@ -181,34 +155,18 @@ def initialize(self): if last_message.requestor == "assistant": self.to_role = Role.AGENT self.agent_state = self.agent.get_init_state( - message_history=[ - msg - for msg in message_history[:-1] - if is_valid_agent_history_message(msg) - ] + message_history=[msg for msg in message_history[:-1] if is_valid_agent_history_message(msg)] ) self.user_state = self.user.get_init_state( - message_history=[ - msg - for msg in message_history - if is_valid_user_history_message(msg) - ] + message_history=[msg for msg in message_history if is_valid_user_history_message(msg)] ) else: self.to_role = Role.USER self.agent_state = self.agent.get_init_state( - message_history=[ - msg - for msg in message_history - if is_valid_agent_history_message(msg) - ] + message_history=[msg for msg in message_history if is_valid_agent_history_message(msg)] ) self.user_state = self.user.get_init_state( - message_history=[ - msg - for msg in message_history[:-1] - if is_valid_user_history_message(msg) - ] + message_history=[msg for msg in message_history[:-1] if is_valid_user_history_message(msg)] ) self.message = last_message else: @@ -228,9 +186,7 @@ def initialize(self): self.from_role = Role.AGENT self.to_role = Role.USER else: - first_message, agent_state = self.agent.generate_next_message( - None, self.agent_state - ) + first_message, agent_state = self.agent.generate_next_message(None, self.agent_state) self.trajectory = [first_message] self.message = first_message self.from_role = Role.AGENT @@ 
-290,17 +246,13 @@ def step(self): """ if self.done: raise ValueError("Simulation is done") - logger.debug( - f"Step {self.step_count}. Sending message from {self.from_role} to {self.to_role}" - ) + logger.debug(f"Step {self.step_count}. Sending message from {self.from_role} to {self.to_role}") logger.debug( f"Step {self.step_count}.\nFrom role: {self.from_role}\nTo role: {self.to_role}\nMessage: {self.message}" ) # AGENT/ENV -> USER if self.from_role in [Role.AGENT, Role.ENV] and self.to_role == Role.USER: - user_msg, self.user_state = self.user.generate_next_message( - self.message, self.user_state - ) + user_msg, self.user_state = self.user.generate_next_message(self.message, self.user_state) user_msg.validate() if UserSimulator.is_stop(user_msg): self.done = True @@ -313,12 +265,8 @@ def step(self): else: self.to_role = Role.AGENT # USER/ENV -> AGENT - elif ( - self.from_role == Role.USER or self.from_role == Role.ENV - ) and self.to_role == Role.AGENT: - agent_msg, self.agent_state = self.agent.generate_next_message( - self.message, self.agent_state - ) + elif (self.from_role == Role.USER or self.from_role == Role.ENV) and self.to_role == Role.AGENT: + agent_msg, self.agent_state = self.agent.generate_next_message(self.message, self.agent_state) agent_msg.validate() if self.agent.is_stop(agent_msg): self.done = True @@ -342,9 +290,7 @@ def step(self): "Number of tool calls and tool messages should be the same" ) self.trajectory.extend(tool_msgs) - if ( - len(tool_msgs) > 1 - ): # Packaging multiple tool messages into a MultiToolMessage + if len(tool_msgs) > 1: # Packaging multiple tool messages into a MultiToolMessage self.message = MultiToolMessage( role="tool", tool_messages=tool_msgs, @@ -354,9 +300,7 @@ def step(self): self.to_role = self.from_role self.from_role = Role.ENV else: - raise ValueError( - f"Invalid role combination. From role: {self.from_role}, To role: {self.to_role}" - ) + raise ValueError(f"Invalid role combination. 
From role: {self.from_role}, To role: {self.to_role}") self.step_count += 1 self.environment.sync_tools() @@ -403,9 +347,7 @@ def validate_message_history(cls, message_history: list[Message]): if num_expected_tool_messages == 0 or requestor is None: raise ValueError("No tool messages expected.") if requestor != msg.requestor: - raise ValueError( - f"Got tool message from {msg.requestor}, expected {requestor}." - ) + raise ValueError(f"Got tool message from {msg.requestor}, expected {requestor}.") num_expected_tool_messages -= 1 else: raise ValueError(f"Invalid message type: {type(msg)}") @@ -435,13 +377,9 @@ def _count_errors(self, message_history: list[Message]) -> int: """ Count the number of errors in the message history. """ - return sum( - 1 for msg in message_history if isinstance(msg, ToolMessage) and msg.error - ) + return sum(1 for msg in message_history if isinstance(msg, ToolMessage) and msg.error) - def _add_timestamps( - self, message_history: list[Message] - ) -> list[tuple[str, Message]]: + def _add_timestamps(self, message_history: list[Message]) -> list[tuple[str, Message]]: """ Add timestamps to the message history. This is used to sort the messages by timestamp. diff --git a/vendor/tau2/orchestrator/utils.py b/vendor/tau2/orchestrator/utils.py index 5b119813..d15b6435 100644 --- a/vendor/tau2/orchestrator/utils.py +++ b/vendor/tau2/orchestrator/utils.py @@ -5,6 +5,4 @@ def is_valid_environment_message(msg: Message) -> bool: """ Check if the message is valid to the environment. 
""" - return isinstance(msg, ToolMessage) or ( - isinstance(msg, AssistantMessage) and msg.is_tool_call() - ) + return isinstance(msg, ToolMessage) or (isinstance(msg, AssistantMessage) and msg.is_tool_call()) diff --git a/vendor/tau2/registry.py b/vendor/tau2/registry.py index 764b917c..fbdf45c0 100644 --- a/vendor/tau2/registry.py +++ b/vendor/tau2/registry.py @@ -7,29 +7,25 @@ from vendor.tau2.agent.base import BaseAgent from vendor.tau2.agent.llm_agent import LLMAgent, LLMGTAgent, LLMSoloAgent from vendor.tau2.data_model.tasks import Task -from vendor.tau2.domains.airline.environment import \ - get_environment as airline_domain_get_environment -from vendor.tau2.domains.airline.environment import \ - get_tasks as airline_domain_get_tasks -from vendor.tau2.domains.mock.environment import \ - get_environment as mock_domain_get_environment -from vendor.tau2.domains.mock.environment import get_tasks as mock_domain_get_tasks -from vendor.tau2.domains.retail.environment import \ - get_environment as retail_domain_get_environment -from vendor.tau2.domains.retail.environment import \ - get_tasks as retail_domain_get_tasks -from vendor.tau2.domains.telecom.environment import \ - get_environment_manual_policy as \ - telecom_domain_get_environment_manual_policy -from vendor.tau2.domains.telecom.environment import \ - get_environment_workflow_policy as \ - telecom_domain_get_environment_workflow_policy -from vendor.tau2.domains.telecom.environment import \ - get_tasks as telecom_domain_get_tasks -from vendor.tau2.domains.telecom.environment import \ - get_tasks_full as telecom_domain_get_tasks_full -from vendor.tau2.domains.telecom.environment import \ - get_tasks_small as telecom_domain_get_tasks_small +from vendor.tau2.domains.airline.environment import ( + get_environment as airline_domain_get_environment, + get_tasks as airline_domain_get_tasks, +) +from vendor.tau2.domains.mock.environment import ( + get_environment as mock_domain_get_environment, + get_tasks as 
mock_domain_get_tasks, +) +from vendor.tau2.domains.retail.environment import ( + get_environment as retail_domain_get_environment, + get_tasks as retail_domain_get_tasks, +) +from vendor.tau2.domains.telecom.environment import ( + get_environment_manual_policy as telecom_domain_get_environment_manual_policy, + get_environment_workflow_policy as telecom_domain_get_environment_workflow_policy, + get_tasks as telecom_domain_get_tasks, + get_tasks_full as telecom_domain_get_tasks_full, + get_tasks_small as telecom_domain_get_tasks_small, +) from vendor.tau2.environment.environment import Environment from vendor.tau2.user.base import BaseUser from vendor.tau2.user.user_simulator import DummyUser, UserSimulator @@ -184,13 +180,13 @@ def get_info(self) -> RegistryInfo: registry.register_domain(retail_domain_get_environment, "retail") registry.register_tasks(retail_domain_get_tasks, "retail") registry.register_domain(telecom_domain_get_environment_manual_policy, "telecom") - registry.register_domain( - telecom_domain_get_environment_workflow_policy, "telecom-workflow" - ) + registry.register_domain(telecom_domain_get_environment_workflow_policy, "telecom-workflow") registry.register_tasks(telecom_domain_get_tasks_full, "telecom_full") registry.register_tasks(telecom_domain_get_tasks_small, "telecom_small") registry.register_tasks(telecom_domain_get_tasks, "telecom") registry.register_tasks(telecom_domain_get_tasks, "telecom-workflow") - logger.debug(f"Default components registered successfully. Registry info: {json.dumps(registry.get_info().model_dump(), indent=2)}") + logger.debug( + f"Default components registered successfully. 
Registry info: {json.dumps(registry.get_info().model_dump(), indent=2)}" + ) except Exception as e: logger.error(f"Error initializing registry: {str(e)}") diff --git a/vendor/tau2/run.py b/vendor/tau2/run.py index c2813d34..d49521ab 100644 --- a/vendor/tau2/run.py +++ b/vendor/tau2/run.py @@ -8,8 +8,7 @@ from loguru import logger from vendor.tau2.agent.llm_agent import LLMAgent, LLMGTAgent, LLMSoloAgent -from vendor.tau2.data_model.simulation import (AgentInfo, Info, Results, RunConfig, - SimulationRun, UserInfo) +from vendor.tau2.data_model.simulation import AgentInfo, Info, Results, RunConfig, SimulationRun, UserInfo from vendor.tau2.data_model.tasks import Task from vendor.tau2.environment.environment import Environment, EnvironmentInfo from vendor.tau2.evaluator.evaluator import EvaluationType, evaluate_simulation @@ -29,9 +28,7 @@ def get_options() -> RegistryInfo: return registry.get_info() -def get_environment_info( - domain_name: str, include_tool_info: bool = False -) -> EnvironmentInfo: +def get_environment_info(domain_name: str, include_tool_info: bool = False) -> EnvironmentInfo: """Get information about the environment for a registered Domain""" global registry env_constructor = registry.get_env_constructor(domain_name) @@ -59,14 +56,10 @@ def get_tasks( if task_ids is None: tasks = load_tasks(task_set_name=task_set_name) else: - tasks = [ - task for task in load_tasks(task_set_name=task_set_name) if task.id in task_ids - ] + tasks = [task for task in load_tasks(task_set_name=task_set_name) if task.id in task_ids] if task_ids is not None and len(tasks) != len(task_ids): missing_tasks = set(task_ids) - set([task.id for task in tasks]) - raise ValueError( - f"Not all tasks were found for task set {task_set_name}: {missing_tasks}" - ) + raise ValueError(f"Not all tasks were found for task set {task_set_name}: {missing_tasks}") if num_tasks is not None: tasks = tasks[:num_tasks] return tasks @@ -100,13 +93,17 @@ def run_domain(config: RunConfig) -> 
Results: total_num_tasks = len(tasks) tasks = [task for task in tasks if LLMGTAgent.check_valid_task(task)] num_tasks = len(tasks) - console_text = Text(text=f"Running {num_tasks} out of {total_num_tasks} tasks for GT agent.", style="bold green") + console_text = Text( + text=f"Running {num_tasks} out of {total_num_tasks} tasks for GT agent.", style="bold green" + ) ConsoleDisplay.console.print(console_text) if "solo" in config.agent: total_num_tasks = len(tasks) tasks = [task for task in tasks if LLMSoloAgent.check_valid_task(task)] num_tasks = len(tasks) - console_text = Text(text=f"Running {num_tasks} out of {total_num_tasks} tasks for solo agent.", style="bold green") + console_text = Text( + text=f"Running {num_tasks} out of {total_num_tasks} tasks for solo agent.", style="bold green" + ) ConsoleDisplay.console.print(console_text) num_trials = config.num_trials @@ -244,9 +241,7 @@ def run_tasks( with open(save_to, "r") as fp: prev_simulation_results = Results.model_validate_json(fp.read()) # Check if the run config has changed - if get_pydantic_hash(prev_simulation_results.info) != get_pydantic_hash( - simulation_results.info - ): + if get_pydantic_hash(prev_simulation_results.info) != get_pydantic_hash(simulation_results.info): diff = show_dict_diff( prev_simulation_results.info.model_dump(), simulation_results.info.model_dump(), @@ -279,14 +274,12 @@ def run_tasks( "The task set has changed. Please delete the existing file or use a different save_to name." ) # Check which of the runs have already been done - done_runs = set( - [ - (sim.trial, sim.task_id, sim.seed) - for sim in prev_simulation_results.simulations - ] - ) + done_runs = set([(sim.trial, sim.task_id, sim.seed) for sim in prev_simulation_results.simulations]) simulation_results = prev_simulation_results - console_text = Text(text=f"Resuming run from {len(done_runs)} runs. 
{len(tasks) * num_trials - len(done_runs)} runs remaining.", style="bold yellow") + console_text = Text( + text=f"Resuming run from {len(done_runs)} runs. {len(tasks) * num_trials - len(done_runs)} runs remaining.", + style="bold yellow", + ) ConsoleDisplay.console.print(console_text) # Create new save file else: @@ -338,7 +331,10 @@ def _run(task: Task, trial: int, seed: int, progress_str: str) -> SimulationRun: for trial in range(num_trials): for i, task in enumerate(tasks): if (trial, task.id, seeds[trial]) in done_runs: - console_text = Text(text=f"Skipping task {task.id}, trial {trial} because it has already been run.", style="bold yellow") + console_text = Text( + text=f"Skipping task {task.id}, trial {trial} because it has already been run.", + style="bold yellow", + ) ConsoleDisplay.console.print(console_text) continue progress_str = f"{i}/{len(tasks)} (trial {trial + 1}/{num_trials})" @@ -394,9 +390,7 @@ def run_task( if max_errors <= 0: raise ValueError("Max errors must be greater than 0") global registry - logger.info( - f"STARTING SIMULATION: Domain: {domain}, Task: {task.id}, Agent: {agent}, User: {user}" - ) + logger.info(f"STARTING SIMULATION: Domain: {domain}, Task: {task.id}, Agent: {agent}, User: {user}") environment_constructor = registry.get_env_constructor(domain) environment = environment_constructor() AgentConstructor = registry.get_agent_constructor(agent) @@ -429,9 +423,7 @@ def run_task( task=task, ) else: - raise ValueError( - f"Unknown agent type: {AgentConstructor}. Should be LLMAgent or LLMSoloAgent" - ) + raise ValueError(f"Unknown agent type: {AgentConstructor}. 
Should be LLMAgent or LLMSoloAgent") try: user_tools = environment.get_user_tools() except Exception: @@ -439,9 +431,7 @@ def run_task( UserConstructor = registry.get_user_constructor(user) if issubclass(UserConstructor, DummyUser): - assert isinstance(agent, LLMSoloAgent), ( - "Dummy user can only be used with solo agent" - ) + assert isinstance(agent, LLMSoloAgent), "Dummy user can only be used with solo agent" user = UserConstructor( tools=user_tools, diff --git a/vendor/tau2/scripts/show_domain_doc.py b/vendor/tau2/scripts/show_domain_doc.py index a7d55cdd..b1b4c8ee 100755 --- a/vendor/tau2/scripts/show_domain_doc.py +++ b/vendor/tau2/scripts/show_domain_doc.py @@ -59,9 +59,7 @@ def main(domain: str): except KeyError: available_domains = registry.get_domains() - logger.error( - f"Domain '{domain}' not found. Available domains: {available_domains}" - ) + logger.error(f"Domain '{domain}' not found. Available domains: {available_domains}") exit(1) except Exception as e: logger.error(f"Failed to start domain documentation server: {str(e)}") diff --git a/vendor/tau2/scripts/start_servers.py b/vendor/tau2/scripts/start_servers.py index 7a7596be..ecbec88c 100755 --- a/vendor/tau2/scripts/start_servers.py +++ b/vendor/tau2/scripts/start_servers.py @@ -18,9 +18,7 @@ def kill_process_on_port(port): connections = proc.net_connections() for conn in connections: if hasattr(conn, "laddr") and conn.laddr.port == port: - logger.warning( - f"Killing existing process {proc.pid} on port {port}" - ) + logger.warning(f"Killing existing process {proc.pid} on port {port}") proc.terminate() time.sleep(0.5) # Give it a moment to terminate if proc.is_running(): # If still running @@ -82,9 +80,7 @@ def signal_handler(signum, frame): try: with ThreadPoolExecutor(max_workers=len(servers)) as executor: # Start each server in a separate thread - futures = [ - executor.submit(run_server, command, port) for command, port in servers - ] + futures = [executor.submit(run_server, command, port) 
for command, port in servers] # Wait for all servers to complete for future in futures: diff --git a/vendor/tau2/scripts/view_simulations.py b/vendor/tau2/scripts/view_simulations.py index b8fad2be..e3357477 100644 --- a/vendor/tau2/scripts/view_simulations.py +++ b/vendor/tau2/scripts/view_simulations.py @@ -23,9 +23,7 @@ def get_available_simulations(): return sorted([f for f in sim_dir.glob("*.json")]) -def display_simulation_list( - results: Results, only_show_failed: bool = False, only_show_all_failed: bool = False -): +def display_simulation_list(results: Results, only_show_failed: bool = False, only_show_all_failed: bool = False): """Display a numbered list of simulations with basic info.""" ConsoleDisplay.console.print("\n[bold blue]Available Simulations:[/]") @@ -74,9 +72,7 @@ def display_available_files(files): ConsoleDisplay.console.print(f"[cyan]{i}.[/] {file.name}") -def display_simulation_with_task( - simulation, task, results_file: str, sim_index: int, show_details: bool = True -): +def display_simulation_with_task(simulation, task, results_file: str, sim_index: int, show_details: bool = True): """Display a simulation along with its associated task.""" ConsoleDisplay.console.print("\n" + "=" * 80) # Separator ConsoleDisplay.console.print("[bold blue]Task Details:[/]") @@ -113,18 +109,12 @@ def find_task_by_id(tasks, task_id): def find_simulation_by_task_id_and_trial(results, task_id, trial): """Get a simulation by its task ID and trial number.""" return next( - ( - sim - for sim in results.simulations - if sim.task_id == task_id and sim.trial == trial - ), + (sim for sim in results.simulations if sim.task_id == task_id and sim.trial == trial), None, ) -def save_simulation_note( - simulation, task, note: str, results_file: str, sim_index: int -): +def save_simulation_note(simulation, task, note: str, results_file: str, sim_index: int): """Save a note about a simulation to a CSV file.""" notes_file = Path(DATA_DIR) / "simulations" / 
"simulation_notes.csv" file_exists = notes_file.exists() @@ -137,9 +127,11 @@ def save_simulation_note( "trial": simulation.trial, "duration": simulation.duration, "reward": simulation.reward_info.reward if simulation.reward_info else None, - "db_match": simulation.reward_info.db_check.db_match - if simulation.reward_info and simulation.reward_info.db_check - else None, + "db_match": ( + simulation.reward_info.db_check.db_match + if simulation.reward_info and simulation.reward_info.db_check + else None + ), "results_file": results_file, "sim_index": sim_index, "note": note, @@ -165,9 +157,7 @@ def main( sim_files = [Path(sim_file)] if not sim_files: - ConsoleDisplay.console.print( - "[red]No simulation files found in data/simulations/[/]" - ) + ConsoleDisplay.console.print("[red]No simulation files found in data/simulations/[/]") return results = None @@ -176,20 +166,14 @@ def main( # Show main menu ConsoleDisplay.console.print("\n[bold yellow]Main Menu:[/]") ConsoleDisplay.console.print("1. Select simulation file") - ConsoleDisplay.console.print( - " [dim]Choose a simulation results file to load and analyze[/]" - ) + ConsoleDisplay.console.print(" [dim]Choose a simulation results file to load and analyze[/]") if results: ConsoleDisplay.console.print("2. View agent performance metrics") ConsoleDisplay.console.print(" [dim]Display agent performance metrics[/]") ConsoleDisplay.console.print("3. View simulation") - ConsoleDisplay.console.print( - " [dim]Examine a specific simulation in detail with all its data[/]" - ) + ConsoleDisplay.console.print(" [dim]Examine a specific simulation in detail with all its data[/]") ConsoleDisplay.console.print("4. View task details") - ConsoleDisplay.console.print( - " [dim]Look at the configuration and parameters of a specific task[/]" - ) + ConsoleDisplay.console.print(" [dim]Look at the configuration and parameters of a specific task[/]") ConsoleDisplay.console.print("5. 
Exit") ConsoleDisplay.console.print(" [dim]Close the simulation viewer[/]") choices = ["1", "2", "3", "4", "5"] @@ -200,17 +184,13 @@ def main( choices = ["1", "2"] default_choice = "1" - choice = Prompt.ask( - "\nWhat would you like to do?", choices=choices, default=default_choice - ) + choice = Prompt.ask("\nWhat would you like to do?", choices=choices, default=default_choice) if choice == "1": # Show available files and get selection display_available_files(sim_files) # default to view the last file - file_num = IntPrompt.ask( - f"\nSelect file number (1-{len(sim_files)})", default=len(sim_files) - ) + file_num = IntPrompt.ask(f"\nSelect file number (1-{len(sim_files)})", default=len(sim_files)) if 1 <= file_num <= len(sim_files): try: @@ -219,13 +199,9 @@ def main( ConsoleDisplay.console.print( f"\n[bold green]Loaded {len(results.simulations)} simulations from {current_file}[/]" ) - results.simulations = sorted( - results.simulations, key=lambda x: (x.task_id, x.trial) - ) + results.simulations = sorted(results.simulations, key=lambda x: (x.task_id, x.trial)) except Exception as e: - ConsoleDisplay.console.print( - f"[red]Error loading results:[/] {str(e)}" - ) + ConsoleDisplay.console.print(f"[red]Error loading results:[/] {str(e)}") else: ConsoleDisplay.console.print("[red]Invalid file number[/]") @@ -245,21 +221,15 @@ def main( # Get simulation selection by index sim_count = len(results.simulations) - sim_index = IntPrompt.ask( - f"\nEnter simulation number (1-{sim_count})", default=1 - ) + sim_index = IntPrompt.ask(f"\nEnter simulation number (1-{sim_count})", default=1) if 1 <= sim_index <= sim_count: sim = results.simulations[sim_index - 1] task = find_task_by_id(results.tasks, sim.task_id) if task: - display_simulation_with_task( - sim, task, current_file, sim_index, show_details=True - ) + display_simulation_with_task(sim, task, current_file, sim_index, show_details=True) else: - ConsoleDisplay.console.print( - f"[red]Warning: Could not find task for 
simulation {sim.id}[/]" - ) + ConsoleDisplay.console.print(f"[red]Warning: Could not find task for simulation {sim.id}[/]") ConsoleDisplay.display_simulation(sim, show_details=True) continue else: diff --git a/vendor/tau2/user/base.py b/vendor/tau2/user/base.py index b47bdb9a..1fe018b6 100644 --- a/vendor/tau2/user/base.py +++ b/vendor/tau2/user/base.py @@ -63,9 +63,7 @@ def flip_roles(self) -> list[APICompatibleMessage]: ) ) else: - raise ValueError( - f"Tool calls are not supported in the flipped messages: {message}" - ) + raise ValueError(f"Tool calls are not supported in the flipped messages: {message}") elif isinstance(message, ToolMessage): if message.requestor == "user": # Only add tool messages for the user @@ -77,9 +75,7 @@ def flip_roles(self) -> list[APICompatibleMessage]: ) ) else: - raise ValueError( - f"Tool messages should be sent to the user in this message history: {message}" - ) + raise ValueError(f"Tool messages should be sent to the user in this message history: {message}") else: print(message, type(message)) raise ValueError(f"Unknown message role: {message.role}") @@ -100,9 +96,7 @@ def __init__( self.instructions = instructions @abstractmethod - async def get_init_state( - self, message_history: Optional[list[Message]] = None - ) -> UserState: + async def get_init_state(self, message_history: Optional[list[Message]] = None) -> UserState: """Get the initial state of the user simulator. Args: diff --git a/vendor/tau2/user/user_simulator.py b/vendor/tau2/user/user_simulator.py index d5508409..31581bdc 100644 --- a/vendor/tau2/user/user_simulator.py +++ b/vendor/tau2/user/user_simulator.py @@ -100,9 +100,9 @@ def get_init_state(self, message_history: Optional[list[Message]] = None) -> Use """ if message_history is None: message_history = [] - assert all( - is_valid_user_history_message(m) for m in message_history - ), "Invalid user message history. User messages must be of type UserMessage, AssistantMessage, or ToolMessage to User." 
+ assert all(is_valid_user_history_message(m) for m in message_history), ( + "Invalid user message history. User messages must be of type UserMessage, AssistantMessage, or ToolMessage to User." + ) user_state = UserState( system_messages=[SystemMessage(role="system", content=self.system_prompt)], diff --git a/vendor/tau2/utils/display.py b/vendor/tau2/utils/display.py index 674efb92..8e83e2cd 100644 --- a/vendor/tau2/utils/display.py +++ b/vendor/tau2/utils/display.py @@ -95,9 +95,7 @@ def display_task(cls, task: Task): if task.description.purpose: content_parts.append(f"[white]Purpose:[/] {task.description.purpose}") if task.description.relevant_policies: - content_parts.append( - f"[white]Relevant Policies:[/] {task.description.relevant_policies}" - ) + content_parts.append(f"[white]Relevant Policies:[/] {task.description.relevant_policies}") if task.description.notes: content_parts.append(f"[white]Notes:[/] {task.description.notes}") @@ -108,14 +106,10 @@ def display_task(cls, task: Task): scenario_parts.append(f"[white]Persona:[/] {task.user_scenario.persona}") # User Instruction - scenario_parts.append( - f"[white]Task Instructions:[/] {task.user_scenario.instructions}" - ) + scenario_parts.append(f"[white]Task Instructions:[/] {task.user_scenario.instructions}") if scenario_parts: - content_parts.append( - "[bold cyan]User Scenario:[/]\n" + "\n".join(scenario_parts) - ) + content_parts.append("[bold cyan]User Scenario:[/]\n" + "\n".join(scenario_parts)) # Initial State section if task.initial_state: @@ -134,9 +128,7 @@ def display_task(cls, task: Task): ) if initial_state_parts: - content_parts.append( - "[bold cyan]Initial State:[/]\n" + "\n".join(initial_state_parts) - ) + content_parts.append("[bold cyan]Initial State:[/]\n" + "\n".join(initial_state_parts)) # Evaluation Criteria section if task.evaluation_criteria: @@ -154,15 +146,11 @@ def display_task(cls, task: Task): f"[white]Information to 
Communicate:[/]\n{json.dumps(task.evaluation_criteria.communicate_info, indent=2)}" ) if eval_parts: - content_parts.append( - "[bold cyan]Evaluation Criteria:[/]\n" + "\n".join(eval_parts) - ) + content_parts.append("[bold cyan]Evaluation Criteria:[/]\n" + "\n".join(eval_parts)) content = "\n\n".join(content_parts) # Create and display panel - task_panel = Panel( - content, title="[bold blue]Task Details", border_style="blue", expand=True - ) + task_panel = Panel(content, title="[bold blue]Task Details", border_style="blue", expand=True) cls.console.print(task_panel) @@ -203,18 +191,11 @@ def display_simulation(cls, simulation: SimulationRun, show_details: bool = True marker = "✅" if is_successful(simulation.reward_info.reward) else "❌" sim_info.append("Reward: ", style="bold cyan") if simulation.reward_info.reward_breakdown: - breakdown = sorted( - [ - f"{k.value}: {v:.1f}" - for k, v in simulation.reward_info.reward_breakdown.items() - ] - ) + breakdown = sorted([f"{k.value}: {v:.1f}" for k, v in simulation.reward_info.reward_breakdown.items()]) else: breakdown = [] - sim_info.append( - f"{marker} {simulation.reward_info.reward:.4f} ({', '.join(breakdown)})\n" - ) + sim_info.append(f"{marker} {simulation.reward_info.reward:.4f} ({', '.join(breakdown)})\n") # Add DB check info if present if simulation.reward_info.db_check: @@ -243,9 +224,7 @@ def display_simulation(cls, simulation: SimulationRun, show_details: bool = True if simulation.reward_info.communicate_checks: sim_info.append("\nCommunicate Checks:\n", style="bold magenta") for i, check in enumerate(simulation.reward_info.communicate_checks): - sim_info.append( - f"- {i}: {check.info} {'✅' if check.met else '❌'}\n" - ) + sim_info.append(f"- {i}: {check.info} {'✅' if check.met else '❌'}\n") # Add NL assertions if present if simulation.reward_info.nl_assertions: @@ -261,9 +240,7 @@ def display_simulation(cls, simulation: SimulationRun, show_details: bool = True for key, value in 
simulation.reward_info.info.items(): sim_info.append(f"{key}: {value}\n") - cls.console.print( - Panel(sim_info, title="Simulation Overview", border_style="blue") - ) + cls.console.print(Panel(sim_info, title="Simulation Overview", border_style="blue")) # Create messages table if simulation.messages: @@ -390,15 +367,8 @@ def display_simulation(cls, sim: SimulationRun) -> str: # Add reward info if present if sim.reward_info: - breakdown = sorted( - [ - f"{k.value}: {v:.1f}" - for k, v in sim.reward_info.reward_breakdown.items() - ] - ) - output.append( - f"**Reward**: {sim.reward_info.reward:.4f} ({', '.join(breakdown)})\n" - ) + breakdown = sorted([f"{k.value}: {v:.1f}" for k, v in sim.reward_info.reward_breakdown.items()]) + output.append(f"**Reward**: {sim.reward_info.reward:.4f} ({', '.join(breakdown)})\n") output.append(f"**Reward**: {sim.reward_info.reward:.4f}") # Add DB check info if present @@ -428,9 +398,7 @@ def display_simulation(cls, sim: SimulationRun) -> str: if sim.reward_info.communicate_checks: output.append("\n**Communicate Checks**") for i, check in enumerate(sim.reward_info.communicate_checks): - output.append( - f"- {i}: {check.info} {'✅' if check.met else '❌'} {check.justification}" - ) + output.append(f"- {i}: {check.info} {'✅' if check.met else '❌'} {check.justification}") # Add NL assertions if present if sim.reward_info.nl_assertions: diff --git a/vendor/tau2/utils/llm_utils.py b/vendor/tau2/utils/llm_utils.py index 750c75fe..895d3f56 100644 --- a/vendor/tau2/utils/llm_utils.py +++ b/vendor/tau2/utils/llm_utils.py @@ -1,4 +1,6 @@ import json +import logging +import os import re from typing import Any, Optional @@ -7,8 +9,6 @@ from litellm.caching.caching import Cache from litellm.main import ModelResponse, Usage from loguru import logger -import logging -import os from vendor.tau2.config import ( DEFAULT_LLM_CACHE_TYPE, diff --git a/vendor/tau2/utils/pydantic_utils.py b/vendor/tau2/utils/pydantic_utils.py index 5c34acd8..e46454eb 100644 
--- a/vendor/tau2/utils/pydantic_utils.py +++ b/vendor/tau2/utils/pydantic_utils.py @@ -21,9 +21,7 @@ def get_pydantic_hash(obj: BaseModel) -> str: return get_dict_hash(hash_dict) -def update_pydantic_model_with_dict( - model_instance: T, update_data: Dict[str, Any] -) -> T: +def update_pydantic_model_with_dict(model_instance: T, update_data: Dict[str, Any]) -> T: """ Return an updated BaseModel instance based on the update_data. """ diff --git a/vendor/tau2/utils/utils.py b/vendor/tau2/utils/utils.py index c1103fe9..33c9b511 100644 --- a/vendor/tau2/utils/utils.py +++ b/vendor/tau2/utils/utils.py @@ -29,9 +29,7 @@ # Check if data directory exists and is accessible if not DATA_DIR.exists(): logger.warning(f"Data directory does not exist: {DATA_DIR}") - logger.warning( - "Set TAU2_DATA_DIR environment variable to point to your data directory" - ) + logger.warning("Set TAU2_DATA_DIR environment variable to point to your data directory") logger.warning("Or ensure the data directory exists in the expected location") @@ -72,11 +70,7 @@ def get_commit_hash() -> str: Get the commit hash of the current directory. """ try: - commit_hash = ( - subprocess.check_output(["git", "rev-parse", "HEAD"], text=True) - .strip() - .split("\n")[0] - ) + commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], text=True).strip().split("\n")[0] except Exception as e: logger.error(f"Failed to get git hash: {e}") commit_hash = "unknown" diff --git a/versioneer.py b/versioneer.py index f8092765..4611fc7d 100644 --- a/versioneer.py +++ b/versioneer.py @@ -503,9 +503,7 @@ def run_command( return stdout, process.returncode -LONG_VERSION_PY[ - "git" -] = r''' +LONG_VERSION_PY["git"] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build From f05667c83e467516806731d8ba7fcb742069aded Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Sun, 17 Aug 2025 17:27:54 -0700 Subject: [PATCH 26/26] fix bq errors (#91) --- eval_protocol/adapters/bigquery.py | 20 ++- pyproject.toml | 4 + uv.lock | 190 ++++++++++++++++++++++++++++- 3 files changed, 210 insertions(+), 4 deletions(-) diff --git a/eval_protocol/adapters/bigquery.py b/eval_protocol/adapters/bigquery.py index 1275b1e1..7b79884b 100644 --- a/eval_protocol/adapters/bigquery.py +++ b/eval_protocol/adapters/bigquery.py @@ -4,8 +4,10 @@ to EvaluationRow format for use in evaluation pipelines. """ +from __future__ import annotations + import logging -from typing import Any, Callable, Dict, Iterator, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterator, List, Optional, Union from eval_protocol.models import CompletionParams, EvaluationRow, InputMetadata, Message @@ -20,7 +22,19 @@ BIGQUERY_AVAILABLE = True except ImportError: BIGQUERY_AVAILABLE = False - logger.warning("Google Cloud BigQuery not installed. Install with: pip install 'eval-protocol[bigquery]'") + # Optional dependency: avoid noisy warnings during import + logger.debug("Google Cloud BigQuery not installed. 
Optional feature disabled.") + +# Avoid importing BigQuery types at runtime for annotations when not installed +if TYPE_CHECKING: + from google.cloud import bigquery as _bigquery_type + + QueryParameterType = Union[ + _bigquery_type.ScalarQueryParameter, + _bigquery_type.ArrayQueryParameter, + ] +else: + QueryParameterType = Any # Type alias for transformation function TransformFunction = Callable[[Dict[str, Any]], Dict[str, Any]] @@ -96,7 +110,7 @@ def __init__( def get_evaluation_rows( self, query: str, - query_params: Optional[List[Union[bigquery.ScalarQueryParameter, bigquery.ArrayQueryParameter]]] = None, + query_params: Optional[List[QueryParameterType]] = None, limit: Optional[int] = None, offset: int = 0, model_name: str = "gpt-3.5-turbo", diff --git a/pyproject.toml b/pyproject.toml index 30380dd4..46b66d77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,6 +111,10 @@ adapters = [ "datasets>=2.0.0", "transformers>=4.0.0", ] +bigquery = [ + "google-cloud-bigquery>=3.0.0", + "google-auth>=2.0.0", +] svgbench = [ "selenium>=4.0.0", ] diff --git a/uv.lock b/uv.lock index e1e74645..6758d24c 100644 --- a/uv.lock +++ b/uv.lock @@ -538,6 +538,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/c2/80633736cd183ee4a62107413def345f7e6e3c01563dbca1417363cf957e/build-1.2.2.post1-py3-none-any.whl", hash = "sha256:1d61c0887fa860c01971625baae8bdd338e517b836a2f70dd1f7aa3a6b2fc5b5", size = 22950, upload-time = "2024-10-06T17:22:23.299Z" }, ] +[[package]] +name = "cachetools" +version = "5.5.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload-time = "2025-02-20T21:01:19.524Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, +] + [[package]] name = "certifi" version = "2025.7.14" @@ -1114,6 +1123,10 @@ adapters = [ { name = "langfuse" }, { name = "transformers" }, ] +bigquery = [ + { name = "google-auth" }, + { name = "google-cloud-bigquery" }, +] box2d = [ { name = "gymnasium", extra = ["box2d"] }, { name = "pillow" }, @@ -1195,6 +1208,8 @@ requires-dist = [ { name = "fastapi", specifier = ">=0.116.1" }, { name = "fireworks-ai", marker = "extra == 'fireworks'", specifier = ">=0.19.12" }, { name = "fsspec" }, + { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, + { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, { name = "gymnasium", specifier = ">=0.29.0" }, { name = "gymnasium", extras = ["box2d"], marker = "extra == 'box2d'", specifier = ">=0.29.0" }, { name = "haikus", marker = "extra == 'dev'", specifier = "==0.3.8" }, @@ -1251,7 +1266,7 @@ requires-dist = [ { name = "websockets", specifier = ">=15.0.1" }, { name = "werkzeug", marker = "extra == 'dev'", specifier = ">=2.0.0" }, ] -provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "svgbench"] +provides-extras = ["dev", "trl", "openevals", "fireworks", "box2d", "langfuse", "huggingface", "adapters", "bigquery", "svgbench"] [package.metadata.requires-dev] dev = [ @@ -1545,6 +1560,120 @@ http = [ { name = "aiohttp" }, ] +[[package]] +name = "google-api-core" +version = "2.25.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-auth" }, + { name = "googleapis-common-protos" }, + { name = "proto-plus" }, + { name = "protobuf" }, + { name = "requests" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = "2025-06-12T20:52:20.439Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" }, +] + +[package.optional-dependencies] +grpc = [ + { name = "grpcio" }, + { name = "grpcio-status" }, +] + +[[package]] +name = "google-auth" +version = "2.40.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cachetools" }, + { name = "pyasn1-modules" }, + { name = "rsa" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, +] + +[[package]] +name = "google-cloud-bigquery" +version = "3.35.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core", extra = ["grpc"] }, + { name = "google-auth" }, + { name = "google-cloud-core" }, + { name = "google-resumable-media" }, + { name = "packaging" }, + { name = "python-dateutil" }, + { name = "requests" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/44/e4/9cf03fa81fefd1b9811a7cd6e398804ae0de3b6a4edef810e2acd45cabbc/google_cloud_bigquery-3.35.1.tar.gz", hash = "sha256:599f26cacf190acfe88000f6cc5f4bc9e6baac7899e4f406ca054f1906f71960", size = 496433, upload-time = "2025-07-24T15:09:04.108Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/50/96fe9bc5b83d3a421e91ed8edc2535de45957e9af398273e3ecb5c3a1094/google_cloud_bigquery-3.35.1-py3-none-any.whl", hash = "sha256:6739a6ba63c6d80735ca2b34b1df2090ff473b80c1a62354caa2debe6dbbd961", size = 256877, upload-time = "2025-07-24T15:09:02.443Z" }, +] + +[[package]] +name = "google-cloud-core" +version = "2.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-api-core" }, + { name = "google-auth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/b8/2b53838d2acd6ec6168fd284a990c76695e84c65deee79c9f3a4276f6b4f/google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53", size = 35861, upload-time = "2025-03-10T21:05:38.948Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e", size = 29348, upload-time = "2025-03-10T21:05:37.785Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/eb/69/b1b05cf415df0d86691d6a8b4b7e60ab3a6fb6efb783ee5cd3ed1382bfd3/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:b07d48faf8292b4db7c3d64ab86f950c2e94e93a11fd47271c28ba458e4a0d76", size = 30467, upload-time = "2025-03-26T14:31:11.92Z" }, + { url = "https://files.pythonhosted.org/packages/44/3d/92f8928ecd671bd5b071756596971c79d252d09b835cdca5a44177fa87aa/google_crc32c-1.7.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:7cc81b3a2fbd932a4313eb53cc7d9dde424088ca3a0337160f35d91826880c1d", size = 30311, upload-time = "2025-03-26T14:53:14.161Z" }, + { url = "https://files.pythonhosted.org/packages/33/42/c2d15a73df79d45ed6b430b9e801d0bd8e28ac139a9012d7d58af50a385d/google_crc32c-1.7.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1c67ca0a1f5b56162951a9dae987988679a7db682d6f97ce0f6381ebf0fbea4c", size = 37889, upload-time = "2025-03-26T14:41:27.83Z" }, + { url = "https://files.pythonhosted.org/packages/57/ea/ac59c86a3c694afd117bb669bde32aaf17d0de4305d01d706495f09cbf19/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc5319db92daa516b653600794d5b9f9439a9a121f3e162f94b0e1891c7933cb", size = 33028, upload-time = "2025-03-26T14:41:29.141Z" }, + { url = "https://files.pythonhosted.org/packages/60/44/87e77e8476767a4a93f6cf271157c6d948eacec63688c093580af13b04be/google_crc32c-1.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcdf5a64adb747610140572ed18d011896e3b9ae5195f2514b7ff678c80f1603", size = 38026, upload-time = "2025-03-26T14:41:29.921Z" }, + { url = "https://files.pythonhosted.org/packages/c8/bf/21ac7bb305cd7c1a6de9c52f71db0868e104a5b573a4977cd9d0ff830f82/google_crc32c-1.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:754561c6c66e89d55754106739e22fdaa93fafa8da7221b29c8b8e8270c6ec8a", size = 33476, upload-time = "2025-03-26T14:29:09.086Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/94/220139ea87822b6fdfdab4fb9ba81b3fff7ea2c82e2af34adc726085bffc/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6fbab4b935989e2c3610371963ba1b86afb09537fd0c633049be82afe153ac06", size = 30468, upload-time = "2025-03-26T14:32:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/94/97/789b23bdeeb9d15dc2904660463ad539d0318286d7633fe2760c10ed0c1c/google_crc32c-1.7.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:ed66cbe1ed9cbaaad9392b5259b3eba4a9e565420d734e6238813c428c3336c9", size = 30313, upload-time = "2025-03-26T14:57:38.758Z" }, + { url = "https://files.pythonhosted.org/packages/81/b8/976a2b843610c211e7ccb3e248996a61e87dbb2c09b1499847e295080aec/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee6547b657621b6cbed3562ea7826c3e11cab01cd33b74e1f677690652883e77", size = 33048, upload-time = "2025-03-26T14:41:30.679Z" }, + { url = "https://files.pythonhosted.org/packages/c9/16/a3842c2cf591093b111d4a5e2bfb478ac6692d02f1b386d2a33283a19dc9/google_crc32c-1.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d68e17bad8f7dd9a49181a1f5a8f4b251c6dbc8cc96fb79f1d321dfd57d66f53", size = 32669, upload-time = "2025-03-26T14:41:31.432Z" }, + { url = "https://files.pythonhosted.org/packages/04/17/ed9aba495916fcf5fe4ecb2267ceb851fc5f273c4e4625ae453350cfd564/google_crc32c-1.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:6335de12921f06e1f774d0dd1fbea6bf610abe0887a1638f64d694013138be5d", size = 33476, upload-time = "2025-03-26T14:29:10.211Z" }, + { url = "https://files.pythonhosted.org/packages/dd/b7/787e2453cf8639c94b3d06c9d61f512234a82e1d12d13d18584bd3049904/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2d73a68a653c57281401871dd4aeebbb6af3191dcac751a76ce430df4d403194", size = 30470, upload-time = "2025-03-26T14:34:31.655Z" }, + { url = 
"https://files.pythonhosted.org/packages/ed/b4/6042c2b0cbac3ec3a69bb4c49b28d2f517b7a0f4a0232603c42c58e22b44/google_crc32c-1.7.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:22beacf83baaf59f9d3ab2bbb4db0fb018da8e5aebdce07ef9f09fce8220285e", size = 30315, upload-time = "2025-03-26T15:01:54.634Z" }, + { url = "https://files.pythonhosted.org/packages/29/ad/01e7a61a5d059bc57b702d9ff6a18b2585ad97f720bd0a0dbe215df1ab0e/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19eafa0e4af11b0a4eb3974483d55d2d77ad1911e6cf6f832e1574f6781fd337", size = 33180, upload-time = "2025-03-26T14:41:32.168Z" }, + { url = "https://files.pythonhosted.org/packages/3b/a5/7279055cf004561894ed3a7bfdf5bf90a53f28fadd01af7cd166e88ddf16/google_crc32c-1.7.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d86616faaea68101195c6bdc40c494e4d76f41e07a37ffdef270879c15fb65", size = 32794, upload-time = "2025-03-26T14:41:33.264Z" }, + { url = "https://files.pythonhosted.org/packages/0f/d6/77060dbd140c624e42ae3ece3df53b9d811000729a5c821b9fd671ceaac6/google_crc32c-1.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:b7491bdc0c7564fcf48c0179d2048ab2f7c7ba36b84ccd3a3e1c3f7a72d3bba6", size = 33477, upload-time = "2025-03-26T14:29:10.94Z" }, + { url = "https://files.pythonhosted.org/packages/8b/72/b8d785e9184ba6297a8620c8a37cf6e39b81a8ca01bb0796d7cbb28b3386/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:df8b38bdaf1629d62d51be8bdd04888f37c451564c2042d36e5812da9eff3c35", size = 30467, upload-time = "2025-03-26T14:36:06.909Z" }, + { url = "https://files.pythonhosted.org/packages/34/25/5f18076968212067c4e8ea95bf3b69669f9fc698476e5f5eb97d5b37999f/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:e42e20a83a29aa2709a0cf271c7f8aefaa23b7ab52e53b322585297bb94d4638", size = 30309, upload-time = "2025-03-26T15:06:15.318Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/83/9228fe65bf70e93e419f38bdf6c5ca5083fc6d32886ee79b450ceefd1dbd/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905a385140bf492ac300026717af339790921f411c0dfd9aa5a9e69a08ed32eb", size = 33133, upload-time = "2025-03-26T14:41:34.388Z" }, + { url = "https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6", size = 32773, upload-time = "2025-03-26T14:41:35.19Z" }, + { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, + { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, + { url = "https://files.pythonhosted.org/packages/0b/43/31e57ce04530794917dfe25243860ec141de9fadf4aa9783dffe7dac7c39/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8e9afc74168b0b2232fb32dd202c93e46b7d5e4bf03e66ba5dc273bb3559589", size = 28242, upload-time = 
"2025-03-26T14:41:42.858Z" }, + { url = "https://files.pythonhosted.org/packages/eb/f3/8b84cd4e0ad111e63e30eb89453f8dd308e3ad36f42305cf8c202461cdf0/google_crc32c-1.7.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa8136cc14dd27f34a3221c0f16fd42d8a40e4778273e61a3c19aedaa44daf6b", size = 28049, upload-time = "2025-03-26T14:41:44.651Z" }, + { url = "https://files.pythonhosted.org/packages/16/1b/1693372bf423ada422f80fd88260dbfd140754adb15cbc4d7e9a68b1cb8e/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85fef7fae11494e747c9fd1359a527e5970fc9603c90764843caabd3a16a0a48", size = 28241, upload-time = "2025-03-26T14:41:45.898Z" }, + { url = "https://files.pythonhosted.org/packages/fd/3c/2a19a60a473de48717b4efb19398c3f914795b64a96cf3fbe82588044f78/google_crc32c-1.7.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6efb97eb4369d52593ad6f75e7e10d053cf00c48983f7a973105bc70b0ac4d82", size = 28048, upload-time = "2025-03-26T14:41:46.696Z" }, +] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "google-crc32c" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099, upload-time = "2024-08-07T22:20:38.555Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251, upload-time = "2024-08-07T22:20:36.409Z" }, +] + [[package]] name = "googleapis-common-protos" version = "1.70.0" @@ -1656,6 +1785,20 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/34/80/de3eb55eb581815342d097214bed4c59e806b05f1b3110df03b2280d6dfd/grpcio-1.74.0-cp313-cp313-win_amd64.whl", hash = "sha256:fd3c71aeee838299c5887230b8a1822795325ddfea635edd82954c1eaa831e24", size = 4489214, upload-time = "2025-07-24T18:53:59.771Z" }, ] +[[package]] +name = "grpcio-status" +version = "1.71.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fd/d1/b6e9877fedae3add1afdeae1f89d1927d296da9cf977eca0eb08fb8a460e/grpcio_status-1.71.2.tar.gz", hash = "sha256:c7a97e176df71cdc2c179cd1847d7fc86cca5832ad12e9798d7fed6b7a1aab50", size = 13677, upload-time = "2025-06-28T04:24:05.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/58/317b0134129b556a93a3b0afe00ee675b5657f0155509e22fcb853bafe2d/grpcio_status-1.71.2-py3-none-any.whl", hash = "sha256:803c98cb6a8b7dc6dbb785b1111aed739f241ab5e9da0bba96888aa74704cfd3", size = 14424, upload-time = "2025-06-28T04:23:42.136Z" }, +] + [[package]] name = "grpclib" version = "0.4.8" @@ -4070,6 +4213,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload-time = "2025-06-09T22:56:04.484Z" }, ] +[[package]] +name = "proto-plus" +version = "1.26.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, +] + [[package]] name = "protobuf" version = "5.29.3" @@ -4160,6 +4315,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, ] +[[package]] +name = "pyasn1" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, +] + +[[package]] +name = "pyasn1-modules" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = 
"sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, +] + [[package]] name = "pycares" version = "4.9.0" @@ -5178,6 +5354,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/ed/9de62c2150ca8e2e5858acf3f4f4d0d180a38feef9fdab4078bea63d8dba/rpds_py-0.26.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:e99685fc95d386da368013e7fb4269dd39c30d99f812a8372d62f244f662709c", size = 555334, upload-time = "2025-07-01T15:56:51.703Z" }, ] +[[package]] +name = "rsa" +version = "4.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyasn1" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, +] + [[package]] name = "ruff" version = "0.9.10"