From 28d7d35bb3519b14168b7c12fe6f112febadc058 Mon Sep 17 00:00:00 2001 From: ricknie Date: Wed, 6 May 2026 16:15:34 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20eval=E6=A8=A1=E5=9D=97=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=B8=8D=E5=90=8C=E5=A4=A7=E6=A8=A1=E5=9E=8B=E8=AF=84?= =?UTF-8?q?=E4=BC=B0=E5=90=8C=E4=B8=80metric?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TAPD: --story=134052565 --- docs/mkdocs/en/evaluation.md | 133 ++++- docs/mkdocs/zh/evaluation.md | 135 ++++- tests/evaluation/test_eval_result.py | 80 +++ tests/evaluation/test_llm_criterion.py | 162 +++++- .../evaluation/test_llm_evaluator_registry.py | 43 ++ .../test_llm_judge_models_aggregator.py | 198 +++++++ .../evaluation/test_llm_judge_multi_model.py | 354 +++++++++++++ tests/evaluation/test_llm_judge_think.py | 393 ++++++++++++++ trpc_agent_sdk/evaluation/__init__.py | 28 + trpc_agent_sdk/evaluation/_agent_evaluator.py | 171 +++++- trpc_agent_sdk/evaluation/_eval_case.py | 22 +- trpc_agent_sdk/evaluation/_eval_result.py | 28 + trpc_agent_sdk/evaluation/_evaluator_base.py | 17 + .../evaluation/_evaluator_registry.py | 13 + .../evaluation/_final_response_evaluator.py | 2 + trpc_agent_sdk/evaluation/_llm_criterion.py | 77 +++ trpc_agent_sdk/evaluation/_llm_evaluator.py | 30 ++ trpc_agent_sdk/evaluation/_llm_judge.py | 486 ++++++++++++++++-- .../evaluation/_local_eval_service.py | 104 +++- trpc_agent_sdk/evaluation/_rouge_evaluator.py | 2 + .../evaluation/_trajectory_evaluator.py | 2 + 21 files changed, 2398 insertions(+), 82 deletions(-) create mode 100644 tests/evaluation/test_llm_judge_models_aggregator.py create mode 100644 tests/evaluation/test_llm_judge_multi_model.py create mode 100644 tests/evaluation/test_llm_judge_think.py diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index 0245ced..ca51a85 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -425,6 +425,8 @@ The eval configuration describes "how to judge." This section teaches you how to `test_config.json` must be placed in the **same directory** as the eval set file (`.evalset.json` / `.test.json`); the framework loads it automatically. +> **Advanced**: If you want **multiple eval sets to share a single configuration** (e.g., centralizing all metric definitions in one JSON), pass `eval_metrics_file_path_or_dir` at call time to bypass the same-directory convention. See [Shared Configuration: `eval_metrics_file_path_or_dir`](#shared-configuration-eval_metrics_file_path_or_dir). + #### Structure Definition **EvalConfig** (parsed from `test_config.json`) @@ -718,7 +720,10 @@ Compare using text "contains" with case-insensitivity (common when the final res | Field | Type | Description | | --- | --- | --- | -| judge_model | object | Judge model configuration (JudgeModelOptions); required | +| judge_model | object | Judge model configuration (JudgeModelOptions); required when `judge_models` is not set | +| judge_models | array | Multi-model judge list (JudgeModelOptions items); mutually exclusive with `judge_model`. Cross-model results are combined by `models_aggregator` | +| models_aggregator | string | Cross-model aggregation strategy. Built-in: `all_pass` (default) / `any_pass` / `majority_pass` / `avg` / `weighted_avg` / `weighted_majority`. 
Custom names must be registered via `LLM_EVALUATOR_REGISTRY.register_models_aggregator` before evaluation | +| parallel | boolean | Whether to run the multiple judge models concurrently; default `true` | | rubrics | array | Rubric list; required for llm_rubric_response and llm_rubric_knowledge_recall | | knowledge_tool_names | array | List of knowledge retrieval tool names; used by llm_rubric_knowledge_recall, default `["knowledge_search"]` | @@ -730,7 +735,9 @@ Compare using text "contains" with case-insensitivity (common when the final res | api_key | string | API key | | base_url | string | Optional, custom endpoint | | num_samples | number | Number of judge samples per turn; default 1 | -| generation_config | object | Generation parameters (max_tokens, temperature, etc.) | +| weight | number | Per-model weight used by `weighted_avg` / `weighted_majority` aggregators; default 1.0 | +| think | boolean | Controls the judge model's thinking mode. `false`: disable thinking (sets both `thinking_config.thinking_budget=0` and `chat_template_kwargs.enable_thinking=false`). `true`: enable thinking with automatic budget (`include_thoughts=true`). Unset (default): keep the model default. Recommended `false` for judge models to save tokens and latency | +| generation_config | object | Generation parameters (max_tokens, temperature, etc.; may also explicitly set `thinking_config` / `http_options`; the `think` field overrides them) | **Rubric** (items in the rubrics array) @@ -812,6 +819,71 @@ LLM response quality with rubrics (llm_rubric_response or llm_rubric_knowledge_r It is recommended to use environment variable placeholders for `api_key` and `base_url` (e.g., `${TRPC_AGENT_API_KEY}`), which are replaced by the execution environment, to avoid writing plaintext in configuration files. +**Multi-model judge (cross-model aggregation)** + +A single LLM-judge metric may use multiple judge models simultaneously and combine their verdicts via `models_aggregator`. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). + +Built-in aggregators: + +| Name | Pass rule | Overall score | +| --- | --- | --- | +| `all_pass` (default) | all models pass | min of per-model scores | +| `any_pass` | any model passes | max of per-model scores | +| `majority_pass` | strict majority passes (`passed*2 > total`) | `passed_count / total` | +| `avg` | mean ≥ threshold | mean of per-model scores | +| `weighted_avg` | weighted mean ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | + +If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. 
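+
+Per-model verdicts are exposed on the result object. The snippet below is an illustrative sketch only, assuming `result` is the `EvaluationResult` returned by the evaluator; the field names follow `PerInvocationResult` / `NamedScoreResult` as described above:
+
+```python
+# Illustrative only: read the per-model breakdown after a multi-model judge run.
+# `result` is assumed to be the EvaluationResult returned by the LLM-judge evaluator.
+for per in result.per_invocation_results:
+    if per.per_model_scores is None:  # single judge_model runs carry no breakdown
+        continue
+    for named in per.per_model_scores:
+        print(named.model_name, named.passed, named.score, named.reason)
+```
+
+A criterion that runs two weighted judge models and aggregates them with `weighted_avg`:
+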
+ +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +`parallel` controls how multiple judge models are executed: `true` (default) calls all models concurrently, with latency bounded by the slowest model; `false` calls them sequentially in the declared order. Only takes effect when `judge_models` contains more than one model. + +If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is a structured JSON, thinking traces add no value to the final verdict, and disabling thinking significantly reduces token cost and latency. Each judge model has its own independent `think` flag. + +Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name written in the criterion: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + # per_model: list[ScoreResult]; weights: list[float] + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Custom Criteria To fully customize the "whether it matches" logic in code, you can register a matching function with `CRITERION_REGISTRY` before running the evaluation. Supported types for registration are `TOOL_TRAJECTORY` and `FINAL_RESPONSE`; once registered, comparisons of that type will invoke your provided function `(actual, expected) -> bool`, bypassing the built-in criteria from the configuration file. @@ -1764,6 +1836,16 @@ In default mode, the eval service actually calls the Agent for inference. If you Replaying existing conversations, offline evaluation, or avoiding repeated Agent and model calls when debugging evaluation flows. +**`agent_module` is optional** + +`agent_module` tells the framework where to load the Agent from, so it can call the Agent for inference during evaluation. Trace mode no longer calls the Agent, so when **every case in the eval set is in trace mode**, `AgentEvaluator.evaluate()` / `get_executer()` no longer needs `agent_module` and you can simply omit it: + +```python +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir=trace_only_eval_set_path, +) +``` + **Example**: A Trace mode case in the eval set ```json @@ -2160,6 +2242,53 @@ async def test_evaluate_with_custom_runner(): For the complete example, see [examples/evaluation/custom_runner/](../../../examples/evaluation/custom_runner/). +#### Shared Configuration (eval_metrics_file_path_or_dir) + +By default, every eval set needs a `test_config.json` placed in its **own directory**, which the framework loads automatically. When multiple eval sets need to use the same metrics and thresholds, copying `test_config.json` into every directory is redundant and prone to drift. You can extract the config into a single shared location and point to it via `eval_metrics_file_path_or_dir` at call time. 
The framework will then **ignore the same-directory convention** and apply this shared config to every eval set. + +**Comparison**: in the default layout each eval set needs its own `test_config.json`; in the shared layout there is only one. + +``` +## Default (same-directory) ## Shared (eval_metrics_file_path_or_dir) +project/ project/ +└── eval_data/ ├── shared_metrics.json ← shared config + ├── weather/ └── eval_data/ + │ ├── weather.evalset.json ├── weather/weather.evalset.json + │ └── test_config.json ├── booking/booking.evalset.json + └── booking/ └── search/search.evalset.json + ├── booking.evalset.json + └── test_config.json +``` + +**Configuration** + +Pass `eval_metrics_file_path_or_dir` to `AgentEvaluator.evaluate()` / `get_executer()`: + +- A **file path** (`.json`): loaded directly as the shared configuration; +- A **directory path**: the framework looks up `*.json` in that directory **non-recursively**, and exactly one must be present; otherwise it raises `FileNotFoundError` (zero matches) or `ValueError` (more than one); +- Omitted or `None`: keep the default behavior—load `test_config.json` from each eval set's own directory. + +**Applicable Scenarios** + +Multiple eval sets sharing the same metrics and thresholds; switching thresholds per environment (dev / staging / prod) in CI; eval sets generated by other tools (WebUI, log replayers, etc.) where maintaining a per-directory `test_config.json` is inconvenient. + +**Example**: point all eval sets in the right-hand layout above to `shared_metrics.json` + +```python +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + +@pytest.mark.asyncio +async def test_with_shared_metrics(): + project_dir = os.path.dirname(os.path.abspath(__file__)) + await AgentEvaluator.evaluate( + agent_module="agent", + eval_dataset_file_path_or_dir=os.path.join(project_dir, "eval_data"), + eval_metrics_file_path_or_dir=os.path.join(project_dir, "shared_metrics.json"), + ) +``` + ## Using WebUI for Agent Evaluation diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 07f7fba..564e0e0 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -305,7 +305,7 @@ pytest test_quickstart.py -v --tb=short -s AgentEvaluator 是整个评测流程的入口和编排者: -1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent。 +1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent(若整集为 [Trace 模式](#trace-模式),此步可省略)。 2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager,创建 LocalEvalService(依赖该 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。默认使用 StaticUserSimulator,按 conversation 的 user_content 驱动推理。可选注入 LocalEvalSetResultsManager 将运行结果写入目录。 3. **推理阶段**:评估服务按 EvalSet 中的用例与 conversation 驱动 Runner 推理,得到实际 Invocation 列表(实际工具调用、实际回复)。 4. 
**打分阶段**:评估服务根据 EvalConfig 中的 EvalMetric 列表,从 EvaluatorRegistry 获取各评估器,对实际与预期逐项打分并汇总为 EvalCaseResult。 @@ -422,6 +422,8 @@ Trace 模式的配置详见[高级功能 - Trace 模式](#trace-模式)。 `test_config.json` 必须放在评测集文件(`.evalset.json` / `.test.json`)的**同目录**下,框架会自动加载。 +> **进阶**:若希望**多个评测集共用同一份配置**(例如把所有指标定义集中到一个 JSON),可在调用时传入 `eval_metrics_file_path_or_dir`,跳过同目录约定。详见[共享配置:`eval_metrics_file_path_or_dir`](#共享配置eval_metrics_file_path_or_dir)。 + #### 结构定义 **EvalConfig**(由 `test_config.json` 解析) @@ -715,7 +717,10 @@ Criterion 定义了"怎样算匹配"——实际输出和预期输出之间用 | 字段 | 类型 | 说明 | | --- | --- | --- | -| judge_model | object | 评判模型配置(JudgeModelOptions);必填 | +| judge_model | object | 评判模型配置(JudgeModelOptions);未设置 `judge_models` 时必填 | +| judge_models | array | 多裁判模型列表(JudgeModelOptions 项),与 `judge_model` 互斥;跨模型结果通过 `models_aggregator` 聚合 | +| models_aggregator | string | 跨模型聚合策略。内置:`all_pass`(默认)/ `any_pass` / `majority_pass` / `avg` / `weighted_avg` / `weighted_majority`。自定义名称须在评估前通过 `LLM_EVALUATOR_REGISTRY.register_models_aggregator` 注册 | +| parallel | boolean | 多裁判模型是否并发执行;默认 `true` | | rubrics | array | Rubric 列表;llm_rubric_response 与 llm_rubric_knowledge_recall 需要 | | knowledge_tool_names | array | 知识检索工具名列表;llm_rubric_knowledge_recall 使用,默认`["knowledge_search"]` | @@ -727,7 +732,9 @@ Criterion 定义了"怎样算匹配"——实际输出和预期输出之间用 | api_key | string | API 密钥 | | base_url | string | 可选,自定义端点 | | num_samples | number | 每轮评判采样数;默认 1 | -| generation_config | object | 生成参数(max_tokens、temperature 等) | +| weight | number | 单模型权重,供 `weighted_avg` / `weighted_majority` 聚合器使用;默认 1.0 | +| think | boolean | 控制裁判模型的思考模式:`false` 关闭思考(同时设 `thinking_config.thinking_budget=0` 与 `chat_template_kwargs.enable_thinking=false`);`true` 显式开启(`include_thoughts=true`,自动预算);不设(默认)则保持模型默认。建议 judge 模型设 `false` 节省 token 与延时 | +| generation_config | object | 生成参数(max_tokens、temperature 等;也可显式写 `thinking_config` / `http_options`,`think` 字段优先) | **Rubric**(rubrics 数组项) @@ -809,6 +816,71 @@ LLM 最终响应评判(仅需 judge_model): 建议 `api_key`、`base_url` 用环境变量占位(如 `${TRPC_AGENT_API_KEY}`),由执行环境替换,避免明文写入配置文件。 +**多裁判模型(跨模型聚合)** + +同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 + +内置聚合器: + +| 名称 | 通过规则 | 总分 | +| --- | --- | --- | +| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | +| `any_pass` | 任一模型通过 | 各模型得分的最大值 | +| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | +| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | +| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | + +若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,耗时取决于最慢的模型;`false` 按声明顺序串行调用。仅在 `judge_models` 有多个模型时生效。 + +若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。每个裁判模型的 `think` 独立设置。 + +也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: + +```python +from trpc_agent_sdk.evaluation import 
LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + # per_model: list[ScoreResult];weights: list[float] + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### 自定义准则 若要在代码里完全自定义"是否匹配"的逻辑,可在评估运行前向 `CRITERION_REGISTRY` 注册一个匹配函数。支持注册的类型为 `TOOL_TRAJECTORY`、`FINAL_RESPONSE`;注册后,该类型在比较时会调用你提供的函数 `(actual, expected) -> bool`,不再使用配置文件中的内置准则。 @@ -1761,6 +1833,16 @@ async def test_pass_at_k(): 回放已有对话、离线评估、或调试评估流程时避免重复调用 Agent 与模型。 +**`agent_module` 可省略** + +`agent_module` 用来告诉框架去哪里加载 Agent,以便评测时调用它做推理。Trace 模式不再调用 Agent,因此当**评测集里所有用例都是 trace 模式**时,`AgentEvaluator.evaluate()` / `get_executer()` 不再需要 `agent_module`,可以直接省略: + +```python +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir=trace_only_eval_set_path, +) +``` + **示例**:evalset 中一个 Trace 模式用例 ```json @@ -2157,6 +2239,53 @@ async def test_evaluate_with_custom_runner(): 完整示例见 [examples/evaluation/custom_runner/](../../../examples/evaluation/custom_runner/)。 +#### 共享配置(eval_metrics_file_path_or_dir) + +默认情况下,每个评测集都需要在**同目录**放一份 `test_config.json`,框架按目录就近加载。当多个评测集要使用同一套指标与阈值时,逐个目录复制 `test_config.json` 既冗余也容易漂移。此时可以把配置抽到一处共享,调用时通过 `eval_metrics_file_path_or_dir` 指定,框架将**忽略同目录约定**,让所有评测集都使用这份共享配置。 + +**对比**:默认布局每个评测集旁都要带一份 `test_config.json`;共享布局只保留一份。 + +``` +## 默认(同目录约定) ## 共享(eval_metrics_file_path_or_dir) +project/ project/ +└── eval_data/ ├── shared_metrics.json ← 共享配置 + ├── weather/ └── eval_data/ + │ ├── weather.evalset.json ├── weather/weather.evalset.json + │ └── test_config.json ├── booking/booking.evalset.json + └── booking/ └── search/search.evalset.json + ├── booking.evalset.json + └── test_config.json +``` + +**配置方式** + +在 `AgentEvaluator.evaluate()` / `get_executer()` 中传入 `eval_metrics_file_path_or_dir`: + +- 传**文件路径**(`.json`):直接作为共享配置加载; +- 传**目录路径**:在该目录下**非递归**查找 `*.json`,且必须恰好有一份,否则抛 `FileNotFoundError`(0 份)或 `ValueError`(多于 1 份); +- 不传或传 `None`:保持默认,按各评测集同目录的 `test_config.json` 加载。 + +**适用场景** + +多个评测集共用同一套指标与阈值;CI 中按环境(dev / staging / prod)切换不同阈值;评测集由其他工具(WebUI、日志回放器等)生成、不便逐个维护 `test_config.json`。 + +**示例**:将上图右侧布局的所有评测集统一指向 `shared_metrics.json` + +```python +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + +@pytest.mark.asyncio +async def test_with_shared_metrics(): + project_dir = os.path.dirname(os.path.abspath(__file__)) + await AgentEvaluator.evaluate( + agent_module="agent", + eval_dataset_file_path_or_dir=os.path.join(project_dir, "eval_data"), + eval_metrics_file_path_or_dir=os.path.join(project_dir, "shared_metrics.json"), + ) +``` + ## 使用 WebUI 进行 Agent 评测 diff --git a/tests/evaluation/test_eval_result.py b/tests/evaluation/test_eval_result.py index 6d79b48..429de39 100644 --- a/tests/evaluation/test_eval_result.py +++ b/tests/evaluation/test_eval_result.py @@ -135,3 +135,83 @@ def test_evaluate_result_empty(self): """Test empty EvaluateResult.""" r = EvaluateResult() assert r.results_by_eval_set_id == {} + + +class TestNamedScoreResult: + """Test suite for NamedScoreResult.""" + + def test_minimal_construction(self): + """Test NamedScoreResult with minimal fields uses defaults.""" + from trpc_agent_sdk.evaluation import NamedScoreResult + + n = NamedScoreResult(model_name="glm-4.7", score=1.0, passed=True) + assert n.model_name == "glm-4.7" + assert n.provider_name == "" + assert n.score == 1.0 + assert n.reason == "" + assert n.rubric_scores == [] + assert 
n.passed is True + + def test_full_construction_and_serialization(self): + """Test all fields round-trip through JSON serialization.""" + from trpc_agent_sdk.evaluation import NamedScoreResult + from trpc_agent_sdk.evaluation import RubricScore + + n = NamedScoreResult( + model_name="gpt-4o", + provider_name="openai", + score=0.5, + reason="half passed", + rubric_scores=[RubricScore(id="r1", reason="ok", score=1.0)], + passed=False, + ) + data = n.model_dump() + assert data["model_name"] == "gpt-4o" + assert data["provider_name"] == "openai" + assert data["passed"] is False + assert data["rubric_scores"][0]["id"] == "r1" + + +class TestPerInvocationResultPerModelScores: + """Test suite for PerInvocationResult.per_model_scores backward compatibility.""" + + def test_default_is_none(self): + """Test per_model_scores defaults to None for old code paths.""" + from unittest.mock import Mock + + from trpc_agent_sdk.evaluation import EvalStatus + from trpc_agent_sdk.evaluation import Invocation + from trpc_agent_sdk.evaluation import PerInvocationResult + + inv = Mock(spec=Invocation) + r = PerInvocationResult( + actual_invocation=inv, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + assert r.per_model_scores is None + + def test_per_model_scores_populated(self): + """Test per_model_scores accepts list of NamedScoreResult.""" + from unittest.mock import Mock + + from trpc_agent_sdk.evaluation import EvalStatus + from trpc_agent_sdk.evaluation import Invocation + from trpc_agent_sdk.evaluation import NamedScoreResult + from trpc_agent_sdk.evaluation import PerInvocationResult + + inv = Mock(spec=Invocation) + per_model = [ + NamedScoreResult(model_name="m1", score=1.0, passed=True), + NamedScoreResult(model_name="m2", score=0.0, passed=False), + ] + r = PerInvocationResult( + actual_invocation=inv, + score=0.0, + eval_status=EvalStatus.FAILED, + per_model_scores=per_model, + ) + assert r.per_model_scores is not None + assert len(r.per_model_scores) == 2 + assert r.per_model_scores[0].model_name == "m1" + assert r.per_model_scores[1].passed is False diff --git a/tests/evaluation/test_llm_criterion.py b/tests/evaluation/test_llm_criterion.py index 34a80f9..e985866 100644 --- a/tests/evaluation/test_llm_criterion.py +++ b/tests/evaluation/test_llm_criterion.py @@ -52,7 +52,10 @@ def test_strips_api_key_camel_case(self): """Test apiKey is stripped when key is judgeModel.""" c = { "llmJudge": { - "judgeModel": {"model_name": "glm-4", "apiKey": "secret"}, + "judgeModel": { + "model_name": "glm-4", + "apiKey": "secret" + }, }, } out = sanitize_criterion_for_export(c) @@ -114,9 +117,18 @@ def test_from_dict_empty(self): def test_from_dict_snake_case(self): """Test from_dict with snake_case keys.""" d = { - "judge_model": {"model_name": "glm-4", "num_samples": 2}, + "judge_model": { + "model_name": "glm-4", + "num_samples": 2 + }, "rubrics": [ - {"id": "1", "content": {"text": "Must be relevant."}, "description": "Relevance"}, + { + "id": "1", + "content": { + "text": "Must be relevant." 
+ }, + "description": "Relevance" + }, ], } c = LLMJudgeCriterion.from_dict(d) @@ -159,7 +171,9 @@ def test_metric_with_llm_judge(self): threshold=1.0, criterion={ "llm_judge": { - "judge_model": {"model_name": "glm-4"}, + "judge_model": { + "model_name": "glm-4" + }, }, }, ) @@ -175,7 +189,9 @@ def test_metric_with_llm_judge_camel_case(self): threshold=1.0, criterion={ "llmJudge": { - "judgeModel": {"model_name": "glm-4"}, + "judgeModel": { + "model_name": "glm-4" + }, "rubrics": [], }, }, @@ -183,3 +199,139 @@ def test_metric_with_llm_judge_camel_case(self): c = get_llm_criterion_from_metric(m) assert c is not None assert c.judge_model.model_name == "glm-4" + + +class TestJudgeModelOptionsWeight: + """Test suite for JudgeModelOptions.weight.""" + + def test_weight_default_is_one(self): + """Test weight defaults to 1.0 when omitted.""" + opts = JudgeModelOptions(model_name="m") + assert opts.weight == 1.0 + + def test_weight_custom_value(self): + """Test weight accepts custom float.""" + opts = JudgeModelOptions(model_name="m", weight=2.5) + assert opts.weight == 2.5 + + +class TestLLMJudgeCriterionMultiModel: + """Test suite for LLMJudgeCriterion multi-model fields and validation.""" + + def test_default_models_aggregator_and_parallel(self): + """Test defaults: models_aggregator='all_pass', parallel=True.""" + c = LLMJudgeCriterion(judge_model=JudgeModelOptions(model_name="m")) + assert c.models_aggregator == "all_pass" + assert c.parallel is True + + def test_get_judge_models_normalizes_singular(self): + """Test get_judge_models() returns 1-element list when only judge_model is set.""" + c = LLMJudgeCriterion(judge_model=JudgeModelOptions(model_name="m1")) + models = c.get_judge_models() + assert len(models) == 1 + assert models[0].model_name == "m1" + + def test_get_judge_models_returns_list_directly(self): + """Test get_judge_models() returns judge_models when set.""" + c = LLMJudgeCriterion(judge_models=[ + JudgeModelOptions(model_name="m1"), + JudgeModelOptions(model_name="m2"), + ], ) + models = c.get_judge_models() + assert [m.model_name for m in models] == ["m1", "m2"] + + def test_get_judge_models_empty_when_neither_set(self): + """Test get_judge_models() returns [] when neither field set (allowed at criterion level).""" + c = LLMJudgeCriterion() + assert c.get_judge_models() == [] + + def test_validate_judge_model_and_judge_models_mutually_exclusive(self): + """Test setting both judge_model and judge_models raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="judge_model.*judge_models"): + LLMJudgeCriterion( + judge_model=JudgeModelOptions(model_name="m1"), + judge_models=[JudgeModelOptions(model_name="m2")], + ) + + def test_validate_empty_judge_models_raises(self): + """Test empty judge_models list raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="judge_models.*empty"): + LLMJudgeCriterion(judge_models=[]) + + def test_validate_negative_weight_raises(self): + """Test any negative weight raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="weight.*negative"): + LLMJudgeCriterion(judge_models=[ + JudgeModelOptions(model_name="m1", weight=1.0), + JudgeModelOptions(model_name="m2", weight=-0.5), + ], ) + + def test_validate_weighted_aggregator_zero_total_weight_raises(self): + """Test weighted_avg with all-zero weights raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="weight"): + LLMJudgeCriterion( + judge_models=[ + 
JudgeModelOptions(model_name="m1", weight=0.0), + JudgeModelOptions(model_name="m2", weight=0.0), + ], + models_aggregator="weighted_avg", + ) + + def test_built_in_aggregator_names_accepted(self): + """Test all 6 built-in aggregator names pass validation.""" + for name in ("all_pass", "any_pass", "majority_pass", "avg", "weighted_avg", "weighted_majority"): + c = LLMJudgeCriterion( + judge_models=[JudgeModelOptions(model_name="m", weight=1.0)], + models_aggregator=name, + ) + assert c.models_aggregator == name + + def test_validate_models_aggregator_must_be_non_empty_string(self): + """Test empty models_aggregator string raises ValueError at criterion level.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="models_aggregator.*non-empty"): + LLMJudgeCriterion( + judge_model=JudgeModelOptions(model_name="m"), + models_aggregator="", + ) + + def test_from_dict_with_judge_models(self): + """Test from_dict accepts judge_models list and models_aggregator string.""" + c = LLMJudgeCriterion.from_dict({ + "judge_models": [ + { + "model_name": "m1", + "weight": 2.0 + }, + { + "model_name": "m2", + "weight": 1.0 + }, + ], + "models_aggregator": + "weighted_avg", + "parallel": + False, + }) + assert c is not None + assert len(c.judge_models) == 2 + assert c.judge_models[0].weight == 2.0 + assert c.models_aggregator == "weighted_avg" + assert c.parallel is False + + def test_from_dict_legacy_judge_model_still_works(self): + """Test from_dict still works with legacy single judge_model (back compat).""" + c = LLMJudgeCriterion.from_dict({ + "judge_model": { + "model_name": "glm-4" + }, + }) + assert c is not None + assert c.judge_model.model_name == "glm-4" + assert c.judge_models is None + assert c.models_aggregator == "all_pass" + assert c.parallel is True diff --git a/tests/evaluation/test_llm_evaluator_registry.py b/tests/evaluation/test_llm_evaluator_registry.py index 6e76781..ded926c 100644 --- a/tests/evaluation/test_llm_evaluator_registry.py +++ b/tests/evaluation/test_llm_evaluator_registry.py @@ -59,3 +59,46 @@ def test_llm_metric_names_contains_expected(self): assert "llm_rubric_response" in LLM_METRIC_NAMES assert "llm_rubric_knowledge_recall" in LLM_METRIC_NAMES assert len(LLM_METRIC_NAMES) == 3 + + +class TestModelsAggregatorRegistry: + """Test suite for register_models_aggregator on LLMEvaluatorRegistry.""" + + @pytest.fixture + def registry(self): + from trpc_agent_sdk.evaluation import LLMEvaluatorRegistry + return LLMEvaluatorRegistry() + + def test_register_and_get(self, registry): + """Test register_models_aggregator + get_models_aggregator round-trip.""" + from trpc_agent_sdk.evaluation import ScoreResult + + def custom(per_model, threshold, weights): + return ScoreResult(score=1.0, reason="always pass") + + registry.register_models_aggregator("llm_final_response", custom) + agg = registry.get_models_aggregator("llm_final_response") + assert agg is not None + out = agg.aggregate_models([ScoreResult(score=0.0)], 0.5, [1.0]) + assert out.score == 1.0 + + def test_register_invalid_metric_raises(self, registry): + """Test register_models_aggregator with non-LLM metric raises ValueError.""" + with pytest.raises(ValueError, match="must be one of"): + registry.register_models_aggregator("rouge_score", lambda *a, **k: None) + + def test_get_unregistered_returns_none(self, registry): + """Test get_models_aggregator returns None when not set.""" + assert registry.get_models_aggregator("llm_final_response") is None + + def test_unregister(self, registry): + """Test 
unregister_models_aggregator removes the registration.""" + from trpc_agent_sdk.evaluation import ScoreResult + + def custom(per_model, threshold, weights): + return ScoreResult(score=1.0) + + registry.register_models_aggregator("llm_rubric_response", custom) + assert registry.get_models_aggregator("llm_rubric_response") is not None + registry.unregister_models_aggregator("llm_rubric_response") + assert registry.get_models_aggregator("llm_rubric_response") is None diff --git a/tests/evaluation/test_llm_judge_models_aggregator.py b/tests/evaluation/test_llm_judge_models_aggregator.py new file mode 100644 index 0000000..d9cd633 --- /dev/null +++ b/tests/evaluation/test_llm_judge_models_aggregator.py @@ -0,0 +1,198 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Unit tests for built-in ModelsAggregator strategies in _llm_judge.""" + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import ScoreResult +from trpc_agent_sdk.evaluation._llm_judge import AllPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import AnyPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import AverageModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import MajorityPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import WeightedAverageModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import WeightedMajorityModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import get_builtin_models_aggregator + + +class TestAllPassModelsAggregator: + + def test_empty_per_model_raises(self): + agg = AllPassModelsAggregator() + with pytest.raises(ValueError): + agg.aggregate_models([], threshold=0.5, weights=[]) + + def test_all_above_threshold_returns_min(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.8)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.8) + + def test_one_below_threshold_returns_min(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == 0.0 + + def test_single_model_returns_its_score(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=0.7)], + threshold=0.5, + weights=[1.0], + ) + assert out.score == pytest.approx(0.7) + + +class TestAnyPassModelsAggregator: + + def test_one_above_returns_max(self): + agg = AnyPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(1.0) + + def test_all_below_returns_max_still_below(self): + agg = AnyPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=0.1), ScoreResult(score=0.2)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.2) + + +class TestMajorityPassModelsAggregator: + + def test_strict_majority_passes(self): + agg = MajorityPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=1.0), + ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0, 1.0], + ) + assert out.score == pytest.approx(2 / 3) + + def test_tie_returns_half(self): + agg = MajorityPassModelsAggregator() + out = agg.aggregate_models( + 
[ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestAverageModelsAggregator: + + def test_average_score(self): + agg = AverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestWeightedAverageModelsAggregator: + + def test_weighted_mean(self): + agg = WeightedAverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[2.0, 1.0], + ) + assert out.score == pytest.approx(2.0 / 3.0) + + def test_zero_weight_total_returns_zero(self): + agg = WeightedAverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=1.0)], + threshold=0.5, + weights=[0.0, 0.0], + ) + assert out.score == 0.0 + + +class TestWeightedMajorityModelsAggregator: + + def test_weighted_majority_passes(self): + agg = WeightedMajorityModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[2.0, 1.0], + ) + assert out.score == pytest.approx(2.0 / 3.0) + + def test_weighted_majority_tie_returns_half(self): + agg = WeightedMajorityModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestSingleModelEquivalence: + + @pytest.mark.parametrize("agg_cls", [ + AllPassModelsAggregator, + AnyPassModelsAggregator, + AverageModelsAggregator, + WeightedAverageModelsAggregator, + ]) + def test_n1_continuous_score_preserved(self, agg_cls): + agg = agg_cls() + out = agg.aggregate_models( + [ScoreResult(score=0.7)], + threshold=0.5, + weights=[1.0], + ) + assert out.score == pytest.approx(0.7) + + @pytest.mark.parametrize("agg_cls", [ + MajorityPassModelsAggregator, + WeightedMajorityModelsAggregator, + ]) + def test_n1_majority_passes_and_fails(self, agg_cls): + agg = agg_cls() + out_pass = agg.aggregate_models( + [ScoreResult(score=0.9)], + threshold=0.5, + weights=[1.0], + ) + out_fail = agg.aggregate_models( + [ScoreResult(score=0.1)], + threshold=0.5, + weights=[1.0], + ) + assert out_pass.score == 1.0 + assert out_fail.score == 0.0 + + +class TestGetBuiltinModelsAggregator: + + def test_known_names(self): + for name in ("all_pass", "any_pass", "majority_pass", "avg", "weighted_avg", "weighted_majority"): + assert get_builtin_models_aggregator(name) is not None + + def test_unknown_name_returns_none(self): + assert get_builtin_models_aggregator("nope") is None diff --git a/tests/evaluation/test_llm_judge_multi_model.py b/tests/evaluation/test_llm_judge_multi_model.py new file mode 100644 index 0000000..103d04c --- /dev/null +++ b/tests/evaluation/test_llm_judge_multi_model.py @@ -0,0 +1,354 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""End-to-end multi-model evaluation tests for LLMJudge (mocked judge agents).""" + +from unittest.mock import patch + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import EvalMetric +from trpc_agent_sdk.evaluation import EvalStatus +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + +# Module-level dict configuring per-model stubbed outcomes: +# "valid" -> JSON judge response with verdict valid +# "invalid" -> JSON judge response with verdict invalid +# "raise" -> raise RuntimeError on get_response +_STUB_RESPONSES: dict[str, str] = {} + + +class _StubModel: + """Tag object returned from stubbed _create_judge_model.""" + + def __init__(self, name: str) -> None: + self._stub_name = name + + +def _stub_create_judge_model(opts): + return _StubModel(opts.model_name or "") + + +class _StubJudgeAgent: + """Stub _JudgeAgent: returns configured JSON per call (for llm_final_response).""" + + def __init__(self, model, config, system_prompt, output_schema=None, tools=None, planner=None): + self._model_name = getattr(model, "_stub_name", "") + self._planner = planner + + async def get_response(self, user_message: str) -> str: + outcome = _STUB_RESPONSES.get(self._model_name, "valid") + if outcome == "raise": + raise RuntimeError(f"stubbed judge {self._model_name} failure") + verdict = "valid" if outcome == "valid" else "invalid" + return ('{"reasoning":"stub","is_the_agent_response_valid":' + f'"{verdict}"' + "}") + + +@pytest.fixture(autouse=True) +def _reset_stubs(): + _STUB_RESPONSES.clear() + yield + _STUB_RESPONSES.clear() + + +def _patch_judge_internals(): + """Return list of started patchers; caller must stop them.""" + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", side_effect=_stub_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _StubJudgeAgent), + ] + for p in patchers: + p.start() + return patchers + + +def _stop(patchers): + for p in patchers: + p.stop() + + +def _make_metric(judge_models, models_aggregator="all_pass", parallel=True, threshold=0.5): + return EvalMetric( + metric_name="llm_final_response", + threshold=threshold, + criterion={ + "llm_judge": { + "judge_models": judge_models, + "models_aggregator": models_aggregator, + "parallel": parallel, + }, + }, + ) + + +def _make_invocation(user_text: str, response_text: str) -> Invocation: + from trpc_agent_sdk.types import Content + from trpc_agent_sdk.types import Part + + return Invocation( + invocation_id="inv", + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=Content(role="model", parts=[Part.from_text(text=response_text)]), + ) + + +class TestMultiModelAllPass: + + @pytest.mark.asyncio + async def test_both_valid_passes(self): + _STUB_RESPONSES.update({"glm-4.7": "valid", "gpt-4o": "valid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.PASSED + per = result.per_invocation_results[0] + assert per.eval_status == EvalStatus.PASSED + assert per.per_model_scores is not None + assert len(per.per_model_scores) == 2 + + @pytest.mark.asyncio + async def 
test_one_invalid_fails(self): + _STUB_RESPONSES.update({"glm-4.7": "valid", "gpt-4o": "invalid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.FAILED + per = result.per_invocation_results[0] + assert per.per_model_scores is not None + names = [m.model_name for m in per.per_model_scores] + assert "glm-4.7" in names and "gpt-4o" in names + gpt_entry = [m for m in per.per_model_scores if m.model_name == "gpt-4o"][0] + assert gpt_entry.passed is False + + +class TestMultiModelAnyPass: + + @pytest.mark.asyncio + async def test_one_valid_passes(self): + _STUB_RESPONSES.update({"glm-4.7": "invalid", "gpt-4o": "valid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="any_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.PASSED + + +class TestMultiModelParallelEqualsSerial: + + @pytest.mark.asyncio + async def test_parallel_same_as_serial(self): + _STUB_RESPONSES.update({"a": "valid", "b": "invalid"}) + + async def run_with(parallel): + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], + models_aggregator="all_pass", + parallel=parallel) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + return await j.evaluate([actual], [expected]) + finally: + _stop(ps) + + r_p = await run_with(True) + r_s = await run_with(False) + assert r_p.overall_eval_status == r_s.overall_eval_status + assert r_p.overall_score == r_s.overall_score + names_p = sorted(m.model_name for m in r_p.per_invocation_results[0].per_model_scores) + names_s = sorted(m.model_name for m in r_s.per_invocation_results[0].per_model_scores) + assert names_p == names_s + + +class TestMultiModelSoftFailure: + + @pytest.mark.asyncio + async def test_one_model_raises_counts_as_fail_vote(self): + _STUB_RESPONSES.update({"a": "valid", "b": "raise"}) + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.FAILED + per = r.per_invocation_results[0] + b_entry = [m for m in per.per_model_scores if m.model_name == "b"][0] + assert b_entry.passed is False + assert b_entry.score == 0.0 + assert "stubbed judge b failure" in b_entry.reason + + @pytest.mark.asyncio + async def test_all_models_raise_returns_not_evaluated(self): + _STUB_RESPONSES.update({"a": "raise", "b": "raise"}) + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + 
finally: + _stop(ps) + assert r.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED + + +class TestLegacySingleModelStillWorks: + + @pytest.mark.asyncio + async def test_legacy_single_judge_model(self): + _STUB_RESPONSES.update({"glm-4.7": "valid"}) + metric = EvalMetric( + metric_name="llm_final_response", + threshold=0.5, + criterion={ + "llm_judge": { + "judge_model": { + "model_name": "glm-4.7" + }, + }, + }, + ) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.PASSED + + +class TestUnknownAggregatorRaisesAtConstruction: + + def test_unknown_aggregator_raises(self): + metric = EvalMetric( + metric_name="llm_final_response", + threshold=0.5, + criterion={ + "llm_judge": { + "judge_models": [{ + "model_name": "a" + }], + "models_aggregator": "definitely_not_registered", + }, + }, + ) + ps = _patch_judge_internals() + try: + with pytest.raises(ValueError, match="definitely_not_registered"): + LLMJudge(metric) + finally: + _stop(ps) + + +class TestRegistryRegisteredAggregator: + """Verify that a registry-registered ModelsAggregator is picked up by _judge_for_metric.""" + + @pytest.mark.asyncio + async def test_registered_custom_aggregator_used(self): + """Test register_models_aggregator -> _judge_for_metric injects it; criterion name ignored.""" + from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY + from trpc_agent_sdk.evaluation import ScoreResult + from trpc_agent_sdk.evaluation._llm_evaluator import _judge_for_metric + + _STUB_RESPONSES.update({"a": "invalid", "b": "invalid"}) + + def always_pass(per_model, threshold, weights): + return ScoreResult(score=1.0, reason="custom always pass") + + LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", always_pass) + try: + metric = _make_metric( + [ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], + models_aggregator="all_pass", + ) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + judge = _judge_for_metric(metric) + r = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.PASSED + finally: + LLM_EVALUATOR_REGISTRY.unregister_models_aggregator("llm_final_response") diff --git a/tests/evaluation/test_llm_judge_think.py b/tests/evaluation/test_llm_judge_think.py new file mode 100644 index 0000000..fdcd3c1 --- /dev/null +++ b/tests/evaluation/test_llm_judge_think.py @@ -0,0 +1,393 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Unit tests for LLM judge `think` field.""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import patch + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 +from trpc_agent_sdk.evaluation._llm_criterion import JudgeModelOptions +from trpc_agent_sdk.evaluation._llm_judge import _JudgeAgent +from trpc_agent_sdk.evaluation._llm_judge import _judge_generation_config +from trpc_agent_sdk.evaluation._llm_judge import _merge_extra_body +from trpc_agent_sdk.types import HttpOptions + + +class TestJudgeModelOptionsThinkField: + + def test_think_field_default_is_none(self): + opts = JudgeModelOptions(model_name="m") + assert opts.think is None + + def test_think_field_accepts_bool(self): + assert JudgeModelOptions(model_name="m", think=True).think is True + assert JudgeModelOptions(model_name="m", think=False).think is False + + def test_think_field_rejects_non_bool(self): + # EvalBaseModel uses pydantic v2 default lax mode (no strict). Strings like "yes" + # would be coerced to bool, so use an object() instance that cannot be coerced. + with pytest.raises(Exception): + JudgeModelOptions(model_name="m", think=object()) + + +class TestMergeExtraBody: + + def test_none_http_options_creates_new_with_patch(self): + result = _merge_extra_body(None, {"chat_template_kwargs": {"enable_thinking": False}}) + assert isinstance(result, HttpOptions) + assert result.extra_body == {"chat_template_kwargs": {"enable_thinking": False}} + + def test_preserves_other_top_level_keys_in_extra_body(self): + existing = HttpOptions(extra_body={"custom_user_field": "abc"}) + result = _merge_extra_body(existing, {"chat_template_kwargs": {"enable_thinking": False}}) + assert result.extra_body["custom_user_field"] == "abc" + assert result.extra_body["chat_template_kwargs"] == {"enable_thinking": False} + + def test_preserves_sibling_keys_in_chat_template_kwargs(self): + existing = HttpOptions( + extra_body={ + "chat_template_kwargs": {"enable_thinking": True, "other_key": "x"}, + "custom_user_field": "abc", + }) + result = _merge_extra_body(existing, {"chat_template_kwargs": {"enable_thinking": False}}) + assert result.extra_body["chat_template_kwargs"]["other_key"] == "x" + assert result.extra_body["chat_template_kwargs"]["enable_thinking"] is False + assert result.extra_body["custom_user_field"] == "abc" + + def test_patch_is_copied_not_shared(self): + patch_dict = {"chat_template_kwargs": {"enable_thinking": False}} + result = _merge_extra_body(None, patch_dict) + patch_dict["chat_template_kwargs"]["enable_thinking"] = True + assert result.extra_body["chat_template_kwargs"]["enable_thinking"] is False + + +class TestJudgeGenerationConfigThink: + + def test_think_none_returns_none_thinking_config_and_none_http_options(self): + cfg, tc = _judge_generation_config(None, None) + assert tc is None + assert cfg.http_options is None + assert cfg.thinking_config is None # must stay empty; LlmAgent rejects otherwise + + def test_think_none_preserves_caller_http_options(self): + gen = {"http_options": {"extra_body": {"my_key": 1}}} + cfg, tc = _judge_generation_config(gen, None) + assert tc is None + assert cfg.http_options is not None + assert cfg.http_options.extra_body == {"my_key": 1} + + def test_think_false_builds_disabled_thinking_config(self): + cfg, tc = _judge_generation_config(None, False) + assert tc is not None + assert tc.include_thoughts is False + assert tc.thinking_budget == 0 + assert cfg.thinking_config is None + assert cfg.http_options is not None + assert 
cfg.http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": False}, + } + + def test_think_true_builds_enabled_thinking_config_auto_budget(self): + cfg, tc = _judge_generation_config(None, True) + assert tc is not None + assert tc.include_thoughts is True + assert tc.thinking_budget == -1 + assert cfg.http_options is not None + assert cfg.http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": True}, + } + + def test_think_false_overrides_generation_config_thinking_config(self): + gen = { + "max_tokens": 4096, + "thinking_config": {"include_thoughts": True, "thinking_budget": 2048}, + } + cfg, tc = _judge_generation_config(gen, False) + assert cfg.max_output_tokens == 4096 + assert tc is not None + assert tc.include_thoughts is False + assert tc.thinking_budget == 0 + + def test_think_false_deep_merges_extra_body_preserving_other_keys(self): + gen = { + "http_options": { + "extra_body": { + "chat_template_kwargs": {"enable_thinking": True, "other_key": "x"}, + "custom_user_field": "abc", + }, + }, + } + cfg, tc = _judge_generation_config(gen, False) + assert tc is not None + assert cfg.http_options.extra_body["custom_user_field"] == "abc" + assert cfg.http_options.extra_body["chat_template_kwargs"]["other_key"] == "x" + assert ( + cfg.http_options.extra_body["chat_template_kwargs"]["enable_thinking"] is False + ) + + def test_generation_config_thinking_config_used_when_think_is_none(self): + gen = {"thinking_config": {"include_thoughts": True, "thinking_budget": 512}} + cfg, tc = _judge_generation_config(gen, None) + assert tc is not None + assert tc.include_thoughts is True + assert tc.thinking_budget == 512 + assert cfg.http_options is None + + +class TestJudgeAgentPlanner: + + def test_judge_agent_accepts_planner_and_forwards_to_llm_agent(self): + captured: dict[str, Any] = {} + + class _FakeLlmAgent: + + def __init__(self, **kwargs): + captured.update(kwargs) + + fake_planner = object() + with patch("trpc_agent_sdk.evaluation._llm_judge.LlmAgent", _FakeLlmAgent): + _JudgeAgent( + model=object(), + config=None, + system_prompt="sp", + output_schema=None, + tools=None, + planner=fake_planner, + ) + assert captured.get("planner") is fake_planner + + def test_judge_agent_planner_defaults_to_none(self): + captured: dict[str, Any] = {} + + class _FakeLlmAgent: + + def __init__(self, **kwargs): + captured.update(kwargs) + + with patch("trpc_agent_sdk.evaluation._llm_judge.LlmAgent", _FakeLlmAgent): + _JudgeAgent( + model=object(), + config=None, + system_prompt="sp", + ) + assert captured.get("planner") is None + + +# --- Integration tests: end-to-end LLMJudge wiring --- + + +class _SpyModel: + + def __init__(self, name: str) -> None: + self._stub_name = name + + +def _spy_create_judge_model(opts): + return _SpyModel(opts.model_name or "") + + +class _SpyJudgeAgent: + """Captures constructor kwargs for every judge model built by LLMJudge.""" + + instances: list[dict[str, Any]] = [] + + def __init__(self, model, config, system_prompt, output_schema=None, tools=None, planner=None): + _SpyJudgeAgent.instances.append({ + "model_name": getattr(model, "_stub_name", ""), + "config": config, + "planner": planner, + }) + + async def get_response(self, user_message: str) -> str: # pragma: no cover - not invoked here + return '{"reasoning":"stub","is_the_agent_response_valid":"valid"}' + + +def _make_metric(judge_models: list[dict[str, Any]]): + from trpc_agent_sdk.evaluation import EvalMetric + return EvalMetric( + metric_name="llm_final_response", + threshold=1.0, 
+ criterion={ + "llm_judge": { + "judge_models": judge_models, + "models_aggregator": "all_pass", + }, + }, + ) + + +@pytest.fixture(autouse=True) +def _reset_spy(): + _SpyJudgeAgent.instances.clear() + yield + _SpyJudgeAgent.instances.clear() + + +def _build_judge(judge_models: list[dict[str, Any]]): + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + metric = _make_metric(judge_models) + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", + side_effect=_spy_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _SpyJudgeAgent), + ] + for p in patchers: + p.start() + try: + return LLMJudge(metric) + finally: + for p in patchers: + p.stop() + + +class TestLLMJudgeThinkIntegration: + + def test_legacy_single_judge_model_supports_think(self): + from trpc_agent_sdk.evaluation import EvalMetric + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.planners import BuiltInPlanner + metric = EvalMetric( + metric_name="llm_final_response", + threshold=1.0, + criterion={ + "llm_judge": { + "judge_model": {"model_name": "glm-4.7", "think": False}, + }, + }, + ) + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", + side_effect=_spy_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _SpyJudgeAgent), + ] + for p in patchers: + p.start() + try: + LLMJudge(metric) + finally: + for p in patchers: + p.stop() + assert len(_SpyJudgeAgent.instances) == 1 + inst = _SpyJudgeAgent.instances[0] + assert isinstance(inst["planner"], BuiltInPlanner) + assert inst["planner"].thinking_config.include_thoughts is False + assert inst["planner"].thinking_config.thinking_budget == 0 + + def test_per_judge_independent_think(self): + from trpc_agent_sdk.planners import BuiltInPlanner + _build_judge([ + {"model_name": "glm-4.7", "think": False}, + {"model_name": "gpt-4o", "think": True}, + {"model_name": "qwen2.5"}, # think None -> no planner + ]) + assert len(_SpyJudgeAgent.instances) == 3 + by_name = {i["model_name"]: i for i in _SpyJudgeAgent.instances} + + glm = by_name["glm-4.7"] + assert isinstance(glm["planner"], BuiltInPlanner) + assert glm["planner"].thinking_config.include_thoughts is False + assert glm["planner"].thinking_config.thinking_budget == 0 + assert glm["config"].http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": False}, + } + + gpt = by_name["gpt-4o"] + assert isinstance(gpt["planner"], BuiltInPlanner) + assert gpt["planner"].thinking_config.include_thoughts is True + assert gpt["planner"].thinking_config.thinking_budget == -1 + assert gpt["config"].http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": True}, + } + + qwen = by_name["qwen2.5"] + assert qwen["planner"] is None + assert qwen["config"].http_options is None + + def test_think_none_with_caller_http_options_preserves_it(self): + _build_judge([{ + "model_name": "m", + "generation_config": {"http_options": {"extra_body": {"preserved": 1}}}, + }]) + assert len(_SpyJudgeAgent.instances) == 1 + inst = _SpyJudgeAgent.instances[0] + assert inst["planner"] is None + assert inst["config"].http_options.extra_body == {"preserved": 1} + + +class TestCreateJudgeModelRouting: + """Verify _create_judge_model picks OpenAIModel directly when provider is + empty/openai (so http_options.extra_body actually reaches the backend), + and falls back to ModelRegistry.create_model for other providers (LiteLLM).""" + + def test_empty_provider_uses_openaimodel_directly(self): + 
from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="", + model_name="glm-5.1-w4afp8", + api_key="k", + base_url="http://host/v1", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_openai_provider_uses_openaimodel_directly(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="k", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_openai_provider_case_insensitive(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="OpenAI", + model_name="gpt-4o", + api_key="k", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_non_openai_provider_uses_registry(self): + from trpc_agent_sdk.evaluation import _llm_judge as llm_judge_mod + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + opts = JudgeModelOptions( + provider_name="anthropic", + model_name="claude-3-5-sonnet", + api_key="k", + ) + sentinel = object() + with patch.object( + llm_judge_mod.ModelRegistry, + "create_model", + return_value=sentinel, + ) as mock_reg: + model = _create_judge_model(opts) + assert model is sentinel + args, kwargs = mock_reg.call_args + assert args[0] == "anthropic/claude-3-5-sonnet" + assert kwargs.get("api_key") == "k" + + def test_openaimodel_receives_model_name_and_base_url(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + opts = JudgeModelOptions( + provider_name="", + model_name="glm-5.1-w4afp8", + api_key="sk-x", + base_url="http://example/v1", + ) + model = _create_judge_model(opts) + assert getattr(model, "_model_name", None) == "glm-5.1-w4afp8" + assert getattr(model, "_base_url", None) == "http://example/v1" diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 8ff7c1f..1d4eb20 100644 --- a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -94,6 +94,7 @@ from ._eval_result import EvalStatusCounts from ._eval_result import EvaluateResult from ._eval_result import EvaluationResult +from ._eval_result import NamedScoreResult from ._eval_result import PerInvocationResult from ._eval_service_base import BaseEvalService from ._eval_service_base import EvaluateConfig @@ -121,6 +122,10 @@ from ._in_memory_eval_sets_manager import InMemoryEvalSetsManager from ._llm_criterion import DEFAULT_KNOWLEDGE_TOOL_NAMES from ._llm_criterion import DEFAULT_NUM_SAMPLES +from ._llm_criterion import BUILT_IN_MODELS_AGGREGATORS +from ._llm_criterion import DEFAULT_MODELS_AGGREGATOR +from ._llm_criterion import DEFAULT_PARALLEL +from ._llm_criterion import WEIGHTED_MODELS_AGGREGATORS from ._llm_criterion import JudgeModelOptions from ._llm_criterion import LLMJudgeCriterion from ._llm_criterion import Rubric @@ -137,20 +142,29 @@ from ._llm_evaluator import LLM_EVALUATOR_REGISTRY from ._llm_evaluator import LLM_METRIC_NAMES from ._llm_evaluator import MessagesConstructorFn +from ._llm_evaluator import ModelsAggregatorFn from ._llm_evaluator import ResponseScorerFn from ._llm_evaluator import SamplesAggregatorFn +from ._llm_judge import AllPassModelsAggregator +from ._llm_judge import 
AnyPassModelsAggregator from ._llm_judge import AverageInvocationsAggregator +from ._llm_judge import AverageModelsAggregator from ._llm_judge import DefaultMessagesConstructor from ._llm_judge import DefaultResponseScorer from ._llm_judge import FinalResponseOutput from ._llm_judge import InvocationsAggregator from ._llm_judge import LLMJudge +from ._llm_judge import MajorityPassModelsAggregator from ._llm_judge import MajorityVoteSamplesAggregator from ._llm_judge import MessagesConstructor +from ._llm_judge import ModelsAggregator from ._llm_judge import ResponseScorer from ._llm_judge import RubricItemOutput from ._llm_judge import RubricJudgeOutput from ._llm_judge import SamplesAggregator +from ._llm_judge import WeightedAverageModelsAggregator +from ._llm_judge import WeightedMajorityModelsAggregator +from ._llm_judge import get_builtin_models_aggregator from ._local_eval_service import LocalEvalService from ._local_eval_set_results_manager import LocalEvalSetResultsManager from ._local_eval_sets_manager import LocalEvalSetsManager @@ -223,6 +237,7 @@ "EvaluateResult", "EvaluationResult", "PerInvocationResult", + "NamedScoreResult", "BaseEvalService", "EvaluateConfig", "EvaluateRequest", @@ -239,6 +254,10 @@ "LLMJudgeCriterion", "DEFAULT_KNOWLEDGE_TOOL_NAMES", "DEFAULT_NUM_SAMPLES", + "BUILT_IN_MODELS_AGGREGATORS", + "DEFAULT_MODELS_AGGREGATOR", + "DEFAULT_PARALLEL", + "WEIGHTED_MODELS_AGGREGATORS", "RubricScore", "ScoreResult", "AverageInvocationsAggregator", @@ -249,6 +268,14 @@ "MessagesConstructor", "ResponseScorer", "SamplesAggregator", + "AllPassModelsAggregator", + "AnyPassModelsAggregator", + "AverageModelsAggregator", + "MajorityPassModelsAggregator", + "ModelsAggregator", + "WeightedAverageModelsAggregator", + "WeightedMajorityModelsAggregator", + "get_builtin_models_aggregator", "LocalEvalSetResultsManager", "LocalEvalSetsManager", "BaseUserSimulatorConfig", @@ -266,6 +293,7 @@ "LLM_EVALUATOR_REGISTRY", "LLM_METRIC_NAMES", "MessagesConstructorFn", + "ModelsAggregatorFn", "ResponseScorerFn", "SamplesAggregatorFn", "LocalEvalService", diff --git a/trpc_agent_sdk/evaluation/_agent_evaluator.py b/trpc_agent_sdk/evaluation/_agent_evaluator.py index 8d93c1d..09e1ed3 100644 --- a/trpc_agent_sdk/evaluation/_agent_evaluator.py +++ b/trpc_agent_sdk/evaluation/_agent_evaluator.py @@ -51,6 +51,7 @@ from ._local_eval_service import LocalEvalService from . 
import _utils from ._eval_callbacks import Callbacks +from ._eval_case import EvalModeTrace from ._eval_config import EvalConfig from ._eval_metrics import EvalStatus from ._eval_pass import pass_at_k as _pass_at_k @@ -85,8 +86,9 @@ class _EvalExecuter: def __init__( self, - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -95,6 +97,7 @@ def __init__( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ): self._agent_module = agent_module self._eval_dataset_file_path_or_dir = eval_dataset_file_path_or_dir @@ -106,6 +109,7 @@ def __init__( self._case_parallelism = case_parallelism self._case_eval_parallelism = case_eval_parallelism self._callbacks = callbacks + self._eval_metrics_file_path_or_dir = eval_metrics_file_path_or_dir self._result: Optional[EvaluateResult] = None self._task: Optional[asyncio.Task] = None @@ -120,6 +124,10 @@ async def _run(self) -> None: case_parallelism = self._case_parallelism case_eval_parallelism = self._case_eval_parallelism callbacks = self._callbacks + eval_metrics_file_path_or_dir = self._eval_metrics_file_path_or_dir + + # Resolve shared config once; None means "fall back to dataset-local test_config.json". + shared_eval_config = AgentEvaluator._resolve_shared_config(eval_metrics_file_path_or_dir) test_files = [] if os.path.isdir(eval_dataset_file_path_or_dir): @@ -139,15 +147,22 @@ async def _run(self) -> None: all_results: list[tuple[str, list[str]]] = [] results_by_eval_set_id: dict[str, EvalSetAggregateResult] = {} for test_file in test_files: - eval_config = AgentEvaluator.find_config_for_test_file(test_file) + if shared_eval_config is not None: + eval_config = shared_eval_config + # When shared config is explicit, honor its num_runs iff user + # set one; we keep the same precedence behavior as the + # dataset-local path (config overrides parameter). 
+ num_runs_for_set = eval_config.num_runs + else: + eval_config = AgentEvaluator.find_config_for_test_file(test_file) + # Config (test_config.json) overrides parameter + config_path = os.path.join(os.path.dirname(test_file), "test_config.json") + num_runs_for_set = (eval_config.num_runs if os.path.exists(config_path) else num_runs) eval_set = AgentEvaluator._load_eval_set_from_file(test_file, eval_config) - # Config (test_config.json) overrides parameter - config_path = os.path.join(os.path.dirname(test_file), "test_config.json") - num_runs_for_set = (eval_config.num_runs if os.path.exists(config_path) else num_runs) failed_summary, details_lines, result_lines, eval_results_by_eval_id = ( await AgentEvaluator.evaluate_eval_set( + eval_set, agent_module=agent_module, - eval_set=eval_set, eval_config=eval_config, num_runs=num_runs_for_set, agent_name=agent_name, @@ -172,7 +187,7 @@ async def _run(self) -> None: _RESULT_HANDLER.print_evaluation_report( all_details=all_details, all_results=all_results, - display_agent_name=agent_name or agent_module, + display_agent_name=agent_name or agent_module or "trace-only", num_runs=num_runs_for_set, ) self._result = EvaluateResult(results_by_eval_set_id=results_by_eval_set_id) @@ -232,8 +247,8 @@ class AgentEvaluator: # Run evaluation await AgentEvaluator.evaluate_eval_set( - agent=my_agent, - eval_set=eval_set, + eval_set, + agent_module="my_pkg.my_agent", eval_config=eval_config, ) ``` @@ -253,10 +268,20 @@ def find_config_for_test_file(test_file: str) -> EvalConfig: config_path = os.path.join(test_folder, "test_config.json") return AgentEvaluator._load_config_from_file(config_path) + @staticmethod + def _is_trace_only(eval_set: EvalSet) -> bool: + """Return True iff every case in the EvalSet has eval_mode == 'trace'. + + Empty eval_cases list returns True by ``all()`` semantics; callers + that require at least one case should validate separately. + """ + return all(case.eval_mode == EvalModeTrace for case in eval_set.eval_cases) + @staticmethod async def evaluate( - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -265,13 +290,17 @@ async def evaluate( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ) -> None: """Run evaluation; no result returned. Use get_executer() if you need the result. Args: - agent_module: Python module path containing the agent (look for 'root_agent' or 'get_agent_async'). eval_dataset_file_path_or_dir: Path to eval dataset file or directory - (recursively .test.json / .evalset.json). + (recursively .test.json / .evalset.json). Positional-only usage + recommended. + agent_module: Python module path containing the agent (look for + 'root_agent' or 'get_agent_async'). Optional when every case in + every discovered dataset uses ``eval_mode='trace'``. num_runs: Number of runs per eval set. agent_name: Display name of the agent. print_detailed_results: Whether to print per-case details. @@ -281,10 +310,14 @@ async def evaluate( case_eval_parallelism: Max concurrent cases for evaluation (scoring); None uses default. callbacks: Optional lifecycle callbacks. + eval_metrics_file_path_or_dir: Optional explicit path to a shared + evaluation config JSON (file) or directory containing a single + config JSON. 
When provided, overrides the dataset-local + ``test_config.json`` convention for ALL discovered datasets. """ executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir, agent_module=agent_module, - eval_dataset_file_path_or_dir=eval_dataset_file_path_or_dir, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -293,13 +326,15 @@ async def evaluate( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, ) await executer.evaluate() @staticmethod def get_executer( - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -308,13 +343,17 @@ def get_executer( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ) -> _EvalExecuter: """Return an executer (does not run). Await executer.evaluate() then executer.get_result() for result. Args: - agent_module: Python module path containing the agent (look for 'root_agent' or 'get_agent_async'). eval_dataset_file_path_or_dir: Path to eval dataset file or directory - (recursively .test.json / .evalset.json). + (recursively .test.json / .evalset.json). Positional-only usage + recommended. + agent_module: Python module path containing the agent (look for + 'root_agent' or 'get_agent_async'). Optional when every case in + every discovered dataset uses ``eval_mode='trace'``. num_runs: Number of runs per eval set. agent_name: Display name of the agent. print_detailed_results: Whether to print per-case details. @@ -324,13 +363,17 @@ def get_executer( case_eval_parallelism: Max concurrent cases for evaluation (scoring); None uses default. callbacks: Optional lifecycle callbacks. + eval_metrics_file_path_or_dir: Optional explicit path to a shared + evaluation config JSON (file) or directory containing a single + config JSON. When provided, overrides the dataset-local + ``test_config.json`` convention for ALL discovered datasets. Returns: _EvalExecuter: Await .evaluate() to run, then .get_result() for EvaluateResult. """ return _EvalExecuter( + eval_dataset_file_path_or_dir, agent_module=agent_module, - eval_dataset_file_path_or_dir=eval_dataset_file_path_or_dir, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -339,6 +382,7 @@ def get_executer( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, ) @staticmethod @@ -384,8 +428,9 @@ def pass_hat_k(n: int, c: int, k: int) -> float: @staticmethod async def evaluate_eval_set( - agent_module: str, eval_set: EvalSet, + *, + agent_module: Optional[str] = None, eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, @@ -399,10 +444,14 @@ async def evaluate_eval_set( """Evaluates an agent using the given EvalSet. Args: + eval_set: The eval set. (Positional-only usage recommended.) agent_module: The path to python module that contains the definition of the agent. There is convention in place here, where the code is going to look for 'root_agent' or `get_agent_async` in the loaded module. - eval_set: The eval set. 
+ Optional when every case in ``eval_set`` has ``eval_mode == 'trace'`` + (pre-recorded conversation used as inference result, no agent run). + Required otherwise; a ``ValueError`` is raised listing non-trace + case ids. eval_config: The evaluation config. num_runs: Number of times all entries in the eval dataset should be assessed. @@ -427,7 +476,16 @@ async def evaluate_eval_set( if eval_config is None: raise ValueError("`eval_config` is required.") - agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) + trace_only = AgentEvaluator._is_trace_only(eval_set) + if agent_module is None and not trace_only: + non_trace_ids = [case.eval_id for case in eval_set.eval_cases if case.eval_mode != EvalModeTrace] + raise ValueError("`agent_module` is required unless every case in eval_set uses " + "eval_mode='trace'. Non-trace case ids: " + f"{non_trace_ids}") + + agent_for_eval: Optional[BaseAgent] = None + if agent_module is not None: + agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) eval_metrics = eval_config.get_eval_metrics() user_simulator_provider = UserSimulatorProvider(user_simulator_config=eval_config.user_simulator_config) @@ -448,7 +506,7 @@ async def evaluate_eval_set( # Step 2: Post-process the results failures: list[str] = [] - display_agent_name = agent_name or agent_module + display_agent_name = agent_name or agent_module or "trace-only" details_lines: list[str] = [] result_lines: list[str] = [] @@ -619,6 +677,70 @@ def _load_config_from_file(file_path: Optional[str]) -> EvalConfig: except Exception as ex: raise ValueError(f"Failed to load config from {file_path}: {ex}") + @staticmethod + def _load_config_from_file_strict(file_path: str) -> EvalConfig: + """Load EvalConfig from JSON file; raise if missing (no default fallback). + + Unlike ``_load_config_from_file`` which silently returns a default config + when the file is absent, this strict variant is intended for explicit + user-supplied paths (e.g. ``eval_metrics_file_path_or_dir``) where a + missing file is a programmer error, not a signal to use defaults. + + Args: + file_path: Path to the config JSON file. Must exist. + + Returns: + EvalConfig instance loaded from ``file_path``. + + Raises: + FileNotFoundError: If ``file_path`` does not exist. + ValueError: If the file cannot be parsed as EvalConfig JSON. + """ + if not os.path.exists(file_path): + raise FileNotFoundError(f"Eval metrics/config file not found: {file_path}") + return AgentEvaluator._load_config_from_file(file_path) + + @staticmethod + def _resolve_shared_config(eval_metrics_file_path_or_dir: Optional[str], ) -> Optional[EvalConfig]: + """Resolve a user-provided metrics/config path into a single EvalConfig. + + Resolution rules: + - ``None`` -> returns ``None`` (no shared config; callers + fall back to per-dataset ``test_config.json`` convention). + - Path is a regular file -> load that file strictly. + - Path is a directory: + - Exactly one ``*.json`` in the directory (non-recursive) -> load it. + - Zero ``*.json`` -> ``FileNotFoundError``. + - Two or more -> ``ValueError`` (ambiguous). + - Path does not exist -> ``FileNotFoundError``. + + Args: + eval_metrics_file_path_or_dir: User-supplied path or None. + + Returns: + EvalConfig when a shared config was resolved, else None. 
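+
+        Example (illustrative; the directory name and file are hypothetical)::
+
+            # shared_configs/ contains exactly one JSON file, eval_config.json,
+            # so the directory branch resolves to that single file.
+            cfg = AgentEvaluator._resolve_shared_config("shared_configs")
+            # cfg is the EvalConfig parsed from shared_configs/eval_config.json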
+ """ + if eval_metrics_file_path_or_dir is None: + return None + + path = eval_metrics_file_path_or_dir + if not os.path.exists(path): + raise FileNotFoundError(f"eval_metrics_file_path_or_dir does not exist: {path}") + + if os.path.isfile(path): + return AgentEvaluator._load_config_from_file_strict(path) + + # Directory case: non-recursive lookup for *.json + json_files = sorted( + os.path.join(path, entry) for entry in os.listdir(path) + if entry.endswith(".json") and os.path.isfile(os.path.join(path, entry))) + if not json_files: + raise FileNotFoundError(f"No *.json config file found in directory: {path}") + if len(json_files) > 1: + raise ValueError("eval_metrics_file_path_or_dir directory contains multiple " + f"*.json files; expected exactly one. Found: {json_files}") + return AgentEvaluator._load_config_from_file_strict(json_files[0]) + @staticmethod def _get_eval_sets_manager(app_name: str, eval_set: EvalSet) -> EvalSetsManager: """Create and populate an in-memory eval sets manager. @@ -644,7 +766,7 @@ def _get_eval_sets_manager(app_name: str, eval_set: EvalSet) -> EvalSetsManager: @staticmethod async def _get_eval_results_by_eval_id( - agent_for_eval: BaseAgent, + agent_for_eval: Optional[BaseAgent], eval_set: EvalSet, eval_metrics: list, num_runs: int, @@ -735,9 +857,12 @@ async def _get_eval_results_by_eval_id( return eval_results_by_eval_id + # yapf: disable @staticmethod def _get_eval_metric_results_with_invocation( - eval_results_per_eval_id: list[EvalCaseResult], ) -> dict[str, list[_utils.MetricRunRecord]]: + eval_results_per_eval_id: list[EvalCaseResult], + ) -> dict[str, list[_utils.MetricRunRecord]]: + # yapf: enable """Returns MetricRunRecord grouped by metric. EvalCaseResult contain results for each metric per invocation. diff --git a/trpc_agent_sdk/evaluation/_eval_case.py b/trpc_agent_sdk/evaluation/_eval_case.py index e6e7bd9..8ac476d 100644 --- a/trpc_agent_sdk/evaluation/_eval_case.py +++ b/trpc_agent_sdk/evaluation/_eval_case.py @@ -210,13 +210,29 @@ class EvalCase(EvalBaseModel): @model_validator(mode="after") def ensure_conversation_xor_conversation_scenario(self) -> EvalCase: - """Trace: conversation or actual_conversation (no scenario). Default: conversation xor conversation_scenario.""" + """Trace: actual_conversation is required (conversation optional as reference). + Default: conversation xor conversation_scenario. + + Trace-mode legal shapes (after Bug 3.2 strict fix): + * actual_conversation only -> scenario 3 (no reference) + * actual_conversation + conversation -> scenario 1 (full comparison) + + The legacy shape of providing only `conversation` under eval_mode='trace' + is now rejected because the field would have to serve as both the + recorded trace (actual) and the reference (expected), which is ambiguous + and causes silent evaluation errors in reference-based metrics + (see docs/superpowers/specs/2026-05-06-evaluator-trace-metric-strict-compat-design.md). + """ is_trace = self.eval_mode == EvalModeTrace if is_trace: if self.conversation_scenario is not None: raise ValueError("conversation_scenario is not allowed when eval_mode is \"trace\"") - if not self.conversation and not self.actual_conversation: - raise ValueError("trace mode requires at least one of conversation or actual_conversation") + if not self.actual_conversation: + raise ValueError("eval_mode='trace' requires `actual_conversation` field. " + "If the provided `conversation` is a recorded trace, move it to " + "`actual_conversation`. 
If it's a reference answer, also provide " + "`actual_conversation` for scenario-1 full comparison. " + "See Bug 3.2 of evaluator-trace-metric-strict-compat-design.") return self if (self.conversation is None) == (self.conversation_scenario is None): raise ValueError("Exactly one of conversation and conversation_scenario must be provided") diff --git a/trpc_agent_sdk/evaluation/_eval_result.py b/trpc_agent_sdk/evaluation/_eval_result.py index a10259f..700bb0e 100644 --- a/trpc_agent_sdk/evaluation/_eval_result.py +++ b/trpc_agent_sdk/evaluation/_eval_result.py @@ -35,6 +35,29 @@ from ._eval_metrics import EvalStatus +class NamedScoreResult(EvalBaseModel): + """One judge model's per-invocation score, used inside PerInvocationResult.per_model_scores. + + Attributes: + model_name: Judge model name. + provider_name: Provider name; empty when unset. + score: Numeric score from this model after SamplesAggregator on its own samples. + reason: Reason text from the judge model (or exception text on soft failure). + rubric_scores: Per-rubric scores (rubric metrics). Use _llm_criterion.RubricScore. + passed: True iff score >= metric.threshold. + """ + + model_name: str = Field(default="", description="Judge model name.") + provider_name: str = Field(default="", description="Provider name.") + score: float = Field(default=0.0, description="Score from this model.") + reason: str = Field(default="", description="Reason from this model.") + rubric_scores: list[Any] = Field( + default_factory=list, + description="Per-rubric scores from this model (rubric metrics).", + ) + passed: bool = Field(default=False, description="True iff score >= threshold.") + + class PerInvocationResult(EvalBaseModel): """Result for a single invocation. @@ -56,6 +79,11 @@ class PerInvocationResult(EvalBaseModel): default=None, description="Per-rubric scores (LLM rubric metrics). Use _llm_criterion.RubricScore.", ) + per_model_scores: Optional[list[NamedScoreResult]] = Field( + default=None, + description=("Per-judge-model breakdown for multi-model LLM judge metrics. " + "None for single-model or non-LLM metrics (back-compatible)."), + ) class EvaluationResult(EvalBaseModel): diff --git a/trpc_agent_sdk/evaluation/_evaluator_base.py b/trpc_agent_sdk/evaluation/_evaluator_base.py index 6404e29..98e4f89 100644 --- a/trpc_agent_sdk/evaluation/_evaluator_base.py +++ b/trpc_agent_sdk/evaluation/_evaluator_base.py @@ -26,6 +26,7 @@ from abc import ABC from abc import abstractmethod +from typing import ClassVar from typing import Optional from ._eval_case import Invocation @@ -39,6 +40,22 @@ class Evaluator(ABC): and expected invocations and computing a score. """ + requires_reference: ClassVar[bool] = True + """Whether this metric requires expected_invocations with non-placeholder + `final_response` / `intermediate_data` fields (a "reference answer"). + + Set to False for reference-free metrics (e.g. rubric-based LLM judges that + evaluate actual output against a rubric, not against a reference answer). + + Checked by `LocalEvalService._validate_metric_compat` at evaluate() startup + to fail-fast on incompatible (eval_case, metric) combinations. Defaults to + True — safer for new evaluators; opt into False explicitly when adding a + reference-free metric. + + See Bug 3.1 of + docs/superpowers/specs/2026-05-06-evaluator-trace-metric-strict-compat-design.md. 
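+
+    Illustrative declaration (hypothetical evaluator name; mirrors how the
+    rubric evaluators later in this patch opt out)::
+
+        class MyRubricOnlyEvaluator(Evaluator):
+            requires_reference = False  # judged against rubrics, no reference answer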
+ """ + @abstractmethod def evaluate_invocations( self, diff --git a/trpc_agent_sdk/evaluation/_evaluator_registry.py b/trpc_agent_sdk/evaluation/_evaluator_registry.py index 8044d14..aa2c67a 100644 --- a/trpc_agent_sdk/evaluation/_evaluator_registry.py +++ b/trpc_agent_sdk/evaluation/_evaluator_registry.py @@ -103,6 +103,19 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator: return evaluator + def get_evaluator_class(self, eval_metric: EvalMetric) -> Type[Evaluator]: + """Return evaluator CLASS (not instance) for inspecting class attributes. + + Used by LocalEvalService._validate_metric_compat to look up the + `requires_reference` ClassVar without instantiating the evaluator + (which may have side effects like loading rouge-score or creating an + LLM judge). Raises if metric not registered. + """ + if eval_metric.metric_name not in self._registry: + raise ValueError(f"No evaluator registered for metric: {eval_metric.metric_name}. " + f"Available metrics: {list(self._registry.keys())}") + return self._registry[eval_metric.metric_name] + # Global default registry EVALUATOR_REGISTRY = EvaluatorRegistry() diff --git a/trpc_agent_sdk/evaluation/_final_response_evaluator.py b/trpc_agent_sdk/evaluation/_final_response_evaluator.py index 8b5e06a..9450211 100644 --- a/trpc_agent_sdk/evaluation/_final_response_evaluator.py +++ b/trpc_agent_sdk/evaluation/_final_response_evaluator.py @@ -33,6 +33,8 @@ class FinalResponseEvaluator(Evaluator): else exact text. Score 1.0 or 0.0 per invocation, overall = mean. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None, diff --git a/trpc_agent_sdk/evaluation/_llm_criterion.py b/trpc_agent_sdk/evaluation/_llm_criterion.py index 6e56b1b..7294c2f 100644 --- a/trpc_agent_sdk/evaluation/_llm_criterion.py +++ b/trpc_agent_sdk/evaluation/_llm_criterion.py @@ -13,10 +13,22 @@ from pydantic import Field from pydantic import model_serializer +from pydantic import model_validator from ._common import EvalBaseModel DEFAULT_NUM_SAMPLES = 1 +DEFAULT_MODELS_AGGREGATOR = "all_pass" +DEFAULT_PARALLEL = True +BUILT_IN_MODELS_AGGREGATORS = frozenset({ + "all_pass", + "any_pass", + "majority_pass", + "avg", + "weighted_avg", + "weighted_majority", +}) +WEIGHTED_MODELS_AGGREGATORS = frozenset({"weighted_avg", "weighted_majority"}) def sanitize_criterion_for_export(criterion: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]: @@ -80,6 +92,17 @@ class JudgeModelOptions(EvalBaseModel): default=None, description="Generation params: max_tokens, temperature, stream, etc.", ) + weight: float = Field( + default=1.0, + description="Weight for weighted_* models_aggregator; ignored otherwise.", + ) + think: Optional[bool] = Field( + default=None, + description=("Toggle judge thinking mode. None (default): no change; " + "False: disable thinking via both ThinkingConfig(include_thoughts=False, " + "thinking_budget=0) and extra_body.chat_template_kwargs.enable_thinking=False; " + "True: enable thinking with automatic budget."), + ) def get_num_samples(self) -> int: """Return configured num_samples or DEFAULT_NUM_SAMPLES.""" @@ -123,6 +146,22 @@ class LLMJudgeCriterion(EvalBaseModel): default=None, description="Judge model options (required for all LLM judge metrics).", ) + judge_models: Optional[list[JudgeModelOptions]] = Field( + default=None, + description=("Multi-model judge list. Mutually exclusive with judge_model. 
" + "Cross-model results are combined by models_aggregator."), + ) + models_aggregator: str = Field( + default=DEFAULT_MODELS_AGGREGATOR, + description=("Cross-model aggregation strategy. Built-in: all_pass | any_pass | " + "majority_pass | avg | weighted_avg | weighted_majority. " + "Custom names must be registered via " + "LLM_EVALUATOR_REGISTRY.register_models_aggregator before LLMJudge construction."), + ) + parallel: bool = Field( + default=DEFAULT_PARALLEL, + description="Run multiple judge models concurrently via asyncio.gather (default True).", + ) rubrics: list[Rubric] = Field( default_factory=list, description="Rubric items for rubric-based metrics.", @@ -145,6 +184,44 @@ def get_knowledge_tool_names(self) -> list[str]: return list(self.knowledge_tool_names) return list(DEFAULT_KNOWLEDGE_TOOL_NAMES) + @model_validator(mode="after") + def _validate_multi_model_fields(self) -> "LLMJudgeCriterion": + """Validate judge_model/judge_models exclusivity, weights, and aggregator name shape. + + Registry-registered aggregator names are not validated here; only built-in + names are rejected at LLMJudge construction time when registry lookup misses. + """ + if self.judge_model is not None and self.judge_models is not None: + raise ValueError("judge_model and judge_models are mutually exclusive; set only one") + if self.judge_models is not None and len(self.judge_models) == 0: + raise ValueError("judge_models must not be empty when set") + if not isinstance(self.models_aggregator, str) or not self.models_aggregator: + raise ValueError("models_aggregator must be a non-empty string") + models = self.get_judge_models() + for m in models: + if m.weight < 0: + raise ValueError(f"judge model weight must not be negative: model_name={m.model_name!r} " + f"weight={m.weight}") + if self.models_aggregator in WEIGHTED_MODELS_AGGREGATORS and models: + total = sum(m.weight for m in models) + if total <= 0: + raise ValueError(f"models_aggregator={self.models_aggregator!r} requires sum of weights > 0; " + f"got total weight {total}") + return self + + def get_judge_models(self) -> list[JudgeModelOptions]: + """Return effective list of judge model options. + + - judge_models set -> returned as-is. + - only legacy judge_model set -> returned as 1-element list. + - neither set -> []. Caller (LLMJudge) decides whether to error. + """ + if self.judge_models is not None: + return list(self.judge_models) + if self.judge_model is not None: + return [self.judge_model] + return [] + @classmethod def from_dict(cls, d: dict | None) -> Optional["LLMJudgeCriterion"]: """Build from config dict (judgeModel, rubrics, knowledge_tool_names; camelCase or snake_case). 
diff --git a/trpc_agent_sdk/evaluation/_llm_evaluator.py b/trpc_agent_sdk/evaluation/_llm_evaluator.py index e780c7e..189b16f 100644 --- a/trpc_agent_sdk/evaluation/_llm_evaluator.py +++ b/trpc_agent_sdk/evaluation/_llm_evaluator.py @@ -25,6 +25,7 @@ from ._llm_judge import InvocationsAggregator from ._llm_judge import LLMJudge from ._llm_judge import MessagesConstructor +from ._llm_judge import ModelsAggregator from ._llm_judge import ResponseScorer from ._llm_judge import SamplesAggregator @@ -40,6 +41,7 @@ ResponseScorerFn = Callable[[str, str], ScoreResult] SamplesAggregatorFn = Callable[[list[ScoreResult], float], ScoreResult] InvocationsAggregatorFn = Callable[[list[PerInvocationResult], float], tuple[Optional[float], EvalStatus]] +ModelsAggregatorFn = Callable[[list[ScoreResult], float, list[float]], ScoreResult] class _MessagesConstructorAdapter: @@ -82,6 +84,16 @@ def aggregate_invocations(self, results, threshold): return self._fn(results, threshold) +class _ModelsAggregatorAdapter: + """Adapts a plain function to ModelsAggregator.""" + + def __init__(self, fn: ModelsAggregatorFn) -> None: + self._fn = fn + + def aggregate_models(self, per_model, threshold, weights): + return self._fn(per_model, threshold, weights) + + def _validate_metric(metric_name: str) -> None: """Raise ValueError if metric_name is not an LLM metric.""" if metric_name not in LLM_METRIC_NAMES: @@ -97,6 +109,7 @@ def __init__(self) -> None: self._response_scorer: dict[str, ResponseScorer] = {} self._samples_aggregator: dict[str, SamplesAggregator] = {} self._invocations_aggregator: dict[str, InvocationsAggregator] = {} + self._models_aggregator: dict[str, ModelsAggregator] = {} self._judge_tools: dict[str, List[Any]] = {} def register_messages_constructor(self, metric_name: str, fn: MessagesConstructorFn) -> None: @@ -115,6 +128,10 @@ def register_invocations_aggregator(self, metric_name: str, fn: InvocationsAggre _validate_metric(metric_name) self._invocations_aggregator[metric_name] = _InvocationsAggregatorAdapter(fn) + def register_models_aggregator(self, metric_name: str, fn: ModelsAggregatorFn) -> None: + _validate_metric(metric_name) + self._models_aggregator[metric_name] = _ModelsAggregatorAdapter(fn) + def register_judge_tools(self, metric_name: str, tools: List[Any]) -> None: """Register tools for the judge LlmAgent (e.g. 
BaseTool, BaseToolSet, or callables).""" _validate_metric(metric_name) @@ -136,6 +153,9 @@ def get_samples_aggregator(self, metric_name: str) -> Optional[SamplesAggregator def get_invocations_aggregator(self, metric_name: str) -> Optional[InvocationsAggregator]: return self._invocations_aggregator.get(metric_name) + def get_models_aggregator(self, metric_name: str) -> Optional[ModelsAggregator]: + return self._models_aggregator.get(metric_name) + def unregister_messages_constructor(self, metric_name: str) -> None: self._messages_constructor.pop(metric_name, None) @@ -148,6 +168,9 @@ def unregister_samples_aggregator(self, metric_name: str) -> None: def unregister_invocations_aggregator(self, metric_name: str) -> None: self._invocations_aggregator.pop(metric_name, None) + def unregister_models_aggregator(self, metric_name: str) -> None: + self._models_aggregator.pop(metric_name, None) + def unregister_judge_tools(self, metric_name: str) -> None: self._judge_tools.pop(metric_name, None) @@ -165,6 +188,7 @@ def _judge_for_metric(eval_metric: EvalMetric) -> LLMJudge: response_scorer=LLM_EVALUATOR_REGISTRY.get_response_scorer(name), samples_aggregator=LLM_EVALUATOR_REGISTRY.get_samples_aggregator(name), invocations_aggregator=LLM_EVALUATOR_REGISTRY.get_invocations_aggregator(name), + models_aggregator=LLM_EVALUATOR_REGISTRY.get_models_aggregator(name), judge_tools=LLM_EVALUATOR_REGISTRY.get_judge_tools(name), ) @@ -172,6 +196,8 @@ def _judge_for_metric(eval_metric: EvalMetric) -> LLMJudge: class LLMFinalResponseEvaluator(Evaluator): """LLM judge for final response (valid/invalid). Metric: llm_final_response.""" + requires_reference = True + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMFinalResponseEvaluator") @@ -190,6 +216,8 @@ async def evaluate_invocations( class LLMRubricResponseEvaluator(Evaluator): """LLM rubric-based response quality. Metric: llm_rubric_response.""" + requires_reference = False + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMRubricResponseEvaluator") @@ -208,6 +236,8 @@ async def evaluate_invocations( class LLMRubricKnowledgeRecallEvaluator(Evaluator): """LLM rubric knowledge recall. 
Metric: llm_rubric_knowledge_recall.""" + requires_reference = False + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMRubricKnowledgeRecallEvaluator") diff --git a/trpc_agent_sdk/evaluation/_llm_judge.py b/trpc_agent_sdk/evaluation/_llm_judge.py index b68ebbe..f9c8998 100644 --- a/trpc_agent_sdk/evaluation/_llm_judge.py +++ b/trpc_agent_sdk/evaluation/_llm_judge.py @@ -7,6 +7,8 @@ from __future__ import annotations +import asyncio +import copy import json import os import uuid @@ -21,10 +23,14 @@ from trpc_agent_sdk.context import create_agent_context from trpc_agent_sdk.context import new_invocation_context_id from trpc_agent_sdk.models import ModelRegistry +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.planners import BuiltInPlanner from trpc_agent_sdk.sessions import InMemorySessionService from trpc_agent_sdk.types import Content from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import HttpOptions from trpc_agent_sdk.types import Part +from trpc_agent_sdk.types import ThinkingConfig from ._eval_case import IntermediateData from ._eval_case import Invocation @@ -33,8 +39,10 @@ from ._eval_metrics import EvalMetric from ._eval_metrics import EvalStatus from ._eval_result import EvaluationResult +from ._eval_result import NamedScoreResult from ._eval_result import PerInvocationResult from ._llm_criterion import LLMJudgeCriterion +from ._llm_criterion import JudgeModelOptions from ._llm_criterion import Rubric from ._llm_criterion import RubricScore from ._llm_criterion import ScoreResult @@ -103,6 +111,18 @@ def aggregate_invocations( ... +class ModelsAggregator(Protocol): + """Aggregates per-model judge ScoreResults (single invocation, multiple judge models) into one ScoreResult.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + ... + + class MajorityVoteSamplesAggregator: """Selects one sample by majority vote on pass/fail; on tie, prefers a failed sample if any.""" @@ -140,6 +160,161 @@ def aggregate_invocations( return (overall, status) +def _format_per_model_reason(per_model: list[ScoreResult], threshold: float) -> str: + """Build a multi-line per-model breakdown string for ScoreResult.reason.""" + lines: list[str] = [] + for i, s in enumerate(per_model): + passed = (s.score or 0.0) >= threshold + snippet = (s.reason or "").replace("\n", " ").strip() + if len(snippet) > 200: + snippet = snippet[:200] + "..." 
+ lines.append(f" model#{i}: score={s.score:.4f} passed={passed} reason={snippet}") + return "\n".join(lines) + + +class AllPassModelsAggregator: + """All models must pass (AND); returned score = min(scores).""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = min(scores) + passed_all = all(s >= threshold for s in scores) + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=all_pass -> {'PASSED' if passed_all else 'FAILED'}" + return ScoreResult(score=overall, reason=reason) + + +class AnyPassModelsAggregator: + """Any model passing is enough (OR); returned score = max(scores).""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = max(scores) + passed_any = any(s >= threshold for s in scores) + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=any_pass -> {'PASSED' if passed_any else 'FAILED'}" + return ScoreResult(score=overall, reason=reason) + + +class MajorityPassModelsAggregator: + """Strict majority must pass (passed*2 > total). Score = passed_count/total.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + passed_count = sum(1 for s in per_model if (s.score or 0.0) >= threshold) + total = len(per_model) + overall = passed_count / total if total else 0.0 + passed_majority = passed_count * 2 > total + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=majority_pass -> " + f"{'PASSED' if passed_majority else 'FAILED'} ({passed_count}/{total})") + return ScoreResult(score=overall, reason=reason) + + +class AverageModelsAggregator: + """Mean of scores.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = sum(scores) / len(scores) + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=avg -> mean={overall:.4f}") + return ScoreResult(score=overall, reason=reason) + + +class WeightedAverageModelsAggregator: + """Weighted mean: sum(w*s)/sum(w). Zero total -> 0.0.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + if len(weights) != len(per_model): + raise ValueError(f"weights length {len(weights)} must equal per_model length {len(per_model)}") + total_w = sum(weights) + if total_w <= 0: + overall = 0.0 + else: + overall = sum(w * (s.score or 0.0) for w, s in zip(weights, per_model)) / total_w + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=weighted_avg -> weighted_mean={overall:.4f} (total_w={total_w})" + return ScoreResult(score=overall, reason=reason) + + +class WeightedMajorityModelsAggregator: + """passed_weight*2 > total_weight (strict). 
Score = passed_weight/total_weight.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + if len(weights) != len(per_model): + raise ValueError(f"weights length {len(weights)} must equal per_model length {len(per_model)}") + total_w = sum(weights) + passed_w = sum(w for w, s in zip(weights, per_model) if (s.score or 0.0) >= threshold) + if total_w <= 0: + overall = 0.0 + passed_majority = False + else: + overall = passed_w / total_w + passed_majority = passed_w * 2 > total_w + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=weighted_majority -> " + f"{'PASSED' if passed_majority else 'FAILED'} " + f"(passed_w={passed_w}, total_w={total_w})") + return ScoreResult(score=overall, reason=reason) + + +_BUILTIN_MODELS_AGGREGATORS: dict[str, type] = { + "all_pass": AllPassModelsAggregator, + "any_pass": AnyPassModelsAggregator, + "majority_pass": MajorityPassModelsAggregator, + "avg": AverageModelsAggregator, + "weighted_avg": WeightedAverageModelsAggregator, + "weighted_majority": WeightedMajorityModelsAggregator, +} + + +def get_builtin_models_aggregator(name: str) -> Optional[ModelsAggregator]: + """Return a built-in ModelsAggregator instance by name, or None if unknown.""" + cls = _BUILTIN_MODELS_AGGREGATORS.get(name) + if cls is None: + return None + return cls() + + def _extract_text_from_content(content: Any) -> str: """Extract plain text from Content parts (concatenate part texts).""" if content is None: @@ -617,13 +792,97 @@ def _expand_env(s: str) -> str: return os.path.expandvars(s) +def _create_judge_model(opts: JudgeModelOptions) -> Any: + """Build the underlying LLM model for one judge option. + + Provider routing: + - provider_name empty or "openai" -> OpenAIModel(...) directly. This + matches the framework's standard pattern for OpenAI-compatible + endpoints (see examples/llmagent/) and ensures http_options.extra_body + (e.g. chat_template_kwargs.enable_thinking used by judge `think` field) + is forwarded to the backend. Routing via "openai/" through + ModelRegistry lands on LiteLLMModel whose current implementation + drops extra_body. + - Any other provider_name -> ModelRegistry.create_model("{provider}/{model}") + which routes to LiteLLMModel for multi-provider support. + """ + provider_name = _expand_env(opts.provider_name or "") + model_name = _expand_env(opts.model_name or "") + base_url = _expand_env(opts.base_url or "") + api_key = _expand_env(opts.api_key or "") + extra = dict(opts.extra_fields or {}) + + if not provider_name or provider_name.lower() == "openai": + # Direct OpenAIModel instantiation bypasses ModelRegistry regex routing, + # so any model_name (e.g. "glm-5.1-w4afp8") works against any + # OpenAI-compatible endpoint. + return OpenAIModel( + model_name=model_name, + api_key=api_key, + base_url=base_url or None, + **extra, + ) + + model_str = f"{provider_name}/{model_name}" + return ModelRegistry.create_model( + model_str, + api_key=api_key, + base_url=base_url or "", + **extra, + ) + + # Default judge generation params when not specified in criterion. 
DEFAULT_JUDGE_MAX_TOKENS = 4096 DEFAULT_JUDGE_TEMPERATURE = 0.8 -def _judge_generation_config(gen: dict[str, Any] | None) -> GenerateContentConfig: - """Build GenerateContentConfig from criterion generation_config; use defaults for missing fields.""" +def _merge_extra_body( + http_options: Optional[HttpOptions], + patch: dict[str, Any], +) -> HttpOptions: + """Deep-merge patch into http_options.extra_body at nested-dict granularity. + + - None http_options -> returns new HttpOptions(extra_body=deepcopy(patch)). + - For top-level keys in patch: if both sides have dict, merge recursively (deep-copying + patch values); otherwise patch value wins. + - Other existing top-level keys in http_options.extra_body are preserved. + """ + base = (http_options.extra_body or {}) if http_options is not None else {} + merged: dict[str, Any] = dict(base) + for key, patch_val in patch.items(): + base_val = merged.get(key) + if isinstance(base_val, dict) and isinstance(patch_val, dict): + new_child = dict(base_val) + for subkey, subval in patch_val.items(): + new_child[subkey] = copy.deepcopy(subval) + merged[key] = new_child + else: + merged[key] = copy.deepcopy(patch_val) + if http_options is None: + return HttpOptions(extra_body=merged) + return http_options.model_copy(update={"extra_body": merged}) + + +def _judge_generation_config( + gen: dict[str, Any] | None, + think: Optional[bool], +) -> tuple[GenerateContentConfig, Optional[ThinkingConfig]]: + """Build GenerateContentConfig from criterion generation_config and resolve thinking config. + + Returns (cfg, effective_thinking_config): + - cfg: GenerateContentConfig WITHOUT thinking_config set (LlmAgent rejects it; + thinking_config must be applied via BuiltInPlanner). + - effective_thinking_config: None means caller should not build a planner; + otherwise caller wraps it in BuiltInPlanner. + + Resolution order: + 1. Parse gen for base fields (max_tokens/temperature/top_p/stop/...). + 2. Parse gen["thinking_config"] dict into a candidate ThinkingConfig (not written to cfg). + 3. Parse gen["http_options"] dict into cfg.http_options (if present). + 4. If `think` is not None, override the candidate ThinkingConfig and deep-merge + chat_template_kwargs.enable_thinking into cfg.http_options (preserving siblings). + """ gen = gen or {} cfg = GenerateContentConfig() cfg.max_output_tokens = (gen.get("max_tokens") or gen.get("max_output_tokens") or DEFAULT_JUDGE_MAX_TOKENS) @@ -638,7 +897,43 @@ def _judge_generation_config(gen: dict[str, Any] | None) -> GenerateContentConfi setattr(cfg, "presence_penalty", gen["presence_penalty"]) if "frequency_penalty" in gen and gen["frequency_penalty"] is not None: setattr(cfg, "frequency_penalty", gen["frequency_penalty"]) - return cfg + + # Parse thinking_config dict from generation_config (candidate; may be overridden by `think`). + effective_thinking_config: Optional[ThinkingConfig] = None + tc_dict = gen.get("thinking_config") + if isinstance(tc_dict, dict): + effective_thinking_config = ThinkingConfig(**tc_dict) + + # Parse http_options dict from generation_config, if any. + http_opts_dict = gen.get("http_options") + if isinstance(http_opts_dict, dict): + cfg.http_options = HttpOptions(**http_opts_dict) + + # `think` field overrides both paths when set. 
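+    # Illustrative merge (hypothetical values): with
+    # generation_config={"http_options": {"extra_body": {"trace_id": "x"}}} and
+    # think=False, the resulting extra_body is
+    # {"trace_id": "x", "chat_template_kwargs": {"enable_thinking": False}};
+    # caller-provided top-level keys survive because _merge_extra_body merges
+    # the chat_template_kwargs patch instead of replacing extra_body.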
+ if think is True: + effective_thinking_config = ThinkingConfig( + include_thoughts=True, + thinking_budget=-1, + ) + cfg.http_options = _merge_extra_body( + cfg.http_options, + {"chat_template_kwargs": { + "enable_thinking": True + }}, + ) + elif think is False: + effective_thinking_config = ThinkingConfig( + include_thoughts=False, + thinking_budget=0, + ) + cfg.http_options = _merge_extra_body( + cfg.http_options, + {"chat_template_kwargs": { + "enable_thinking": False + }}, + ) + + return cfg, effective_thinking_config class _JudgeAgent: @@ -651,6 +946,7 @@ def __init__( system_prompt: str, output_schema: Optional[type[PydanticBaseModel]] = None, tools: Optional[list] = None, + planner: Optional[Any] = None, ) -> None: self._agent = LlmAgent( name="judge", @@ -660,6 +956,7 @@ def __init__( add_name_to_instruction=False, output_schema=output_schema, tools=tools or [], + planner=planner, ) self._session_service = InMemorySessionService() @@ -694,9 +991,16 @@ async def get_response(self, user_message: str) -> str: class LLMJudge: - """Builds a judge agent from eval_metric. - Pluggable: messages constructor, response scorer, samples/invocations aggregators. - Defaults used when not provided. + """Builds judge agent(s) from eval_metric. Supports 1..N judge models with cross-model aggregation. + + Pluggable: messages_constructor, response_scorer, samples_aggregator, invocations_aggregator, + models_aggregator, judge_tools. + + models_aggregator resolution order: + 1) explicit constructor argument (if any) + 2) registry-registered ModelsAggregator for metric_name (resolved by caller, e.g. _judge_for_metric) + 3) criterion.models_aggregator string -> built-in 6 names + 4) fallback: all_pass """ def __init__( @@ -707,32 +1011,36 @@ def __init__( response_scorer: Optional[ResponseScorer] = None, samples_aggregator: Optional[SamplesAggregator] = None, invocations_aggregator: Optional[InvocationsAggregator] = None, + models_aggregator: Optional[ModelsAggregator] = None, judge_tools: Optional[list] = None, ) -> None: if not eval_metric: raise ValueError("LLMJudge requires eval_metric") self._eval_metric = eval_metric criterion = get_llm_criterion_from_metric(eval_metric) - if not criterion or not criterion.judge_model: - raise ValueError("eval_metric.criterion.llmJudge with judge_model is required") + if not criterion: + raise ValueError("eval_metric.criterion.llmJudge is required") + judge_models_list = criterion.get_judge_models() + if not judge_models_list: + raise ValueError("eval_metric.criterion.llmJudge requires either judge_model or judge_models") self._criterion = criterion self._metric_name = eval_metric.metric_name or "" - - opts = criterion.judge_model - provider_name = _expand_env(opts.provider_name or "") - model_name = _expand_env(opts.model_name or "") - base_url = _expand_env(opts.base_url or "") - api_key = _expand_env(opts.api_key or "") - model_str = f"{provider_name or 'openai'}/{model_name}" - extra = dict(opts.extra_fields or {}) - model = ModelRegistry.create_model( - model_str, - api_key=api_key, - base_url=base_url or "", - **extra, - ) - cfg = _judge_generation_config(opts.generation_config) - + self._judge_models: list[JudgeModelOptions] = judge_models_list + self._parallel: bool = bool(criterion.parallel) + + # Resolve models_aggregator: explicit > built-in name lookup > error. 
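+        # Note: registry-registered aggregators are not looked up here; the
+        # _judge_for_metric factory in _llm_evaluator.py resolves
+        # LLM_EVALUATOR_REGISTRY.get_models_aggregator(name) and passes it in
+        # as the explicit `models_aggregator` argument, which takes priority.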
+ resolved_models_agg = models_aggregator + if resolved_models_agg is None: + agg_name = criterion.models_aggregator or "all_pass" + built = get_builtin_models_aggregator(agg_name) + if built is None: + raise ValueError(f"models_aggregator {agg_name!r} is not a built-in name; " + f"register it via LLM_EVALUATOR_REGISTRY.register_models_aggregator " + f"before constructing LLMJudge") + resolved_models_agg = built + self._models_aggregator: ModelsAggregator = resolved_models_agg + + # Pick metric-specific system prompt + user template + output schema (unchanged from before). if self._metric_name == "llm_final_response": system_prompt = FINAL_RESPONSE_PROMPT user_template = ("\n" @@ -746,6 +1054,7 @@ def __init__( "\n" "{expected_response}\n" "") + output_schema: Optional[type[PydanticBaseModel]] = FinalResponseOutput elif self._metric_name == "llm_rubric_response": system_prompt = _rubric_system(RUBRIC_RESPONSE_PROMPT) user_template = ("\n" @@ -763,6 +1072,7 @@ def __init__( "\n" "{rubrics}\n" "") + output_schema = RubricJudgeOutput elif self._metric_name == "llm_rubric_knowledge_recall": system_prompt = _rubric_system(RUBRIC_KNOWLEDGE_RECALL_PROMPT) user_template = ("\n" @@ -778,15 +1088,25 @@ def __init__( "\n" "{rubrics}\n" "") + output_schema = RubricJudgeOutput else: raise ValueError(f"Unsupported metric_name for LLMJudge: {self._metric_name!r}") - if self._metric_name == "llm_final_response": - output_schema: Optional[type[PydanticBaseModel]] = FinalResponseOutput - else: - output_schema = RubricJudgeOutput - - self._agent = _JudgeAgent(model, cfg, system_prompt, output_schema=output_schema, tools=judge_tools) + # Build one _JudgeAgent per judge model option, in order. + self._judge_agents: list[_JudgeAgent] = [] + for opts in judge_models_list: + model = _create_judge_model(opts) + cfg, effective_tc = _judge_generation_config(opts.generation_config, opts.think) + planner = (BuiltInPlanner(thinking_config=effective_tc) if effective_tc is not None else None) + self._judge_agents.append( + _JudgeAgent( + model, + cfg, + system_prompt, + output_schema=output_schema, + tools=judge_tools, + planner=planner, + )) self._messages_constructor = messages_constructor or DefaultMessagesConstructor(user_template) self._response_scorer = response_scorer or DefaultResponseScorer() @@ -794,25 +1114,69 @@ def __init__( self._invocations_aggregator = invocations_aggregator or AverageInvocationsAggregator() def get_num_samples(self) -> int: - """Return the number of judge samples to run per invocation (e.g. for majority vote).""" + """Return num_samples for the *first* judge model (legacy single-model API). + + Multi-model judges may use different num_samples per model; callers that need + per-model sample counts should iterate criterion.get_judge_models() directly. + """ return self._criterion.get_num_samples() + async def _run_one_judge( + self, + agent_index: int, + opts: JudgeModelOptions, + user_message: str, + threshold: float, + ) -> "tuple[NamedScoreResult, ScoreResult, bool]": + """Run num_samples calls for one judge model, then SamplesAggregator. + + Returns (named_score, raw_score_result, had_exception). On exception, returns + a soft-failure NamedScoreResult with passed=False, score=0.0, reason=str(exc), + and had_exception=True. 
+ """ + agent = self._judge_agents[agent_index] + n = opts.get_num_samples() + try: + samples: list[ScoreResult] = [] + for _ in range(n): + response_text = await agent.get_response(user_message) + samples.append(self._response_scorer.parse_response(response_text, self._metric_name)) + chosen = self._samples_aggregator.aggregate_samples(samples, threshold) + except Exception as exc: + named = NamedScoreResult( + model_name=opts.model_name or "", + provider_name=opts.provider_name or "", + score=0.0, + reason=str(exc), + rubric_scores=[], + passed=False, + ) + return named, ScoreResult(score=0.0, reason=str(exc)), True + passed = (chosen.score or 0.0) >= threshold + named = NamedScoreResult( + model_name=opts.model_name or "", + provider_name=opts.provider_name or "", + score=chosen.score or 0.0, + reason=chosen.reason or "", + rubric_scores=list(chosen.rubric_scores or []), + passed=passed, + ) + return named, chosen, False + async def evaluate( self, actual_invocations: list[Invocation], expected_invocations: Optional[list[Invocation]], ) -> EvaluationResult: - """Run the judge for each invocation, aggregate samples then invocations, and return EvaluationResult.""" + """Run multi-model judge per invocation, aggregate per-model + per-invocation.""" if expected_invocations is None: expected_invocations = [] if len(actual_invocations) != len(expected_invocations): raise ValueError(f"actual_invocations ({len(actual_invocations)}) and " f"expected_invocations ({len(expected_invocations)}) length mismatch") - num_samples = self.get_num_samples() - if num_samples <= 0: - raise ValueError("num_samples must be greater than 0") threshold = self._eval_metric.threshold + weights = [m.weight for m in self._judge_models] per_invocation_results: list[PerInvocationResult] = [] for i in range(len(actual_invocations)): @@ -826,22 +1190,56 @@ async def evaluate( self._metric_name, ) - samples: list[ScoreResult] = [] - for _ in range(num_samples): - response_text = await self._agent.get_response(user_message) - samples.append(self._response_scorer.parse_response(response_text, self._metric_name)) + # Step 1: each model runs its own samples + SamplesAggregator -> (named, raw, had_exception) + if self._parallel and len(self._judge_models) > 1: + tasks = [ + self._run_one_judge(idx, opts, user_message, threshold) + for idx, opts in enumerate(self._judge_models) + ] + triples = await asyncio.gather(*tasks) + else: + triples = [] + for idx, opts in enumerate(self._judge_models): + triples.append(await self._run_one_judge(idx, opts, user_message, threshold)) + + named_results: list[NamedScoreResult] = [t[0] for t in triples] + score_results: list[ScoreResult] = [t[1] for t in triples] + exceptions: list[bool] = [t[2] for t in triples] + + # Step 2: if every model raised, mark NOT_EVALUATED. 
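+            # A model that raised has already contributed a passed=False, score=0.0
+            # soft-failure entry in step 1 and still participates in aggregation;
+            # only the case where every model raised short-circuits to NOT_EVALUATED.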
+ all_exception = all(exceptions) and len(exceptions) > 0 + + if all_exception: + per_invocation_results.append( + PerInvocationResult( + actual_invocation=actual, + expected_invocation=expected, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + reason="all judge models failed: " + "; ".join(f"{n.model_name}={n.reason}" + for n in named_results), + rubric_scores=None, + per_model_scores=named_results, + )) + continue - chosen = self._samples_aggregator.aggregate_samples(samples, threshold) - status = EvalStatus.PASSED if (chosen.score or 0) >= threshold else EvalStatus.FAILED - rubric_scores = (list(chosen.rubric_scores) if chosen.rubric_scores else None) + # Step 3: cross-model aggregation -> single ScoreResult + invocation_score = self._models_aggregator.aggregate_models( + score_results, + threshold, + weights, + ) + status = (EvalStatus.PASSED if (invocation_score.score or 0.0) >= threshold else EvalStatus.FAILED) + rubric_scores = (list(invocation_score.rubric_scores) if invocation_score.rubric_scores else None) per_invocation_results.append( PerInvocationResult( actual_invocation=actual, expected_invocation=expected, - score=chosen.score, + score=invocation_score.score, eval_status=status, - reason=chosen.reason or None, + reason=invocation_score.reason or None, rubric_scores=rubric_scores, + per_model_scores=named_results, )) overall_score, overall_status = self._invocations_aggregator.aggregate_invocations( diff --git a/trpc_agent_sdk/evaluation/_local_eval_service.py b/trpc_agent_sdk/evaluation/_local_eval_service.py index 5b5afef..2ea1a8d 100644 --- a/trpc_agent_sdk/evaluation/_local_eval_service.py +++ b/trpc_agent_sdk/evaluation/_local_eval_service.py @@ -94,7 +94,7 @@ class LocalEvalService(BaseEvalService): def __init__( self, - root_agent: BaseAgent, + root_agent: Optional[BaseAgent], eval_sets_manager: EvalSetsManager, evaluator_registry: Optional[EvaluatorRegistry] = None, session_service: Optional[BaseSessionService] = None, @@ -109,7 +109,9 @@ def __init__( """Initialize the local evaluation service. Args: - root_agent: The agent to evaluate + root_agent: The agent to evaluate. May be ``None`` only when every + eval case to be processed uses ``eval_mode='trace'``; standard + or mixed modes require a concrete agent. eval_sets_manager: Manager for eval sets storage evaluator_registry: Registry of metric evaluators session_service: Session service for maintaining state @@ -242,6 +244,13 @@ async def evaluate( Yields: EvalCaseResult for each evaluated inference """ + # Fail-fast: validate every (eval_case × metric) pair is semantically + # feasible before running any evaluator. See Bug 3.1 design doc. + self._validate_metric_compat( + inference_results=evaluate_request.inference_results, + evaluate_config=evaluate_request.evaluate_config, + ) + run_ctx: dict[str, Any] = {} start_time = time.monotonic() eval_case_results_list: list[EvalCaseResult] = [] @@ -483,6 +492,93 @@ async def _evaluate_metric( expected_invocations=expected_invocations, ) + def _validate_metric_compat( + self, + inference_results: list[InferenceResult], + evaluate_config: EvaluateConfig, + ) -> None: + """Fail-fast check every (eval_case × metric) pair for semantic feasibility. + + Reference-based metrics (``requires_reference=True``) need a real + expected answer in ``eval_case.conversation``. 
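+        Whether a metric needs a reference is read from the evaluator class's
+        ``requires_reference`` attribute (treated as ``True`` when unset).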
Scenario-3 trace cases + (only ``actual_conversation`` set) have placeholder-only expected + invocations, which would produce silent 0.0 (rouge/final_response) or + phantom 1.0 (tool_trajectory with subset_matching) results. + + This method aggregates ALL incompatible pairs across the request and + raises a single ValueError listing each one, so the user can fix them + in one pass. See Bug 3.1 of the strict-compat design doc. + """ + incompatible: list[tuple[str, str, str]] = [] + for ir in inference_results: + eval_case = self._eval_sets_manager.get_eval_case( + app_name=ir.app_name, + eval_set_id=ir.eval_set_id, + eval_case_id=ir.eval_case_id, + ) + if eval_case is None: + # The per-case evaluate path will raise a clear error later; + # skip here to avoid duplicate / misleading messages. + continue + has_reference = self._case_has_reference(eval_case) + for eval_metric in evaluate_config.eval_metrics: + try: + evaluator_cls = self._evaluator_registry.get_evaluator_class(eval_metric) + except ValueError: + # Unknown metric — let the regular evaluate loop surface the + # registry error so the message stays consistent. + continue + if getattr(evaluator_cls, "requires_reference", True) and not has_reference: + incompatible.append(( + ir.eval_case_id, + eval_metric.metric_name, + "requires reference answer (set in eval_case.conversation)", + )) + + if incompatible: + raise ValueError(self._format_compat_error(incompatible)) + + @staticmethod + def _case_has_reference(eval_case: EvalCase) -> bool: + """True iff ``expected_invocations`` will carry a real reference answer. + + - Non-trace (``eval_mode is None``): ``conversation`` IS the expected → True. + - Scenario 1 (trace + both conversation and actual_conversation): True. + - Scenario 3 (trace + only actual_conversation): expected is a placeholder + built by ``_trace_expecteds_for_eval`` → False. + """ + if eval_case.eval_mode != EvalModeTrace: + return True + return bool(eval_case.conversation and eval_case.actual_conversation) + + @staticmethod + def _format_compat_error(incompatible: list[tuple[str, str, str]]) -> str: + """Aggregate incompatible pairs into a single actionable error message. + + Groups by eval_case_id for readability; always closes with a fix guide. + """ + by_case: dict[str, list[tuple[str, str]]] = {} + for eval_id, metric_name, reason in incompatible: + by_case.setdefault(eval_id, []).append((metric_name, reason)) + + lines = ["evaluator config incompatible with eval_set:", ""] + for eval_id in sorted(by_case): + lines.append(f" eval_case='{eval_id}' (scenario: trace-without-reference):") + for metric_name, reason in by_case[eval_id]: + lines.append(f" - metric '{metric_name}' {reason}") + lines.extend([ + "", + "To fix, choose one:", + " (a) Remove the incompatible metrics from your EvaluateConfig.", + " (b) Use reference-free metrics only: llm_rubric_response, " + "llm_rubric_knowledge_recall.", + " (c) Upgrade eval_cases to scenario-1 by providing BOTH `conversation` " + "(expected) and `actual_conversation` (actual) fields.", + "", + f"Incompatible (metric_name, eval_id) pairs: {len(incompatible)}", + ]) + return "\n".join(lines) + def _generate_final_eval_status(self, overall_eval_metric_results: list[EvalMetricResult]) -> EvalStatus: """Determine final evaluation status from all metrics. 
@@ -539,6 +635,10 @@ async def _perform_inference_single_eval_item( raise ValueError( f"inference eval case (eval_case_id={eval_case.eval_id}, session_id={session_id}): " "actual_conversation is only supported in trace mode") + if root_agent is None: + raise ValueError(f"inference eval case (eval_case_id={eval_case.eval_id}, " + f"session_id={session_id}): a root_agent is required for " + f"standard (non-trace) eval_mode; got root_agent=None") inferences = await self._generate_inferences_from_agent( agent=root_agent, eval_case=eval_case, diff --git a/trpc_agent_sdk/evaluation/_rouge_evaluator.py b/trpc_agent_sdk/evaluation/_rouge_evaluator.py index a6b7ff3..48c720f 100644 --- a/trpc_agent_sdk/evaluation/_rouge_evaluator.py +++ b/trpc_agent_sdk/evaluation/_rouge_evaluator.py @@ -51,6 +51,8 @@ class RougeEvaluator(Evaluator): Score range: [0, 1], where 1 means perfect match. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None, diff --git a/trpc_agent_sdk/evaluation/_trajectory_evaluator.py b/trpc_agent_sdk/evaluation/_trajectory_evaluator.py index e776b6f..82da309 100644 --- a/trpc_agent_sdk/evaluation/_trajectory_evaluator.py +++ b/trpc_agent_sdk/evaluation/_trajectory_evaluator.py @@ -55,6 +55,8 @@ class TrajectoryEvaluator(Evaluator): Without: strict count, order, name and arguments match. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None,