From 28d7d35bb3519b14168b7c12fe6f112febadc058 Mon Sep 17 00:00:00 2001 From: ricknie Date: Wed, 6 May 2026 16:15:34 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20eval=E6=A8=A1=E5=9D=97=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=B8=8D=E5=90=8C=E5=A4=A7=E6=A8=A1=E5=9E=8B=E8=AF=84?= =?UTF-8?q?=E4=BC=B0=E5=90=8C=E4=B8=80metric?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TAPD: --story=134052565 --- docs/mkdocs/en/evaluation.md | 133 ++++- docs/mkdocs/zh/evaluation.md | 135 ++++- tests/evaluation/test_eval_result.py | 80 +++ tests/evaluation/test_llm_criterion.py | 162 +++++- .../evaluation/test_llm_evaluator_registry.py | 43 ++ .../test_llm_judge_models_aggregator.py | 198 +++++++ .../evaluation/test_llm_judge_multi_model.py | 354 +++++++++++++ tests/evaluation/test_llm_judge_think.py | 393 ++++++++++++++ trpc_agent_sdk/evaluation/__init__.py | 28 + trpc_agent_sdk/evaluation/_agent_evaluator.py | 171 +++++- trpc_agent_sdk/evaluation/_eval_case.py | 22 +- trpc_agent_sdk/evaluation/_eval_result.py | 28 + trpc_agent_sdk/evaluation/_evaluator_base.py | 17 + .../evaluation/_evaluator_registry.py | 13 + .../evaluation/_final_response_evaluator.py | 2 + trpc_agent_sdk/evaluation/_llm_criterion.py | 77 +++ trpc_agent_sdk/evaluation/_llm_evaluator.py | 30 ++ trpc_agent_sdk/evaluation/_llm_judge.py | 486 ++++++++++++++++-- .../evaluation/_local_eval_service.py | 104 +++- trpc_agent_sdk/evaluation/_rouge_evaluator.py | 2 + .../evaluation/_trajectory_evaluator.py | 2 + 21 files changed, 2398 insertions(+), 82 deletions(-) create mode 100644 tests/evaluation/test_llm_judge_models_aggregator.py create mode 100644 tests/evaluation/test_llm_judge_multi_model.py create mode 100644 tests/evaluation/test_llm_judge_think.py diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index 0245ced..ca51a85 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -425,6 +425,8 @@ The eval configuration describes "how to judge." This section teaches you how to `test_config.json` must be placed in the **same directory** as the eval set file (`.evalset.json` / `.test.json`); the framework loads it automatically. +> **Advanced**: If you want **multiple eval sets to share a single configuration** (e.g., centralizing all metric definitions in one JSON), pass `eval_metrics_file_path_or_dir` at call time to bypass the same-directory convention. See [Shared Configuration: `eval_metrics_file_path_or_dir`](#shared-configuration-eval_metrics_file_path_or_dir). + #### Structure Definition **EvalConfig** (parsed from `test_config.json`) @@ -718,7 +720,10 @@ Compare using text "contains" with case-insensitivity (common when the final res | Field | Type | Description | | --- | --- | --- | -| judge_model | object | Judge model configuration (JudgeModelOptions); required | +| judge_model | object | Judge model configuration (JudgeModelOptions); required when `judge_models` is not set | +| judge_models | array | Multi-model judge list (JudgeModelOptions items); mutually exclusive with `judge_model`. Cross-model results are combined by `models_aggregator` | +| models_aggregator | string | Cross-model aggregation strategy. Built-in: `all_pass` (default) / `any_pass` / `majority_pass` / `avg` / `weighted_avg` / `weighted_majority`. 
Custom names must be registered via `LLM_EVALUATOR_REGISTRY.register_models_aggregator` before evaluation | +| parallel | boolean | Whether to run the multiple judge models concurrently; default `true` | | rubrics | array | Rubric list; required for llm_rubric_response and llm_rubric_knowledge_recall | | knowledge_tool_names | array | List of knowledge retrieval tool names; used by llm_rubric_knowledge_recall, default `["knowledge_search"]` | @@ -730,7 +735,9 @@ Compare using text "contains" with case-insensitivity (common when the final res | api_key | string | API key | | base_url | string | Optional, custom endpoint | | num_samples | number | Number of judge samples per turn; default 1 | -| generation_config | object | Generation parameters (max_tokens, temperature, etc.) | +| weight | number | Per-model weight used by `weighted_avg` / `weighted_majority` aggregators; default 1.0 | +| think | boolean | Controls the judge model's thinking mode. `false`: disable thinking (sets both `thinking_config.thinking_budget=0` and `chat_template_kwargs.enable_thinking=false`). `true`: enable thinking with automatic budget (`include_thoughts=true`). Unset (default): keep the model default. Recommended `false` for judge models to save tokens and latency | +| generation_config | object | Generation parameters (max_tokens, temperature, etc.; may also explicitly set `thinking_config` / `http_options`; the `think` field overrides them) | **Rubric** (items in the rubrics array) @@ -812,6 +819,71 @@ LLM response quality with rubrics (llm_rubric_response or llm_rubric_knowledge_r It is recommended to use environment variable placeholders for `api_key` and `base_url` (e.g., `${TRPC_AGENT_API_KEY}`), which are replaced by the execution environment, to avoid writing plaintext in configuration files. +**Multi-model judge (cross-model aggregation)** + +A single LLM-judge metric may use multiple judge models simultaneously and combine their verdicts via `models_aggregator`. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). + +Built-in aggregators: + +| Name | Pass rule | Overall score | +| --- | --- | --- | +| `all_pass` (default) | all models pass | min of per-model scores | +| `any_pass` | any model passes | max of per-model scores | +| `majority_pass` | strict majority passes (`passed*2 > total`) | `passed_count / total` | +| `avg` | mean ≥ threshold | mean of per-model scores | +| `weighted_avg` | weighted mean ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | + +If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. 
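+
+Per-model verdicts are exposed on the result object. The snippet below is an illustrative sketch only, assuming `result` is the `EvaluationResult` returned by the evaluator; the field names follow `PerInvocationResult` / `NamedScoreResult` as described above:
+
+```python
+# Illustrative only: read the per-model breakdown after a multi-model judge run.
+# `result` is assumed to be the EvaluationResult returned by the LLM-judge evaluator.
+for per in result.per_invocation_results:
+    if per.per_model_scores is None:  # single judge_model runs carry no breakdown
+        continue
+    for named in per.per_model_scores:
+        print(named.model_name, named.passed, named.score, named.reason)
+```
+
+A criterion that runs two weighted judge models and aggregates them with `weighted_avg`:
+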
+ +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +`parallel` controls how multiple judge models are executed: `true` (default) calls all models concurrently, with latency bounded by the slowest model; `false` calls them sequentially in the declared order. Only takes effect when `judge_models` contains more than one model. + +If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is a structured JSON, thinking traces add no value to the final verdict, and disabling thinking significantly reduces token cost and latency. Each judge model has its own independent `think` flag. + +Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name written in the criterion: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + # per_model: list[ScoreResult]; weights: list[float] + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Custom Criteria To fully customize the "whether it matches" logic in code, you can register a matching function with `CRITERION_REGISTRY` before running the evaluation. Supported types for registration are `TOOL_TRAJECTORY` and `FINAL_RESPONSE`; once registered, comparisons of that type will invoke your provided function `(actual, expected) -> bool`, bypassing the built-in criteria from the configuration file. @@ -1764,6 +1836,16 @@ In default mode, the eval service actually calls the Agent for inference. If you Replaying existing conversations, offline evaluation, or avoiding repeated Agent and model calls when debugging evaluation flows. +**`agent_module` is optional** + +`agent_module` tells the framework where to load the Agent from, so it can call the Agent for inference during evaluation. Trace mode no longer calls the Agent, so when **every case in the eval set is in trace mode**, `AgentEvaluator.evaluate()` / `get_executer()` no longer needs `agent_module` and you can simply omit it: + +```python +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir=trace_only_eval_set_path, +) +``` + **Example**: A Trace mode case in the eval set ```json @@ -2160,6 +2242,53 @@ async def test_evaluate_with_custom_runner(): For the complete example, see [examples/evaluation/custom_runner/](../../../examples/evaluation/custom_runner/). +#### Shared Configuration (eval_metrics_file_path_or_dir) + +By default, every eval set needs a `test_config.json` placed in its **own directory**, which the framework loads automatically. When multiple eval sets need to use the same metrics and thresholds, copying `test_config.json` into every directory is redundant and prone to drift. You can extract the config into a single shared location and point to it via `eval_metrics_file_path_or_dir` at call time. 
The framework will then **ignore the same-directory convention** and apply this shared config to every eval set. + +**Comparison**: in the default layout each eval set needs its own `test_config.json`; in the shared layout there is only one. + +``` +## Default (same-directory) ## Shared (eval_metrics_file_path_or_dir) +project/ project/ +└── eval_data/ ├── shared_metrics.json ← shared config + ├── weather/ └── eval_data/ + │ ├── weather.evalset.json ├── weather/weather.evalset.json + │ └── test_config.json ├── booking/booking.evalset.json + └── booking/ └── search/search.evalset.json + ├── booking.evalset.json + └── test_config.json +``` + +**Configuration** + +Pass `eval_metrics_file_path_or_dir` to `AgentEvaluator.evaluate()` / `get_executer()`: + +- A **file path** (`.json`): loaded directly as the shared configuration; +- A **directory path**: the framework looks up `*.json` in that directory **non-recursively**, and exactly one must be present; otherwise it raises `FileNotFoundError` (zero matches) or `ValueError` (more than one); +- Omitted or `None`: keep the default behavior—load `test_config.json` from each eval set's own directory. + +**Applicable Scenarios** + +Multiple eval sets sharing the same metrics and thresholds; switching thresholds per environment (dev / staging / prod) in CI; eval sets generated by other tools (WebUI, log replayers, etc.) where maintaining a per-directory `test_config.json` is inconvenient. + +**Example**: point all eval sets in the right-hand layout above to `shared_metrics.json` + +```python +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + +@pytest.mark.asyncio +async def test_with_shared_metrics(): + project_dir = os.path.dirname(os.path.abspath(__file__)) + await AgentEvaluator.evaluate( + agent_module="agent", + eval_dataset_file_path_or_dir=os.path.join(project_dir, "eval_data"), + eval_metrics_file_path_or_dir=os.path.join(project_dir, "shared_metrics.json"), + ) +``` + ## Using WebUI for Agent Evaluation diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 07f7fba..564e0e0 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -305,7 +305,7 @@ pytest test_quickstart.py -v --tb=short -s AgentEvaluator 是整个评测流程的入口和编排者: -1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent。 +1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent(若整集为 [Trace 模式](#trace-模式),此步可省略)。 2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager,创建 LocalEvalService(依赖该 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。默认使用 StaticUserSimulator,按 conversation 的 user_content 驱动推理。可选注入 LocalEvalSetResultsManager 将运行结果写入目录。 3. **推理阶段**:评估服务按 EvalSet 中的用例与 conversation 驱动 Runner 推理,得到实际 Invocation 列表(实际工具调用、实际回复)。 4. 
**打分阶段**:评估服务根据 EvalConfig 中的 EvalMetric 列表,从 EvaluatorRegistry 获取各评估器,对实际与预期逐项打分并汇总为 EvalCaseResult。 @@ -422,6 +422,8 @@ Trace 模式的配置详见[高级功能 - Trace 模式](#trace-模式)。 `test_config.json` 必须放在评测集文件(`.evalset.json` / `.test.json`)的**同目录**下,框架会自动加载。 +> **进阶**:若希望**多个评测集共用同一份配置**(例如把所有指标定义集中到一个 JSON),可在调用时传入 `eval_metrics_file_path_or_dir`,跳过同目录约定。详见[共享配置:`eval_metrics_file_path_or_dir`](#共享配置eval_metrics_file_path_or_dir)。 + #### 结构定义 **EvalConfig**(由 `test_config.json` 解析) @@ -715,7 +717,10 @@ Criterion 定义了"怎样算匹配"——实际输出和预期输出之间用 | 字段 | 类型 | 说明 | | --- | --- | --- | -| judge_model | object | 评判模型配置(JudgeModelOptions);必填 | +| judge_model | object | 评判模型配置(JudgeModelOptions);未设置 `judge_models` 时必填 | +| judge_models | array | 多裁判模型列表(JudgeModelOptions 项),与 `judge_model` 互斥;跨模型结果通过 `models_aggregator` 聚合 | +| models_aggregator | string | 跨模型聚合策略。内置:`all_pass`(默认)/ `any_pass` / `majority_pass` / `avg` / `weighted_avg` / `weighted_majority`。自定义名称须在评估前通过 `LLM_EVALUATOR_REGISTRY.register_models_aggregator` 注册 | +| parallel | boolean | 多裁判模型是否并发执行;默认 `true` | | rubrics | array | Rubric 列表;llm_rubric_response 与 llm_rubric_knowledge_recall 需要 | | knowledge_tool_names | array | 知识检索工具名列表;llm_rubric_knowledge_recall 使用,默认`["knowledge_search"]` | @@ -727,7 +732,9 @@ Criterion 定义了"怎样算匹配"——实际输出和预期输出之间用 | api_key | string | API 密钥 | | base_url | string | 可选,自定义端点 | | num_samples | number | 每轮评判采样数;默认 1 | -| generation_config | object | 生成参数(max_tokens、temperature 等) | +| weight | number | 单模型权重,供 `weighted_avg` / `weighted_majority` 聚合器使用;默认 1.0 | +| think | boolean | 控制裁判模型的思考模式:`false` 关闭思考(同时设 `thinking_config.thinking_budget=0` 与 `chat_template_kwargs.enable_thinking=false`);`true` 显式开启(`include_thoughts=true`,自动预算);不设(默认)则保持模型默认。建议 judge 模型设 `false` 节省 token 与延时 | +| generation_config | object | 生成参数(max_tokens、temperature 等;也可显式写 `thinking_config` / `http_options`,`think` 字段优先) | **Rubric**(rubrics 数组项) @@ -809,6 +816,71 @@ LLM 最终响应评判(仅需 judge_model): 建议 `api_key`、`base_url` 用环境变量占位(如 `${TRPC_AGENT_API_KEY}`),由执行环境替换,避免明文写入配置文件。 +**多裁判模型(跨模型聚合)** + +同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 + +内置聚合器: + +| 名称 | 通过规则 | 总分 | +| --- | --- | --- | +| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | +| `any_pass` | 任一模型通过 | 各模型得分的最大值 | +| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | +| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | +| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | + +若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,耗时取决于最慢的模型;`false` 按声明顺序串行调用。仅在 `judge_models` 有多个模型时生效。 + +若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。每个裁判模型的 `think` 独立设置。 + +也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: + +```python +from trpc_agent_sdk.evaluation import 
LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + # per_model: list[ScoreResult];weights: list[float] + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### 自定义准则 若要在代码里完全自定义"是否匹配"的逻辑,可在评估运行前向 `CRITERION_REGISTRY` 注册一个匹配函数。支持注册的类型为 `TOOL_TRAJECTORY`、`FINAL_RESPONSE`;注册后,该类型在比较时会调用你提供的函数 `(actual, expected) -> bool`,不再使用配置文件中的内置准则。 @@ -1761,6 +1833,16 @@ async def test_pass_at_k(): 回放已有对话、离线评估、或调试评估流程时避免重复调用 Agent 与模型。 +**`agent_module` 可省略** + +`agent_module` 用来告诉框架去哪里加载 Agent,以便评测时调用它做推理。Trace 模式不再调用 Agent,因此当**评测集里所有用例都是 trace 模式**时,`AgentEvaluator.evaluate()` / `get_executer()` 不再需要 `agent_module`,可以直接省略: + +```python +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir=trace_only_eval_set_path, +) +``` + **示例**:evalset 中一个 Trace 模式用例 ```json @@ -2157,6 +2239,53 @@ async def test_evaluate_with_custom_runner(): 完整示例见 [examples/evaluation/custom_runner/](../../../examples/evaluation/custom_runner/)。 +#### 共享配置(eval_metrics_file_path_or_dir) + +默认情况下,每个评测集都需要在**同目录**放一份 `test_config.json`,框架按目录就近加载。当多个评测集要使用同一套指标与阈值时,逐个目录复制 `test_config.json` 既冗余也容易漂移。此时可以把配置抽到一处共享,调用时通过 `eval_metrics_file_path_or_dir` 指定,框架将**忽略同目录约定**,让所有评测集都使用这份共享配置。 + +**对比**:默认布局每个评测集旁都要带一份 `test_config.json`;共享布局只保留一份。 + +``` +## 默认(同目录约定) ## 共享(eval_metrics_file_path_or_dir) +project/ project/ +└── eval_data/ ├── shared_metrics.json ← 共享配置 + ├── weather/ └── eval_data/ + │ ├── weather.evalset.json ├── weather/weather.evalset.json + │ └── test_config.json ├── booking/booking.evalset.json + └── booking/ └── search/search.evalset.json + ├── booking.evalset.json + └── test_config.json +``` + +**配置方式** + +在 `AgentEvaluator.evaluate()` / `get_executer()` 中传入 `eval_metrics_file_path_or_dir`: + +- 传**文件路径**(`.json`):直接作为共享配置加载; +- 传**目录路径**:在该目录下**非递归**查找 `*.json`,且必须恰好有一份,否则抛 `FileNotFoundError`(0 份)或 `ValueError`(多于 1 份); +- 不传或传 `None`:保持默认,按各评测集同目录的 `test_config.json` 加载。 + +**适用场景** + +多个评测集共用同一套指标与阈值;CI 中按环境(dev / staging / prod)切换不同阈值;评测集由其他工具(WebUI、日志回放器等)生成、不便逐个维护 `test_config.json`。 + +**示例**:将上图右侧布局的所有评测集统一指向 `shared_metrics.json` + +```python +import os +import pytest +from trpc_agent_sdk.evaluation import AgentEvaluator + +@pytest.mark.asyncio +async def test_with_shared_metrics(): + project_dir = os.path.dirname(os.path.abspath(__file__)) + await AgentEvaluator.evaluate( + agent_module="agent", + eval_dataset_file_path_or_dir=os.path.join(project_dir, "eval_data"), + eval_metrics_file_path_or_dir=os.path.join(project_dir, "shared_metrics.json"), + ) +``` + ## 使用 WebUI 进行 Agent 评测 diff --git a/tests/evaluation/test_eval_result.py b/tests/evaluation/test_eval_result.py index 6d79b48..429de39 100644 --- a/tests/evaluation/test_eval_result.py +++ b/tests/evaluation/test_eval_result.py @@ -135,3 +135,83 @@ def test_evaluate_result_empty(self): """Test empty EvaluateResult.""" r = EvaluateResult() assert r.results_by_eval_set_id == {} + + +class TestNamedScoreResult: + """Test suite for NamedScoreResult.""" + + def test_minimal_construction(self): + """Test NamedScoreResult with minimal fields uses defaults.""" + from trpc_agent_sdk.evaluation import NamedScoreResult + + n = NamedScoreResult(model_name="glm-4.7", score=1.0, passed=True) + assert n.model_name == "glm-4.7" + assert n.provider_name == "" + assert n.score == 1.0 + assert n.reason == "" + assert n.rubric_scores == [] + assert 
n.passed is True + + def test_full_construction_and_serialization(self): + """Test all fields round-trip through JSON serialization.""" + from trpc_agent_sdk.evaluation import NamedScoreResult + from trpc_agent_sdk.evaluation import RubricScore + + n = NamedScoreResult( + model_name="gpt-4o", + provider_name="openai", + score=0.5, + reason="half passed", + rubric_scores=[RubricScore(id="r1", reason="ok", score=1.0)], + passed=False, + ) + data = n.model_dump() + assert data["model_name"] == "gpt-4o" + assert data["provider_name"] == "openai" + assert data["passed"] is False + assert data["rubric_scores"][0]["id"] == "r1" + + +class TestPerInvocationResultPerModelScores: + """Test suite for PerInvocationResult.per_model_scores backward compatibility.""" + + def test_default_is_none(self): + """Test per_model_scores defaults to None for old code paths.""" + from unittest.mock import Mock + + from trpc_agent_sdk.evaluation import EvalStatus + from trpc_agent_sdk.evaluation import Invocation + from trpc_agent_sdk.evaluation import PerInvocationResult + + inv = Mock(spec=Invocation) + r = PerInvocationResult( + actual_invocation=inv, + score=1.0, + eval_status=EvalStatus.PASSED, + ) + assert r.per_model_scores is None + + def test_per_model_scores_populated(self): + """Test per_model_scores accepts list of NamedScoreResult.""" + from unittest.mock import Mock + + from trpc_agent_sdk.evaluation import EvalStatus + from trpc_agent_sdk.evaluation import Invocation + from trpc_agent_sdk.evaluation import NamedScoreResult + from trpc_agent_sdk.evaluation import PerInvocationResult + + inv = Mock(spec=Invocation) + per_model = [ + NamedScoreResult(model_name="m1", score=1.0, passed=True), + NamedScoreResult(model_name="m2", score=0.0, passed=False), + ] + r = PerInvocationResult( + actual_invocation=inv, + score=0.0, + eval_status=EvalStatus.FAILED, + per_model_scores=per_model, + ) + assert r.per_model_scores is not None + assert len(r.per_model_scores) == 2 + assert r.per_model_scores[0].model_name == "m1" + assert r.per_model_scores[1].passed is False diff --git a/tests/evaluation/test_llm_criterion.py b/tests/evaluation/test_llm_criterion.py index 34a80f9..e985866 100644 --- a/tests/evaluation/test_llm_criterion.py +++ b/tests/evaluation/test_llm_criterion.py @@ -52,7 +52,10 @@ def test_strips_api_key_camel_case(self): """Test apiKey is stripped when key is judgeModel.""" c = { "llmJudge": { - "judgeModel": {"model_name": "glm-4", "apiKey": "secret"}, + "judgeModel": { + "model_name": "glm-4", + "apiKey": "secret" + }, }, } out = sanitize_criterion_for_export(c) @@ -114,9 +117,18 @@ def test_from_dict_empty(self): def test_from_dict_snake_case(self): """Test from_dict with snake_case keys.""" d = { - "judge_model": {"model_name": "glm-4", "num_samples": 2}, + "judge_model": { + "model_name": "glm-4", + "num_samples": 2 + }, "rubrics": [ - {"id": "1", "content": {"text": "Must be relevant."}, "description": "Relevance"}, + { + "id": "1", + "content": { + "text": "Must be relevant." 
+ }, + "description": "Relevance" + }, ], } c = LLMJudgeCriterion.from_dict(d) @@ -159,7 +171,9 @@ def test_metric_with_llm_judge(self): threshold=1.0, criterion={ "llm_judge": { - "judge_model": {"model_name": "glm-4"}, + "judge_model": { + "model_name": "glm-4" + }, }, }, ) @@ -175,7 +189,9 @@ def test_metric_with_llm_judge_camel_case(self): threshold=1.0, criterion={ "llmJudge": { - "judgeModel": {"model_name": "glm-4"}, + "judgeModel": { + "model_name": "glm-4" + }, "rubrics": [], }, }, @@ -183,3 +199,139 @@ def test_metric_with_llm_judge_camel_case(self): c = get_llm_criterion_from_metric(m) assert c is not None assert c.judge_model.model_name == "glm-4" + + +class TestJudgeModelOptionsWeight: + """Test suite for JudgeModelOptions.weight.""" + + def test_weight_default_is_one(self): + """Test weight defaults to 1.0 when omitted.""" + opts = JudgeModelOptions(model_name="m") + assert opts.weight == 1.0 + + def test_weight_custom_value(self): + """Test weight accepts custom float.""" + opts = JudgeModelOptions(model_name="m", weight=2.5) + assert opts.weight == 2.5 + + +class TestLLMJudgeCriterionMultiModel: + """Test suite for LLMJudgeCriterion multi-model fields and validation.""" + + def test_default_models_aggregator_and_parallel(self): + """Test defaults: models_aggregator='all_pass', parallel=True.""" + c = LLMJudgeCriterion(judge_model=JudgeModelOptions(model_name="m")) + assert c.models_aggregator == "all_pass" + assert c.parallel is True + + def test_get_judge_models_normalizes_singular(self): + """Test get_judge_models() returns 1-element list when only judge_model is set.""" + c = LLMJudgeCriterion(judge_model=JudgeModelOptions(model_name="m1")) + models = c.get_judge_models() + assert len(models) == 1 + assert models[0].model_name == "m1" + + def test_get_judge_models_returns_list_directly(self): + """Test get_judge_models() returns judge_models when set.""" + c = LLMJudgeCriterion(judge_models=[ + JudgeModelOptions(model_name="m1"), + JudgeModelOptions(model_name="m2"), + ], ) + models = c.get_judge_models() + assert [m.model_name for m in models] == ["m1", "m2"] + + def test_get_judge_models_empty_when_neither_set(self): + """Test get_judge_models() returns [] when neither field set (allowed at criterion level).""" + c = LLMJudgeCriterion() + assert c.get_judge_models() == [] + + def test_validate_judge_model_and_judge_models_mutually_exclusive(self): + """Test setting both judge_model and judge_models raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="judge_model.*judge_models"): + LLMJudgeCriterion( + judge_model=JudgeModelOptions(model_name="m1"), + judge_models=[JudgeModelOptions(model_name="m2")], + ) + + def test_validate_empty_judge_models_raises(self): + """Test empty judge_models list raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="judge_models.*empty"): + LLMJudgeCriterion(judge_models=[]) + + def test_validate_negative_weight_raises(self): + """Test any negative weight raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="weight.*negative"): + LLMJudgeCriterion(judge_models=[ + JudgeModelOptions(model_name="m1", weight=1.0), + JudgeModelOptions(model_name="m2", weight=-0.5), + ], ) + + def test_validate_weighted_aggregator_zero_total_weight_raises(self): + """Test weighted_avg with all-zero weights raises ValueError.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="weight"): + LLMJudgeCriterion( + judge_models=[ + 
JudgeModelOptions(model_name="m1", weight=0.0), + JudgeModelOptions(model_name="m2", weight=0.0), + ], + models_aggregator="weighted_avg", + ) + + def test_built_in_aggregator_names_accepted(self): + """Test all 6 built-in aggregator names pass validation.""" + for name in ("all_pass", "any_pass", "majority_pass", "avg", "weighted_avg", "weighted_majority"): + c = LLMJudgeCriterion( + judge_models=[JudgeModelOptions(model_name="m", weight=1.0)], + models_aggregator=name, + ) + assert c.models_aggregator == name + + def test_validate_models_aggregator_must_be_non_empty_string(self): + """Test empty models_aggregator string raises ValueError at criterion level.""" + import pytest as _pytest + with _pytest.raises(ValueError, match="models_aggregator.*non-empty"): + LLMJudgeCriterion( + judge_model=JudgeModelOptions(model_name="m"), + models_aggregator="", + ) + + def test_from_dict_with_judge_models(self): + """Test from_dict accepts judge_models list and models_aggregator string.""" + c = LLMJudgeCriterion.from_dict({ + "judge_models": [ + { + "model_name": "m1", + "weight": 2.0 + }, + { + "model_name": "m2", + "weight": 1.0 + }, + ], + "models_aggregator": + "weighted_avg", + "parallel": + False, + }) + assert c is not None + assert len(c.judge_models) == 2 + assert c.judge_models[0].weight == 2.0 + assert c.models_aggregator == "weighted_avg" + assert c.parallel is False + + def test_from_dict_legacy_judge_model_still_works(self): + """Test from_dict still works with legacy single judge_model (back compat).""" + c = LLMJudgeCriterion.from_dict({ + "judge_model": { + "model_name": "glm-4" + }, + }) + assert c is not None + assert c.judge_model.model_name == "glm-4" + assert c.judge_models is None + assert c.models_aggregator == "all_pass" + assert c.parallel is True diff --git a/tests/evaluation/test_llm_evaluator_registry.py b/tests/evaluation/test_llm_evaluator_registry.py index 6e76781..ded926c 100644 --- a/tests/evaluation/test_llm_evaluator_registry.py +++ b/tests/evaluation/test_llm_evaluator_registry.py @@ -59,3 +59,46 @@ def test_llm_metric_names_contains_expected(self): assert "llm_rubric_response" in LLM_METRIC_NAMES assert "llm_rubric_knowledge_recall" in LLM_METRIC_NAMES assert len(LLM_METRIC_NAMES) == 3 + + +class TestModelsAggregatorRegistry: + """Test suite for register_models_aggregator on LLMEvaluatorRegistry.""" + + @pytest.fixture + def registry(self): + from trpc_agent_sdk.evaluation import LLMEvaluatorRegistry + return LLMEvaluatorRegistry() + + def test_register_and_get(self, registry): + """Test register_models_aggregator + get_models_aggregator round-trip.""" + from trpc_agent_sdk.evaluation import ScoreResult + + def custom(per_model, threshold, weights): + return ScoreResult(score=1.0, reason="always pass") + + registry.register_models_aggregator("llm_final_response", custom) + agg = registry.get_models_aggregator("llm_final_response") + assert agg is not None + out = agg.aggregate_models([ScoreResult(score=0.0)], 0.5, [1.0]) + assert out.score == 1.0 + + def test_register_invalid_metric_raises(self, registry): + """Test register_models_aggregator with non-LLM metric raises ValueError.""" + with pytest.raises(ValueError, match="must be one of"): + registry.register_models_aggregator("rouge_score", lambda *a, **k: None) + + def test_get_unregistered_returns_none(self, registry): + """Test get_models_aggregator returns None when not set.""" + assert registry.get_models_aggregator("llm_final_response") is None + + def test_unregister(self, registry): + """Test 
unregister_models_aggregator removes the registration.""" + from trpc_agent_sdk.evaluation import ScoreResult + + def custom(per_model, threshold, weights): + return ScoreResult(score=1.0) + + registry.register_models_aggregator("llm_rubric_response", custom) + assert registry.get_models_aggregator("llm_rubric_response") is not None + registry.unregister_models_aggregator("llm_rubric_response") + assert registry.get_models_aggregator("llm_rubric_response") is None diff --git a/tests/evaluation/test_llm_judge_models_aggregator.py b/tests/evaluation/test_llm_judge_models_aggregator.py new file mode 100644 index 0000000..d9cd633 --- /dev/null +++ b/tests/evaluation/test_llm_judge_models_aggregator.py @@ -0,0 +1,198 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""Unit tests for built-in ModelsAggregator strategies in _llm_judge.""" + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import ScoreResult +from trpc_agent_sdk.evaluation._llm_judge import AllPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import AnyPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import AverageModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import MajorityPassModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import WeightedAverageModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import WeightedMajorityModelsAggregator +from trpc_agent_sdk.evaluation._llm_judge import get_builtin_models_aggregator + + +class TestAllPassModelsAggregator: + + def test_empty_per_model_raises(self): + agg = AllPassModelsAggregator() + with pytest.raises(ValueError): + agg.aggregate_models([], threshold=0.5, weights=[]) + + def test_all_above_threshold_returns_min(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.8)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.8) + + def test_one_below_threshold_returns_min(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == 0.0 + + def test_single_model_returns_its_score(self): + agg = AllPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=0.7)], + threshold=0.5, + weights=[1.0], + ) + assert out.score == pytest.approx(0.7) + + +class TestAnyPassModelsAggregator: + + def test_one_above_returns_max(self): + agg = AnyPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(1.0) + + def test_all_below_returns_max_still_below(self): + agg = AnyPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=0.1), ScoreResult(score=0.2)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.2) + + +class TestMajorityPassModelsAggregator: + + def test_strict_majority_passes(self): + agg = MajorityPassModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=1.0), + ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0, 1.0], + ) + assert out.score == pytest.approx(2 / 3) + + def test_tie_returns_half(self): + agg = MajorityPassModelsAggregator() + out = agg.aggregate_models( + 
[ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestAverageModelsAggregator: + + def test_average_score(self): + agg = AverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestWeightedAverageModelsAggregator: + + def test_weighted_mean(self): + agg = WeightedAverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[2.0, 1.0], + ) + assert out.score == pytest.approx(2.0 / 3.0) + + def test_zero_weight_total_returns_zero(self): + agg = WeightedAverageModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=1.0)], + threshold=0.5, + weights=[0.0, 0.0], + ) + assert out.score == 0.0 + + +class TestWeightedMajorityModelsAggregator: + + def test_weighted_majority_passes(self): + agg = WeightedMajorityModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[2.0, 1.0], + ) + assert out.score == pytest.approx(2.0 / 3.0) + + def test_weighted_majority_tie_returns_half(self): + agg = WeightedMajorityModelsAggregator() + out = agg.aggregate_models( + [ScoreResult(score=1.0), ScoreResult(score=0.0)], + threshold=0.5, + weights=[1.0, 1.0], + ) + assert out.score == pytest.approx(0.5) + + +class TestSingleModelEquivalence: + + @pytest.mark.parametrize("agg_cls", [ + AllPassModelsAggregator, + AnyPassModelsAggregator, + AverageModelsAggregator, + WeightedAverageModelsAggregator, + ]) + def test_n1_continuous_score_preserved(self, agg_cls): + agg = agg_cls() + out = agg.aggregate_models( + [ScoreResult(score=0.7)], + threshold=0.5, + weights=[1.0], + ) + assert out.score == pytest.approx(0.7) + + @pytest.mark.parametrize("agg_cls", [ + MajorityPassModelsAggregator, + WeightedMajorityModelsAggregator, + ]) + def test_n1_majority_passes_and_fails(self, agg_cls): + agg = agg_cls() + out_pass = agg.aggregate_models( + [ScoreResult(score=0.9)], + threshold=0.5, + weights=[1.0], + ) + out_fail = agg.aggregate_models( + [ScoreResult(score=0.1)], + threshold=0.5, + weights=[1.0], + ) + assert out_pass.score == 1.0 + assert out_fail.score == 0.0 + + +class TestGetBuiltinModelsAggregator: + + def test_known_names(self): + for name in ("all_pass", "any_pass", "majority_pass", "avg", "weighted_avg", "weighted_majority"): + assert get_builtin_models_aggregator(name) is not None + + def test_unknown_name_returns_none(self): + assert get_builtin_models_aggregator("nope") is None diff --git a/tests/evaluation/test_llm_judge_multi_model.py b/tests/evaluation/test_llm_judge_multi_model.py new file mode 100644 index 0000000..103d04c --- /dev/null +++ b/tests/evaluation/test_llm_judge_multi_model.py @@ -0,0 +1,354 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""End-to-end multi-model evaluation tests for LLMJudge (mocked judge agents).""" + +from unittest.mock import patch + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import EvalMetric +from trpc_agent_sdk.evaluation import EvalStatus +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + +# Module-level dict configuring per-model stubbed outcomes: +# "valid" -> JSON judge response with verdict valid +# "invalid" -> JSON judge response with verdict invalid +# "raise" -> raise RuntimeError on get_response +_STUB_RESPONSES: dict[str, str] = {} + + +class _StubModel: + """Tag object returned from stubbed _create_judge_model.""" + + def __init__(self, name: str) -> None: + self._stub_name = name + + +def _stub_create_judge_model(opts): + return _StubModel(opts.model_name or "") + + +class _StubJudgeAgent: + """Stub _JudgeAgent: returns configured JSON per call (for llm_final_response).""" + + def __init__(self, model, config, system_prompt, output_schema=None, tools=None, planner=None): + self._model_name = getattr(model, "_stub_name", "") + self._planner = planner + + async def get_response(self, user_message: str) -> str: + outcome = _STUB_RESPONSES.get(self._model_name, "valid") + if outcome == "raise": + raise RuntimeError(f"stubbed judge {self._model_name} failure") + verdict = "valid" if outcome == "valid" else "invalid" + return ('{"reasoning":"stub","is_the_agent_response_valid":' + f'"{verdict}"' + "}") + + +@pytest.fixture(autouse=True) +def _reset_stubs(): + _STUB_RESPONSES.clear() + yield + _STUB_RESPONSES.clear() + + +def _patch_judge_internals(): + """Return list of started patchers; caller must stop them.""" + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", side_effect=_stub_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _StubJudgeAgent), + ] + for p in patchers: + p.start() + return patchers + + +def _stop(patchers): + for p in patchers: + p.stop() + + +def _make_metric(judge_models, models_aggregator="all_pass", parallel=True, threshold=0.5): + return EvalMetric( + metric_name="llm_final_response", + threshold=threshold, + criterion={ + "llm_judge": { + "judge_models": judge_models, + "models_aggregator": models_aggregator, + "parallel": parallel, + }, + }, + ) + + +def _make_invocation(user_text: str, response_text: str) -> Invocation: + from trpc_agent_sdk.types import Content + from trpc_agent_sdk.types import Part + + return Invocation( + invocation_id="inv", + user_content=Content(role="user", parts=[Part.from_text(text=user_text)]), + final_response=Content(role="model", parts=[Part.from_text(text=response_text)]), + ) + + +class TestMultiModelAllPass: + + @pytest.mark.asyncio + async def test_both_valid_passes(self): + _STUB_RESPONSES.update({"glm-4.7": "valid", "gpt-4o": "valid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.PASSED + per = result.per_invocation_results[0] + assert per.eval_status == EvalStatus.PASSED + assert per.per_model_scores is not None + assert len(per.per_model_scores) == 2 + + @pytest.mark.asyncio + async def 
test_one_invalid_fails(self): + _STUB_RESPONSES.update({"glm-4.7": "valid", "gpt-4o": "invalid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.FAILED + per = result.per_invocation_results[0] + assert per.per_model_scores is not None + names = [m.model_name for m in per.per_model_scores] + assert "glm-4.7" in names and "gpt-4o" in names + gpt_entry = [m for m in per.per_model_scores if m.model_name == "gpt-4o"][0] + assert gpt_entry.passed is False + + +class TestMultiModelAnyPass: + + @pytest.mark.asyncio + async def test_one_valid_passes(self): + _STUB_RESPONSES.update({"glm-4.7": "invalid", "gpt-4o": "valid"}) + metric = _make_metric([ + { + "model_name": "glm-4.7" + }, + { + "model_name": "gpt-4o" + }, + ], models_aggregator="any_pass") + actual = _make_invocation("u", "a") + expected = _make_invocation("u", "a") + ps = _patch_judge_internals() + try: + judge = LLMJudge(metric) + result = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert result.overall_eval_status == EvalStatus.PASSED + + +class TestMultiModelParallelEqualsSerial: + + @pytest.mark.asyncio + async def test_parallel_same_as_serial(self): + _STUB_RESPONSES.update({"a": "valid", "b": "invalid"}) + + async def run_with(parallel): + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], + models_aggregator="all_pass", + parallel=parallel) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + return await j.evaluate([actual], [expected]) + finally: + _stop(ps) + + r_p = await run_with(True) + r_s = await run_with(False) + assert r_p.overall_eval_status == r_s.overall_eval_status + assert r_p.overall_score == r_s.overall_score + names_p = sorted(m.model_name for m in r_p.per_invocation_results[0].per_model_scores) + names_s = sorted(m.model_name for m in r_s.per_invocation_results[0].per_model_scores) + assert names_p == names_s + + +class TestMultiModelSoftFailure: + + @pytest.mark.asyncio + async def test_one_model_raises_counts_as_fail_vote(self): + _STUB_RESPONSES.update({"a": "valid", "b": "raise"}) + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.FAILED + per = r.per_invocation_results[0] + b_entry = [m for m in per.per_model_scores if m.model_name == "b"][0] + assert b_entry.passed is False + assert b_entry.score == 0.0 + assert "stubbed judge b failure" in b_entry.reason + + @pytest.mark.asyncio + async def test_all_models_raise_returns_not_evaluated(self): + _STUB_RESPONSES.update({"a": "raise", "b": "raise"}) + metric = _make_metric([ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], models_aggregator="all_pass") + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + 
finally: + _stop(ps) + assert r.per_invocation_results[0].eval_status == EvalStatus.NOT_EVALUATED + + +class TestLegacySingleModelStillWorks: + + @pytest.mark.asyncio + async def test_legacy_single_judge_model(self): + _STUB_RESPONSES.update({"glm-4.7": "valid"}) + metric = EvalMetric( + metric_name="llm_final_response", + threshold=0.5, + criterion={ + "llm_judge": { + "judge_model": { + "model_name": "glm-4.7" + }, + }, + }, + ) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + j = LLMJudge(metric) + r = await j.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.PASSED + + +class TestUnknownAggregatorRaisesAtConstruction: + + def test_unknown_aggregator_raises(self): + metric = EvalMetric( + metric_name="llm_final_response", + threshold=0.5, + criterion={ + "llm_judge": { + "judge_models": [{ + "model_name": "a" + }], + "models_aggregator": "definitely_not_registered", + }, + }, + ) + ps = _patch_judge_internals() + try: + with pytest.raises(ValueError, match="definitely_not_registered"): + LLMJudge(metric) + finally: + _stop(ps) + + +class TestRegistryRegisteredAggregator: + """Verify that a registry-registered ModelsAggregator is picked up by _judge_for_metric.""" + + @pytest.mark.asyncio + async def test_registered_custom_aggregator_used(self): + """Test register_models_aggregator -> _judge_for_metric injects it; criterion name ignored.""" + from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY + from trpc_agent_sdk.evaluation import ScoreResult + from trpc_agent_sdk.evaluation._llm_evaluator import _judge_for_metric + + _STUB_RESPONSES.update({"a": "invalid", "b": "invalid"}) + + def always_pass(per_model, threshold, weights): + return ScoreResult(score=1.0, reason="custom always pass") + + LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", always_pass) + try: + metric = _make_metric( + [ + { + "model_name": "a" + }, + { + "model_name": "b" + }, + ], + models_aggregator="all_pass", + ) + actual = _make_invocation("u", "x") + expected = _make_invocation("u", "x") + ps = _patch_judge_internals() + try: + judge = _judge_for_metric(metric) + r = await judge.evaluate([actual], [expected]) + finally: + _stop(ps) + assert r.overall_eval_status == EvalStatus.PASSED + finally: + LLM_EVALUATOR_REGISTRY.unregister_models_aggregator("llm_final_response") diff --git a/tests/evaluation/test_llm_judge_think.py b/tests/evaluation/test_llm_judge_think.py new file mode 100644 index 0000000..fdcd3c1 --- /dev/null +++ b/tests/evaluation/test_llm_judge_think.py @@ -0,0 +1,393 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Unit tests for LLM judge `think` field.""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import patch + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 +from trpc_agent_sdk.evaluation._llm_criterion import JudgeModelOptions +from trpc_agent_sdk.evaluation._llm_judge import _JudgeAgent +from trpc_agent_sdk.evaluation._llm_judge import _judge_generation_config +from trpc_agent_sdk.evaluation._llm_judge import _merge_extra_body +from trpc_agent_sdk.types import HttpOptions + + +class TestJudgeModelOptionsThinkField: + + def test_think_field_default_is_none(self): + opts = JudgeModelOptions(model_name="m") + assert opts.think is None + + def test_think_field_accepts_bool(self): + assert JudgeModelOptions(model_name="m", think=True).think is True + assert JudgeModelOptions(model_name="m", think=False).think is False + + def test_think_field_rejects_non_bool(self): + # EvalBaseModel uses pydantic v2 default lax mode (no strict). Strings like "yes" + # would be coerced to bool, so use an object() instance that cannot be coerced. + with pytest.raises(Exception): + JudgeModelOptions(model_name="m", think=object()) + + +class TestMergeExtraBody: + + def test_none_http_options_creates_new_with_patch(self): + result = _merge_extra_body(None, {"chat_template_kwargs": {"enable_thinking": False}}) + assert isinstance(result, HttpOptions) + assert result.extra_body == {"chat_template_kwargs": {"enable_thinking": False}} + + def test_preserves_other_top_level_keys_in_extra_body(self): + existing = HttpOptions(extra_body={"custom_user_field": "abc"}) + result = _merge_extra_body(existing, {"chat_template_kwargs": {"enable_thinking": False}}) + assert result.extra_body["custom_user_field"] == "abc" + assert result.extra_body["chat_template_kwargs"] == {"enable_thinking": False} + + def test_preserves_sibling_keys_in_chat_template_kwargs(self): + existing = HttpOptions( + extra_body={ + "chat_template_kwargs": {"enable_thinking": True, "other_key": "x"}, + "custom_user_field": "abc", + }) + result = _merge_extra_body(existing, {"chat_template_kwargs": {"enable_thinking": False}}) + assert result.extra_body["chat_template_kwargs"]["other_key"] == "x" + assert result.extra_body["chat_template_kwargs"]["enable_thinking"] is False + assert result.extra_body["custom_user_field"] == "abc" + + def test_patch_is_copied_not_shared(self): + patch_dict = {"chat_template_kwargs": {"enable_thinking": False}} + result = _merge_extra_body(None, patch_dict) + patch_dict["chat_template_kwargs"]["enable_thinking"] = True + assert result.extra_body["chat_template_kwargs"]["enable_thinking"] is False + + +class TestJudgeGenerationConfigThink: + + def test_think_none_returns_none_thinking_config_and_none_http_options(self): + cfg, tc = _judge_generation_config(None, None) + assert tc is None + assert cfg.http_options is None + assert cfg.thinking_config is None # must stay empty; LlmAgent rejects otherwise + + def test_think_none_preserves_caller_http_options(self): + gen = {"http_options": {"extra_body": {"my_key": 1}}} + cfg, tc = _judge_generation_config(gen, None) + assert tc is None + assert cfg.http_options is not None + assert cfg.http_options.extra_body == {"my_key": 1} + + def test_think_false_builds_disabled_thinking_config(self): + cfg, tc = _judge_generation_config(None, False) + assert tc is not None + assert tc.include_thoughts is False + assert tc.thinking_budget == 0 + assert cfg.thinking_config is None + assert cfg.http_options is not None + assert 
cfg.http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": False}, + } + + def test_think_true_builds_enabled_thinking_config_auto_budget(self): + cfg, tc = _judge_generation_config(None, True) + assert tc is not None + assert tc.include_thoughts is True + assert tc.thinking_budget == -1 + assert cfg.http_options is not None + assert cfg.http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": True}, + } + + def test_think_false_overrides_generation_config_thinking_config(self): + gen = { + "max_tokens": 4096, + "thinking_config": {"include_thoughts": True, "thinking_budget": 2048}, + } + cfg, tc = _judge_generation_config(gen, False) + assert cfg.max_output_tokens == 4096 + assert tc is not None + assert tc.include_thoughts is False + assert tc.thinking_budget == 0 + + def test_think_false_deep_merges_extra_body_preserving_other_keys(self): + gen = { + "http_options": { + "extra_body": { + "chat_template_kwargs": {"enable_thinking": True, "other_key": "x"}, + "custom_user_field": "abc", + }, + }, + } + cfg, tc = _judge_generation_config(gen, False) + assert tc is not None + assert cfg.http_options.extra_body["custom_user_field"] == "abc" + assert cfg.http_options.extra_body["chat_template_kwargs"]["other_key"] == "x" + assert ( + cfg.http_options.extra_body["chat_template_kwargs"]["enable_thinking"] is False + ) + + def test_generation_config_thinking_config_used_when_think_is_none(self): + gen = {"thinking_config": {"include_thoughts": True, "thinking_budget": 512}} + cfg, tc = _judge_generation_config(gen, None) + assert tc is not None + assert tc.include_thoughts is True + assert tc.thinking_budget == 512 + assert cfg.http_options is None + + +class TestJudgeAgentPlanner: + + def test_judge_agent_accepts_planner_and_forwards_to_llm_agent(self): + captured: dict[str, Any] = {} + + class _FakeLlmAgent: + + def __init__(self, **kwargs): + captured.update(kwargs) + + fake_planner = object() + with patch("trpc_agent_sdk.evaluation._llm_judge.LlmAgent", _FakeLlmAgent): + _JudgeAgent( + model=object(), + config=None, + system_prompt="sp", + output_schema=None, + tools=None, + planner=fake_planner, + ) + assert captured.get("planner") is fake_planner + + def test_judge_agent_planner_defaults_to_none(self): + captured: dict[str, Any] = {} + + class _FakeLlmAgent: + + def __init__(self, **kwargs): + captured.update(kwargs) + + with patch("trpc_agent_sdk.evaluation._llm_judge.LlmAgent", _FakeLlmAgent): + _JudgeAgent( + model=object(), + config=None, + system_prompt="sp", + ) + assert captured.get("planner") is None + + +# --- Integration tests: end-to-end LLMJudge wiring --- + + +class _SpyModel: + + def __init__(self, name: str) -> None: + self._stub_name = name + + +def _spy_create_judge_model(opts): + return _SpyModel(opts.model_name or "") + + +class _SpyJudgeAgent: + """Captures constructor kwargs for every judge model built by LLMJudge.""" + + instances: list[dict[str, Any]] = [] + + def __init__(self, model, config, system_prompt, output_schema=None, tools=None, planner=None): + _SpyJudgeAgent.instances.append({ + "model_name": getattr(model, "_stub_name", ""), + "config": config, + "planner": planner, + }) + + async def get_response(self, user_message: str) -> str: # pragma: no cover - not invoked here + return '{"reasoning":"stub","is_the_agent_response_valid":"valid"}' + + +def _make_metric(judge_models: list[dict[str, Any]]): + from trpc_agent_sdk.evaluation import EvalMetric + return EvalMetric( + metric_name="llm_final_response", + threshold=1.0, 
+ criterion={ + "llm_judge": { + "judge_models": judge_models, + "models_aggregator": "all_pass", + }, + }, + ) + + +@pytest.fixture(autouse=True) +def _reset_spy(): + _SpyJudgeAgent.instances.clear() + yield + _SpyJudgeAgent.instances.clear() + + +def _build_judge(judge_models: list[dict[str, Any]]): + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + metric = _make_metric(judge_models) + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", + side_effect=_spy_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _SpyJudgeAgent), + ] + for p in patchers: + p.start() + try: + return LLMJudge(metric) + finally: + for p in patchers: + p.stop() + + +class TestLLMJudgeThinkIntegration: + + def test_legacy_single_judge_model_supports_think(self): + from trpc_agent_sdk.evaluation import EvalMetric + from trpc_agent_sdk.evaluation._llm_judge import LLMJudge + from trpc_agent_sdk.planners import BuiltInPlanner + metric = EvalMetric( + metric_name="llm_final_response", + threshold=1.0, + criterion={ + "llm_judge": { + "judge_model": {"model_name": "glm-4.7", "think": False}, + }, + }, + ) + patchers = [ + patch("trpc_agent_sdk.evaluation._llm_judge._create_judge_model", + side_effect=_spy_create_judge_model), + patch("trpc_agent_sdk.evaluation._llm_judge._JudgeAgent", _SpyJudgeAgent), + ] + for p in patchers: + p.start() + try: + LLMJudge(metric) + finally: + for p in patchers: + p.stop() + assert len(_SpyJudgeAgent.instances) == 1 + inst = _SpyJudgeAgent.instances[0] + assert isinstance(inst["planner"], BuiltInPlanner) + assert inst["planner"].thinking_config.include_thoughts is False + assert inst["planner"].thinking_config.thinking_budget == 0 + + def test_per_judge_independent_think(self): + from trpc_agent_sdk.planners import BuiltInPlanner + _build_judge([ + {"model_name": "glm-4.7", "think": False}, + {"model_name": "gpt-4o", "think": True}, + {"model_name": "qwen2.5"}, # think None -> no planner + ]) + assert len(_SpyJudgeAgent.instances) == 3 + by_name = {i["model_name"]: i for i in _SpyJudgeAgent.instances} + + glm = by_name["glm-4.7"] + assert isinstance(glm["planner"], BuiltInPlanner) + assert glm["planner"].thinking_config.include_thoughts is False + assert glm["planner"].thinking_config.thinking_budget == 0 + assert glm["config"].http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": False}, + } + + gpt = by_name["gpt-4o"] + assert isinstance(gpt["planner"], BuiltInPlanner) + assert gpt["planner"].thinking_config.include_thoughts is True + assert gpt["planner"].thinking_config.thinking_budget == -1 + assert gpt["config"].http_options.extra_body == { + "chat_template_kwargs": {"enable_thinking": True}, + } + + qwen = by_name["qwen2.5"] + assert qwen["planner"] is None + assert qwen["config"].http_options is None + + def test_think_none_with_caller_http_options_preserves_it(self): + _build_judge([{ + "model_name": "m", + "generation_config": {"http_options": {"extra_body": {"preserved": 1}}}, + }]) + assert len(_SpyJudgeAgent.instances) == 1 + inst = _SpyJudgeAgent.instances[0] + assert inst["planner"] is None + assert inst["config"].http_options.extra_body == {"preserved": 1} + + +class TestCreateJudgeModelRouting: + """Verify _create_judge_model picks OpenAIModel directly when provider is + empty/openai (so http_options.extra_body actually reaches the backend), + and falls back to ModelRegistry.create_model for other providers (LiteLLM).""" + + def test_empty_provider_uses_openaimodel_directly(self): + 
from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="", + model_name="glm-5.1-w4afp8", + api_key="k", + base_url="http://host/v1", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_openai_provider_uses_openaimodel_directly(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="openai", + model_name="gpt-4o", + api_key="k", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_openai_provider_case_insensitive(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + from trpc_agent_sdk.models import OpenAIModel + opts = JudgeModelOptions( + provider_name="OpenAI", + model_name="gpt-4o", + api_key="k", + ) + model = _create_judge_model(opts) + assert isinstance(model, OpenAIModel) + + def test_non_openai_provider_uses_registry(self): + from trpc_agent_sdk.evaluation import _llm_judge as llm_judge_mod + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + opts = JudgeModelOptions( + provider_name="anthropic", + model_name="claude-3-5-sonnet", + api_key="k", + ) + sentinel = object() + with patch.object( + llm_judge_mod.ModelRegistry, + "create_model", + return_value=sentinel, + ) as mock_reg: + model = _create_judge_model(opts) + assert model is sentinel + args, kwargs = mock_reg.call_args + assert args[0] == "anthropic/claude-3-5-sonnet" + assert kwargs.get("api_key") == "k" + + def test_openaimodel_receives_model_name_and_base_url(self): + from trpc_agent_sdk.evaluation._llm_judge import _create_judge_model + opts = JudgeModelOptions( + provider_name="", + model_name="glm-5.1-w4afp8", + api_key="sk-x", + base_url="http://example/v1", + ) + model = _create_judge_model(opts) + assert getattr(model, "_model_name", None) == "glm-5.1-w4afp8" + assert getattr(model, "_base_url", None) == "http://example/v1" diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 8ff7c1f..1d4eb20 100644 --- a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -94,6 +94,7 @@ from ._eval_result import EvalStatusCounts from ._eval_result import EvaluateResult from ._eval_result import EvaluationResult +from ._eval_result import NamedScoreResult from ._eval_result import PerInvocationResult from ._eval_service_base import BaseEvalService from ._eval_service_base import EvaluateConfig @@ -121,6 +122,10 @@ from ._in_memory_eval_sets_manager import InMemoryEvalSetsManager from ._llm_criterion import DEFAULT_KNOWLEDGE_TOOL_NAMES from ._llm_criterion import DEFAULT_NUM_SAMPLES +from ._llm_criterion import BUILT_IN_MODELS_AGGREGATORS +from ._llm_criterion import DEFAULT_MODELS_AGGREGATOR +from ._llm_criterion import DEFAULT_PARALLEL +from ._llm_criterion import WEIGHTED_MODELS_AGGREGATORS from ._llm_criterion import JudgeModelOptions from ._llm_criterion import LLMJudgeCriterion from ._llm_criterion import Rubric @@ -137,20 +142,29 @@ from ._llm_evaluator import LLM_EVALUATOR_REGISTRY from ._llm_evaluator import LLM_METRIC_NAMES from ._llm_evaluator import MessagesConstructorFn +from ._llm_evaluator import ModelsAggregatorFn from ._llm_evaluator import ResponseScorerFn from ._llm_evaluator import SamplesAggregatorFn +from ._llm_judge import AllPassModelsAggregator +from ._llm_judge import 
AnyPassModelsAggregator from ._llm_judge import AverageInvocationsAggregator +from ._llm_judge import AverageModelsAggregator from ._llm_judge import DefaultMessagesConstructor from ._llm_judge import DefaultResponseScorer from ._llm_judge import FinalResponseOutput from ._llm_judge import InvocationsAggregator from ._llm_judge import LLMJudge +from ._llm_judge import MajorityPassModelsAggregator from ._llm_judge import MajorityVoteSamplesAggregator from ._llm_judge import MessagesConstructor +from ._llm_judge import ModelsAggregator from ._llm_judge import ResponseScorer from ._llm_judge import RubricItemOutput from ._llm_judge import RubricJudgeOutput from ._llm_judge import SamplesAggregator +from ._llm_judge import WeightedAverageModelsAggregator +from ._llm_judge import WeightedMajorityModelsAggregator +from ._llm_judge import get_builtin_models_aggregator from ._local_eval_service import LocalEvalService from ._local_eval_set_results_manager import LocalEvalSetResultsManager from ._local_eval_sets_manager import LocalEvalSetsManager @@ -223,6 +237,7 @@ "EvaluateResult", "EvaluationResult", "PerInvocationResult", + "NamedScoreResult", "BaseEvalService", "EvaluateConfig", "EvaluateRequest", @@ -239,6 +254,10 @@ "LLMJudgeCriterion", "DEFAULT_KNOWLEDGE_TOOL_NAMES", "DEFAULT_NUM_SAMPLES", + "BUILT_IN_MODELS_AGGREGATORS", + "DEFAULT_MODELS_AGGREGATOR", + "DEFAULT_PARALLEL", + "WEIGHTED_MODELS_AGGREGATORS", "RubricScore", "ScoreResult", "AverageInvocationsAggregator", @@ -249,6 +268,14 @@ "MessagesConstructor", "ResponseScorer", "SamplesAggregator", + "AllPassModelsAggregator", + "AnyPassModelsAggregator", + "AverageModelsAggregator", + "MajorityPassModelsAggregator", + "ModelsAggregator", + "WeightedAverageModelsAggregator", + "WeightedMajorityModelsAggregator", + "get_builtin_models_aggregator", "LocalEvalSetResultsManager", "LocalEvalSetsManager", "BaseUserSimulatorConfig", @@ -266,6 +293,7 @@ "LLM_EVALUATOR_REGISTRY", "LLM_METRIC_NAMES", "MessagesConstructorFn", + "ModelsAggregatorFn", "ResponseScorerFn", "SamplesAggregatorFn", "LocalEvalService", diff --git a/trpc_agent_sdk/evaluation/_agent_evaluator.py b/trpc_agent_sdk/evaluation/_agent_evaluator.py index 8d93c1d..09e1ed3 100644 --- a/trpc_agent_sdk/evaluation/_agent_evaluator.py +++ b/trpc_agent_sdk/evaluation/_agent_evaluator.py @@ -51,6 +51,7 @@ from ._local_eval_service import LocalEvalService from . 
import _utils from ._eval_callbacks import Callbacks +from ._eval_case import EvalModeTrace from ._eval_config import EvalConfig from ._eval_metrics import EvalStatus from ._eval_pass import pass_at_k as _pass_at_k @@ -85,8 +86,9 @@ class _EvalExecuter: def __init__( self, - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -95,6 +97,7 @@ def __init__( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ): self._agent_module = agent_module self._eval_dataset_file_path_or_dir = eval_dataset_file_path_or_dir @@ -106,6 +109,7 @@ def __init__( self._case_parallelism = case_parallelism self._case_eval_parallelism = case_eval_parallelism self._callbacks = callbacks + self._eval_metrics_file_path_or_dir = eval_metrics_file_path_or_dir self._result: Optional[EvaluateResult] = None self._task: Optional[asyncio.Task] = None @@ -120,6 +124,10 @@ async def _run(self) -> None: case_parallelism = self._case_parallelism case_eval_parallelism = self._case_eval_parallelism callbacks = self._callbacks + eval_metrics_file_path_or_dir = self._eval_metrics_file_path_or_dir + + # Resolve shared config once; None means "fall back to dataset-local test_config.json". + shared_eval_config = AgentEvaluator._resolve_shared_config(eval_metrics_file_path_or_dir) test_files = [] if os.path.isdir(eval_dataset_file_path_or_dir): @@ -139,15 +147,22 @@ async def _run(self) -> None: all_results: list[tuple[str, list[str]]] = [] results_by_eval_set_id: dict[str, EvalSetAggregateResult] = {} for test_file in test_files: - eval_config = AgentEvaluator.find_config_for_test_file(test_file) + if shared_eval_config is not None: + eval_config = shared_eval_config + # When shared config is explicit, honor its num_runs iff user + # set one; we keep the same precedence behavior as the + # dataset-local path (config overrides parameter). 
+ num_runs_for_set = eval_config.num_runs + else: + eval_config = AgentEvaluator.find_config_for_test_file(test_file) + # Config (test_config.json) overrides parameter + config_path = os.path.join(os.path.dirname(test_file), "test_config.json") + num_runs_for_set = (eval_config.num_runs if os.path.exists(config_path) else num_runs) eval_set = AgentEvaluator._load_eval_set_from_file(test_file, eval_config) - # Config (test_config.json) overrides parameter - config_path = os.path.join(os.path.dirname(test_file), "test_config.json") - num_runs_for_set = (eval_config.num_runs if os.path.exists(config_path) else num_runs) failed_summary, details_lines, result_lines, eval_results_by_eval_id = ( await AgentEvaluator.evaluate_eval_set( + eval_set, agent_module=agent_module, - eval_set=eval_set, eval_config=eval_config, num_runs=num_runs_for_set, agent_name=agent_name, @@ -172,7 +187,7 @@ async def _run(self) -> None: _RESULT_HANDLER.print_evaluation_report( all_details=all_details, all_results=all_results, - display_agent_name=agent_name or agent_module, + display_agent_name=agent_name or agent_module or "trace-only", num_runs=num_runs_for_set, ) self._result = EvaluateResult(results_by_eval_set_id=results_by_eval_set_id) @@ -232,8 +247,8 @@ class AgentEvaluator: # Run evaluation await AgentEvaluator.evaluate_eval_set( - agent=my_agent, - eval_set=eval_set, + eval_set, + agent_module="my_pkg.my_agent", eval_config=eval_config, ) ``` @@ -253,10 +268,20 @@ def find_config_for_test_file(test_file: str) -> EvalConfig: config_path = os.path.join(test_folder, "test_config.json") return AgentEvaluator._load_config_from_file(config_path) + @staticmethod + def _is_trace_only(eval_set: EvalSet) -> bool: + """Return True iff every case in the EvalSet has eval_mode == 'trace'. + + Empty eval_cases list returns True by ``all()`` semantics; callers + that require at least one case should validate separately. + """ + return all(case.eval_mode == EvalModeTrace for case in eval_set.eval_cases) + @staticmethod async def evaluate( - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -265,13 +290,17 @@ async def evaluate( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ) -> None: """Run evaluation; no result returned. Use get_executer() if you need the result. Args: - agent_module: Python module path containing the agent (look for 'root_agent' or 'get_agent_async'). eval_dataset_file_path_or_dir: Path to eval dataset file or directory - (recursively .test.json / .evalset.json). + (recursively .test.json / .evalset.json). Positional-only usage + recommended. + agent_module: Python module path containing the agent (look for + 'root_agent' or 'get_agent_async'). Optional when every case in + every discovered dataset uses ``eval_mode='trace'``. num_runs: Number of runs per eval set. agent_name: Display name of the agent. print_detailed_results: Whether to print per-case details. @@ -281,10 +310,14 @@ async def evaluate( case_eval_parallelism: Max concurrent cases for evaluation (scoring); None uses default. callbacks: Optional lifecycle callbacks. + eval_metrics_file_path_or_dir: Optional explicit path to a shared + evaluation config JSON (file) or directory containing a single + config JSON. 
When provided, overrides the dataset-local + ``test_config.json`` convention for ALL discovered datasets. """ executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir, agent_module=agent_module, - eval_dataset_file_path_or_dir=eval_dataset_file_path_or_dir, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -293,13 +326,15 @@ async def evaluate( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, ) await executer.evaluate() @staticmethod def get_executer( - agent_module: str, eval_dataset_file_path_or_dir: str, + *, + agent_module: Optional[str] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -308,13 +343,17 @@ def get_executer( case_parallelism: Optional[int] = None, case_eval_parallelism: Optional[int] = None, callbacks: Optional[Callbacks] = None, + eval_metrics_file_path_or_dir: Optional[str] = None, ) -> _EvalExecuter: """Return an executer (does not run). Await executer.evaluate() then executer.get_result() for result. Args: - agent_module: Python module path containing the agent (look for 'root_agent' or 'get_agent_async'). eval_dataset_file_path_or_dir: Path to eval dataset file or directory - (recursively .test.json / .evalset.json). + (recursively .test.json / .evalset.json). Positional-only usage + recommended. + agent_module: Python module path containing the agent (look for + 'root_agent' or 'get_agent_async'). Optional when every case in + every discovered dataset uses ``eval_mode='trace'``. num_runs: Number of runs per eval set. agent_name: Display name of the agent. print_detailed_results: Whether to print per-case details. @@ -324,13 +363,17 @@ def get_executer( case_eval_parallelism: Max concurrent cases for evaluation (scoring); None uses default. callbacks: Optional lifecycle callbacks. + eval_metrics_file_path_or_dir: Optional explicit path to a shared + evaluation config JSON (file) or directory containing a single + config JSON. When provided, overrides the dataset-local + ``test_config.json`` convention for ALL discovered datasets. Returns: _EvalExecuter: Await .evaluate() to run, then .get_result() for EvaluateResult. """ return _EvalExecuter( + eval_dataset_file_path_or_dir, agent_module=agent_module, - eval_dataset_file_path_or_dir=eval_dataset_file_path_or_dir, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -339,6 +382,7 @@ def get_executer( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + eval_metrics_file_path_or_dir=eval_metrics_file_path_or_dir, ) @staticmethod @@ -384,8 +428,9 @@ def pass_hat_k(n: int, c: int, k: int) -> float: @staticmethod async def evaluate_eval_set( - agent_module: str, eval_set: EvalSet, + *, + agent_module: Optional[str] = None, eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, @@ -399,10 +444,14 @@ async def evaluate_eval_set( """Evaluates an agent using the given EvalSet. Args: + eval_set: The eval set. (Positional-only usage recommended.) agent_module: The path to python module that contains the definition of the agent. There is convention in place here, where the code is going to look for 'root_agent' or `get_agent_async` in the loaded module. - eval_set: The eval set. 
+ Optional when every case in ``eval_set`` has ``eval_mode == 'trace'`` + (pre-recorded conversation used as inference result, no agent run). + Required otherwise; a ``ValueError`` is raised listing non-trace + case ids. eval_config: The evaluation config. num_runs: Number of times all entries in the eval dataset should be assessed. @@ -427,7 +476,16 @@ async def evaluate_eval_set( if eval_config is None: raise ValueError("`eval_config` is required.") - agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) + trace_only = AgentEvaluator._is_trace_only(eval_set) + if agent_module is None and not trace_only: + non_trace_ids = [case.eval_id for case in eval_set.eval_cases if case.eval_mode != EvalModeTrace] + raise ValueError("`agent_module` is required unless every case in eval_set uses " + "eval_mode='trace'. Non-trace case ids: " + f"{non_trace_ids}") + + agent_for_eval: Optional[BaseAgent] = None + if agent_module is not None: + agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) eval_metrics = eval_config.get_eval_metrics() user_simulator_provider = UserSimulatorProvider(user_simulator_config=eval_config.user_simulator_config) @@ -448,7 +506,7 @@ async def evaluate_eval_set( # Step 2: Post-process the results failures: list[str] = [] - display_agent_name = agent_name or agent_module + display_agent_name = agent_name or agent_module or "trace-only" details_lines: list[str] = [] result_lines: list[str] = [] @@ -619,6 +677,70 @@ def _load_config_from_file(file_path: Optional[str]) -> EvalConfig: except Exception as ex: raise ValueError(f"Failed to load config from {file_path}: {ex}") + @staticmethod + def _load_config_from_file_strict(file_path: str) -> EvalConfig: + """Load EvalConfig from JSON file; raise if missing (no default fallback). + + Unlike ``_load_config_from_file`` which silently returns a default config + when the file is absent, this strict variant is intended for explicit + user-supplied paths (e.g. ``eval_metrics_file_path_or_dir``) where a + missing file is a programmer error, not a signal to use defaults. + + Args: + file_path: Path to the config JSON file. Must exist. + + Returns: + EvalConfig instance loaded from ``file_path``. + + Raises: + FileNotFoundError: If ``file_path`` does not exist. + ValueError: If the file cannot be parsed as EvalConfig JSON. + """ + if not os.path.exists(file_path): + raise FileNotFoundError(f"Eval metrics/config file not found: {file_path}") + return AgentEvaluator._load_config_from_file(file_path) + + @staticmethod + def _resolve_shared_config(eval_metrics_file_path_or_dir: Optional[str], ) -> Optional[EvalConfig]: + """Resolve a user-provided metrics/config path into a single EvalConfig. + + Resolution rules: + - ``None`` -> returns ``None`` (no shared config; callers + fall back to per-dataset ``test_config.json`` convention). + - Path is a regular file -> load that file strictly. + - Path is a directory: + - Exactly one ``*.json`` in the directory (non-recursive) -> load it. + - Zero ``*.json`` -> ``FileNotFoundError``. + - Two or more -> ``ValueError`` (ambiguous). + - Path does not exist -> ``FileNotFoundError``. + + Args: + eval_metrics_file_path_or_dir: User-supplied path or None. + + Returns: + EvalConfig when a shared config was resolved, else None. 
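+
+        Example (illustrative; the directory name and file are hypothetical)::
+
+            # shared_configs/ contains exactly one JSON file, eval_config.json,
+            # so the directory branch resolves to that single file.
+            cfg = AgentEvaluator._resolve_shared_config("shared_configs")
+            # cfg is the EvalConfig parsed from shared_configs/eval_config.json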
+ """ + if eval_metrics_file_path_or_dir is None: + return None + + path = eval_metrics_file_path_or_dir + if not os.path.exists(path): + raise FileNotFoundError(f"eval_metrics_file_path_or_dir does not exist: {path}") + + if os.path.isfile(path): + return AgentEvaluator._load_config_from_file_strict(path) + + # Directory case: non-recursive lookup for *.json + json_files = sorted( + os.path.join(path, entry) for entry in os.listdir(path) + if entry.endswith(".json") and os.path.isfile(os.path.join(path, entry))) + if not json_files: + raise FileNotFoundError(f"No *.json config file found in directory: {path}") + if len(json_files) > 1: + raise ValueError("eval_metrics_file_path_or_dir directory contains multiple " + f"*.json files; expected exactly one. Found: {json_files}") + return AgentEvaluator._load_config_from_file_strict(json_files[0]) + @staticmethod def _get_eval_sets_manager(app_name: str, eval_set: EvalSet) -> EvalSetsManager: """Create and populate an in-memory eval sets manager. @@ -644,7 +766,7 @@ def _get_eval_sets_manager(app_name: str, eval_set: EvalSet) -> EvalSetsManager: @staticmethod async def _get_eval_results_by_eval_id( - agent_for_eval: BaseAgent, + agent_for_eval: Optional[BaseAgent], eval_set: EvalSet, eval_metrics: list, num_runs: int, @@ -735,9 +857,12 @@ async def _get_eval_results_by_eval_id( return eval_results_by_eval_id + # yapf: disable @staticmethod def _get_eval_metric_results_with_invocation( - eval_results_per_eval_id: list[EvalCaseResult], ) -> dict[str, list[_utils.MetricRunRecord]]: + eval_results_per_eval_id: list[EvalCaseResult], + ) -> dict[str, list[_utils.MetricRunRecord]]: + # yapf: enable """Returns MetricRunRecord grouped by metric. EvalCaseResult contain results for each metric per invocation. diff --git a/trpc_agent_sdk/evaluation/_eval_case.py b/trpc_agent_sdk/evaluation/_eval_case.py index e6e7bd9..8ac476d 100644 --- a/trpc_agent_sdk/evaluation/_eval_case.py +++ b/trpc_agent_sdk/evaluation/_eval_case.py @@ -210,13 +210,29 @@ class EvalCase(EvalBaseModel): @model_validator(mode="after") def ensure_conversation_xor_conversation_scenario(self) -> EvalCase: - """Trace: conversation or actual_conversation (no scenario). Default: conversation xor conversation_scenario.""" + """Trace: actual_conversation is required (conversation optional as reference). + Default: conversation xor conversation_scenario. + + Trace-mode legal shapes (after Bug 3.2 strict fix): + * actual_conversation only -> scenario 3 (no reference) + * actual_conversation + conversation -> scenario 1 (full comparison) + + The legacy shape of providing only `conversation` under eval_mode='trace' + is now rejected because the field would have to serve as both the + recorded trace (actual) and the reference (expected), which is ambiguous + and causes silent evaluation errors in reference-based metrics + (see docs/superpowers/specs/2026-05-06-evaluator-trace-metric-strict-compat-design.md). + """ is_trace = self.eval_mode == EvalModeTrace if is_trace: if self.conversation_scenario is not None: raise ValueError("conversation_scenario is not allowed when eval_mode is \"trace\"") - if not self.conversation and not self.actual_conversation: - raise ValueError("trace mode requires at least one of conversation or actual_conversation") + if not self.actual_conversation: + raise ValueError("eval_mode='trace' requires `actual_conversation` field. " + "If the provided `conversation` is a recorded trace, move it to " + "`actual_conversation`. 
If it's a reference answer, also provide " + "`actual_conversation` for scenario-1 full comparison. " + "See Bug 3.2 of evaluator-trace-metric-strict-compat-design.") return self if (self.conversation is None) == (self.conversation_scenario is None): raise ValueError("Exactly one of conversation and conversation_scenario must be provided") diff --git a/trpc_agent_sdk/evaluation/_eval_result.py b/trpc_agent_sdk/evaluation/_eval_result.py index a10259f..700bb0e 100644 --- a/trpc_agent_sdk/evaluation/_eval_result.py +++ b/trpc_agent_sdk/evaluation/_eval_result.py @@ -35,6 +35,29 @@ from ._eval_metrics import EvalStatus +class NamedScoreResult(EvalBaseModel): + """One judge model's per-invocation score, used inside PerInvocationResult.per_model_scores. + + Attributes: + model_name: Judge model name. + provider_name: Provider name; empty when unset. + score: Numeric score from this model after SamplesAggregator on its own samples. + reason: Reason text from the judge model (or exception text on soft failure). + rubric_scores: Per-rubric scores (rubric metrics). Use _llm_criterion.RubricScore. + passed: True iff score >= metric.threshold. + """ + + model_name: str = Field(default="", description="Judge model name.") + provider_name: str = Field(default="", description="Provider name.") + score: float = Field(default=0.0, description="Score from this model.") + reason: str = Field(default="", description="Reason from this model.") + rubric_scores: list[Any] = Field( + default_factory=list, + description="Per-rubric scores from this model (rubric metrics).", + ) + passed: bool = Field(default=False, description="True iff score >= threshold.") + + class PerInvocationResult(EvalBaseModel): """Result for a single invocation. @@ -56,6 +79,11 @@ class PerInvocationResult(EvalBaseModel): default=None, description="Per-rubric scores (LLM rubric metrics). Use _llm_criterion.RubricScore.", ) + per_model_scores: Optional[list[NamedScoreResult]] = Field( + default=None, + description=("Per-judge-model breakdown for multi-model LLM judge metrics. " + "None for single-model or non-LLM metrics (back-compatible)."), + ) class EvaluationResult(EvalBaseModel): diff --git a/trpc_agent_sdk/evaluation/_evaluator_base.py b/trpc_agent_sdk/evaluation/_evaluator_base.py index 6404e29..98e4f89 100644 --- a/trpc_agent_sdk/evaluation/_evaluator_base.py +++ b/trpc_agent_sdk/evaluation/_evaluator_base.py @@ -26,6 +26,7 @@ from abc import ABC from abc import abstractmethod +from typing import ClassVar from typing import Optional from ._eval_case import Invocation @@ -39,6 +40,22 @@ class Evaluator(ABC): and expected invocations and computing a score. """ + requires_reference: ClassVar[bool] = True + """Whether this metric requires expected_invocations with non-placeholder + `final_response` / `intermediate_data` fields (a "reference answer"). + + Set to False for reference-free metrics (e.g. rubric-based LLM judges that + evaluate actual output against a rubric, not against a reference answer). + + Checked by `LocalEvalService._validate_metric_compat` at evaluate() startup + to fail-fast on incompatible (eval_case, metric) combinations. Defaults to + True — safer for new evaluators; opt into False explicitly when adding a + reference-free metric. + + See Bug 3.1 of + docs/superpowers/specs/2026-05-06-evaluator-trace-metric-strict-compat-design.md. 
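+
+    Illustrative declaration (hypothetical evaluator name; mirrors how the
+    rubric evaluators later in this patch opt out)::
+
+        class MyRubricOnlyEvaluator(Evaluator):
+            requires_reference = False  # judged against rubrics, no reference answer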
+ """ + @abstractmethod def evaluate_invocations( self, diff --git a/trpc_agent_sdk/evaluation/_evaluator_registry.py b/trpc_agent_sdk/evaluation/_evaluator_registry.py index 8044d14..aa2c67a 100644 --- a/trpc_agent_sdk/evaluation/_evaluator_registry.py +++ b/trpc_agent_sdk/evaluation/_evaluator_registry.py @@ -103,6 +103,19 @@ def get_evaluator(self, eval_metric: EvalMetric) -> Evaluator: return evaluator + def get_evaluator_class(self, eval_metric: EvalMetric) -> Type[Evaluator]: + """Return evaluator CLASS (not instance) for inspecting class attributes. + + Used by LocalEvalService._validate_metric_compat to look up the + `requires_reference` ClassVar without instantiating the evaluator + (which may have side effects like loading rouge-score or creating an + LLM judge). Raises if metric not registered. + """ + if eval_metric.metric_name not in self._registry: + raise ValueError(f"No evaluator registered for metric: {eval_metric.metric_name}. " + f"Available metrics: {list(self._registry.keys())}") + return self._registry[eval_metric.metric_name] + # Global default registry EVALUATOR_REGISTRY = EvaluatorRegistry() diff --git a/trpc_agent_sdk/evaluation/_final_response_evaluator.py b/trpc_agent_sdk/evaluation/_final_response_evaluator.py index 8b5e06a..9450211 100644 --- a/trpc_agent_sdk/evaluation/_final_response_evaluator.py +++ b/trpc_agent_sdk/evaluation/_final_response_evaluator.py @@ -33,6 +33,8 @@ class FinalResponseEvaluator(Evaluator): else exact text. Score 1.0 or 0.0 per invocation, overall = mean. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None, diff --git a/trpc_agent_sdk/evaluation/_llm_criterion.py b/trpc_agent_sdk/evaluation/_llm_criterion.py index 6e56b1b..7294c2f 100644 --- a/trpc_agent_sdk/evaluation/_llm_criterion.py +++ b/trpc_agent_sdk/evaluation/_llm_criterion.py @@ -13,10 +13,22 @@ from pydantic import Field from pydantic import model_serializer +from pydantic import model_validator from ._common import EvalBaseModel DEFAULT_NUM_SAMPLES = 1 +DEFAULT_MODELS_AGGREGATOR = "all_pass" +DEFAULT_PARALLEL = True +BUILT_IN_MODELS_AGGREGATORS = frozenset({ + "all_pass", + "any_pass", + "majority_pass", + "avg", + "weighted_avg", + "weighted_majority", +}) +WEIGHTED_MODELS_AGGREGATORS = frozenset({"weighted_avg", "weighted_majority"}) def sanitize_criterion_for_export(criterion: Optional[dict[str, Any]]) -> Optional[dict[str, Any]]: @@ -80,6 +92,17 @@ class JudgeModelOptions(EvalBaseModel): default=None, description="Generation params: max_tokens, temperature, stream, etc.", ) + weight: float = Field( + default=1.0, + description="Weight for weighted_* models_aggregator; ignored otherwise.", + ) + think: Optional[bool] = Field( + default=None, + description=("Toggle judge thinking mode. None (default): no change; " + "False: disable thinking via both ThinkingConfig(include_thoughts=False, " + "thinking_budget=0) and extra_body.chat_template_kwargs.enable_thinking=False; " + "True: enable thinking with automatic budget."), + ) def get_num_samples(self) -> int: """Return configured num_samples or DEFAULT_NUM_SAMPLES.""" @@ -123,6 +146,22 @@ class LLMJudgeCriterion(EvalBaseModel): default=None, description="Judge model options (required for all LLM judge metrics).", ) + judge_models: Optional[list[JudgeModelOptions]] = Field( + default=None, + description=("Multi-model judge list. Mutually exclusive with judge_model. 
" + "Cross-model results are combined by models_aggregator."), + ) + models_aggregator: str = Field( + default=DEFAULT_MODELS_AGGREGATOR, + description=("Cross-model aggregation strategy. Built-in: all_pass | any_pass | " + "majority_pass | avg | weighted_avg | weighted_majority. " + "Custom names must be registered via " + "LLM_EVALUATOR_REGISTRY.register_models_aggregator before LLMJudge construction."), + ) + parallel: bool = Field( + default=DEFAULT_PARALLEL, + description="Run multiple judge models concurrently via asyncio.gather (default True).", + ) rubrics: list[Rubric] = Field( default_factory=list, description="Rubric items for rubric-based metrics.", @@ -145,6 +184,44 @@ def get_knowledge_tool_names(self) -> list[str]: return list(self.knowledge_tool_names) return list(DEFAULT_KNOWLEDGE_TOOL_NAMES) + @model_validator(mode="after") + def _validate_multi_model_fields(self) -> "LLMJudgeCriterion": + """Validate judge_model/judge_models exclusivity, weights, and aggregator name shape. + + Registry-registered aggregator names are not validated here; only built-in + names are rejected at LLMJudge construction time when registry lookup misses. + """ + if self.judge_model is not None and self.judge_models is not None: + raise ValueError("judge_model and judge_models are mutually exclusive; set only one") + if self.judge_models is not None and len(self.judge_models) == 0: + raise ValueError("judge_models must not be empty when set") + if not isinstance(self.models_aggregator, str) or not self.models_aggregator: + raise ValueError("models_aggregator must be a non-empty string") + models = self.get_judge_models() + for m in models: + if m.weight < 0: + raise ValueError(f"judge model weight must not be negative: model_name={m.model_name!r} " + f"weight={m.weight}") + if self.models_aggregator in WEIGHTED_MODELS_AGGREGATORS and models: + total = sum(m.weight for m in models) + if total <= 0: + raise ValueError(f"models_aggregator={self.models_aggregator!r} requires sum of weights > 0; " + f"got total weight {total}") + return self + + def get_judge_models(self) -> list[JudgeModelOptions]: + """Return effective list of judge model options. + + - judge_models set -> returned as-is. + - only legacy judge_model set -> returned as 1-element list. + - neither set -> []. Caller (LLMJudge) decides whether to error. + """ + if self.judge_models is not None: + return list(self.judge_models) + if self.judge_model is not None: + return [self.judge_model] + return [] + @classmethod def from_dict(cls, d: dict | None) -> Optional["LLMJudgeCriterion"]: """Build from config dict (judgeModel, rubrics, knowledge_tool_names; camelCase or snake_case). 
diff --git a/trpc_agent_sdk/evaluation/_llm_evaluator.py b/trpc_agent_sdk/evaluation/_llm_evaluator.py index e780c7e..189b16f 100644 --- a/trpc_agent_sdk/evaluation/_llm_evaluator.py +++ b/trpc_agent_sdk/evaluation/_llm_evaluator.py @@ -25,6 +25,7 @@ from ._llm_judge import InvocationsAggregator from ._llm_judge import LLMJudge from ._llm_judge import MessagesConstructor +from ._llm_judge import ModelsAggregator from ._llm_judge import ResponseScorer from ._llm_judge import SamplesAggregator @@ -40,6 +41,7 @@ ResponseScorerFn = Callable[[str, str], ScoreResult] SamplesAggregatorFn = Callable[[list[ScoreResult], float], ScoreResult] InvocationsAggregatorFn = Callable[[list[PerInvocationResult], float], tuple[Optional[float], EvalStatus]] +ModelsAggregatorFn = Callable[[list[ScoreResult], float, list[float]], ScoreResult] class _MessagesConstructorAdapter: @@ -82,6 +84,16 @@ def aggregate_invocations(self, results, threshold): return self._fn(results, threshold) +class _ModelsAggregatorAdapter: + """Adapts a plain function to ModelsAggregator.""" + + def __init__(self, fn: ModelsAggregatorFn) -> None: + self._fn = fn + + def aggregate_models(self, per_model, threshold, weights): + return self._fn(per_model, threshold, weights) + + def _validate_metric(metric_name: str) -> None: """Raise ValueError if metric_name is not an LLM metric.""" if metric_name not in LLM_METRIC_NAMES: @@ -97,6 +109,7 @@ def __init__(self) -> None: self._response_scorer: dict[str, ResponseScorer] = {} self._samples_aggregator: dict[str, SamplesAggregator] = {} self._invocations_aggregator: dict[str, InvocationsAggregator] = {} + self._models_aggregator: dict[str, ModelsAggregator] = {} self._judge_tools: dict[str, List[Any]] = {} def register_messages_constructor(self, metric_name: str, fn: MessagesConstructorFn) -> None: @@ -115,6 +128,10 @@ def register_invocations_aggregator(self, metric_name: str, fn: InvocationsAggre _validate_metric(metric_name) self._invocations_aggregator[metric_name] = _InvocationsAggregatorAdapter(fn) + def register_models_aggregator(self, metric_name: str, fn: ModelsAggregatorFn) -> None: + _validate_metric(metric_name) + self._models_aggregator[metric_name] = _ModelsAggregatorAdapter(fn) + def register_judge_tools(self, metric_name: str, tools: List[Any]) -> None: """Register tools for the judge LlmAgent (e.g. 
BaseTool, BaseToolSet, or callables).""" _validate_metric(metric_name) @@ -136,6 +153,9 @@ def get_samples_aggregator(self, metric_name: str) -> Optional[SamplesAggregator def get_invocations_aggregator(self, metric_name: str) -> Optional[InvocationsAggregator]: return self._invocations_aggregator.get(metric_name) + def get_models_aggregator(self, metric_name: str) -> Optional[ModelsAggregator]: + return self._models_aggregator.get(metric_name) + def unregister_messages_constructor(self, metric_name: str) -> None: self._messages_constructor.pop(metric_name, None) @@ -148,6 +168,9 @@ def unregister_samples_aggregator(self, metric_name: str) -> None: def unregister_invocations_aggregator(self, metric_name: str) -> None: self._invocations_aggregator.pop(metric_name, None) + def unregister_models_aggregator(self, metric_name: str) -> None: + self._models_aggregator.pop(metric_name, None) + def unregister_judge_tools(self, metric_name: str) -> None: self._judge_tools.pop(metric_name, None) @@ -165,6 +188,7 @@ def _judge_for_metric(eval_metric: EvalMetric) -> LLMJudge: response_scorer=LLM_EVALUATOR_REGISTRY.get_response_scorer(name), samples_aggregator=LLM_EVALUATOR_REGISTRY.get_samples_aggregator(name), invocations_aggregator=LLM_EVALUATOR_REGISTRY.get_invocations_aggregator(name), + models_aggregator=LLM_EVALUATOR_REGISTRY.get_models_aggregator(name), judge_tools=LLM_EVALUATOR_REGISTRY.get_judge_tools(name), ) @@ -172,6 +196,8 @@ def _judge_for_metric(eval_metric: EvalMetric) -> LLMJudge: class LLMFinalResponseEvaluator(Evaluator): """LLM judge for final response (valid/invalid). Metric: llm_final_response.""" + requires_reference = True + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMFinalResponseEvaluator") @@ -190,6 +216,8 @@ async def evaluate_invocations( class LLMRubricResponseEvaluator(Evaluator): """LLM rubric-based response quality. Metric: llm_rubric_response.""" + requires_reference = False + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMRubricResponseEvaluator") @@ -208,6 +236,8 @@ async def evaluate_invocations( class LLMRubricKnowledgeRecallEvaluator(Evaluator): """LLM rubric knowledge recall. 
Metric: llm_rubric_knowledge_recall.""" + requires_reference = False + def __init__(self, eval_metric: Optional[EvalMetric] = None) -> None: if not eval_metric: raise ValueError("eval_metric is required for LLMRubricKnowledgeRecallEvaluator") diff --git a/trpc_agent_sdk/evaluation/_llm_judge.py b/trpc_agent_sdk/evaluation/_llm_judge.py index b68ebbe..f9c8998 100644 --- a/trpc_agent_sdk/evaluation/_llm_judge.py +++ b/trpc_agent_sdk/evaluation/_llm_judge.py @@ -7,6 +7,8 @@ from __future__ import annotations +import asyncio +import copy import json import os import uuid @@ -21,10 +23,14 @@ from trpc_agent_sdk.context import create_agent_context from trpc_agent_sdk.context import new_invocation_context_id from trpc_agent_sdk.models import ModelRegistry +from trpc_agent_sdk.models import OpenAIModel +from trpc_agent_sdk.planners import BuiltInPlanner from trpc_agent_sdk.sessions import InMemorySessionService from trpc_agent_sdk.types import Content from trpc_agent_sdk.types import GenerateContentConfig +from trpc_agent_sdk.types import HttpOptions from trpc_agent_sdk.types import Part +from trpc_agent_sdk.types import ThinkingConfig from ._eval_case import IntermediateData from ._eval_case import Invocation @@ -33,8 +39,10 @@ from ._eval_metrics import EvalMetric from ._eval_metrics import EvalStatus from ._eval_result import EvaluationResult +from ._eval_result import NamedScoreResult from ._eval_result import PerInvocationResult from ._llm_criterion import LLMJudgeCriterion +from ._llm_criterion import JudgeModelOptions from ._llm_criterion import Rubric from ._llm_criterion import RubricScore from ._llm_criterion import ScoreResult @@ -103,6 +111,18 @@ def aggregate_invocations( ... +class ModelsAggregator(Protocol): + """Aggregates per-model judge ScoreResults (single invocation, multiple judge models) into one ScoreResult.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + ... + + class MajorityVoteSamplesAggregator: """Selects one sample by majority vote on pass/fail; on tie, prefers a failed sample if any.""" @@ -140,6 +160,161 @@ def aggregate_invocations( return (overall, status) +def _format_per_model_reason(per_model: list[ScoreResult], threshold: float) -> str: + """Build a multi-line per-model breakdown string for ScoreResult.reason.""" + lines: list[str] = [] + for i, s in enumerate(per_model): + passed = (s.score or 0.0) >= threshold + snippet = (s.reason or "").replace("\n", " ").strip() + if len(snippet) > 200: + snippet = snippet[:200] + "..." 
+ lines.append(f" model#{i}: score={s.score:.4f} passed={passed} reason={snippet}") + return "\n".join(lines) + + +class AllPassModelsAggregator: + """All models must pass (AND); returned score = min(scores).""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = min(scores) + passed_all = all(s >= threshold for s in scores) + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=all_pass -> {'PASSED' if passed_all else 'FAILED'}" + return ScoreResult(score=overall, reason=reason) + + +class AnyPassModelsAggregator: + """Any model passing is enough (OR); returned score = max(scores).""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = max(scores) + passed_any = any(s >= threshold for s in scores) + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=any_pass -> {'PASSED' if passed_any else 'FAILED'}" + return ScoreResult(score=overall, reason=reason) + + +class MajorityPassModelsAggregator: + """Strict majority must pass (passed*2 > total). Score = passed_count/total.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + passed_count = sum(1 for s in per_model if (s.score or 0.0) >= threshold) + total = len(per_model) + overall = passed_count / total if total else 0.0 + passed_majority = passed_count * 2 > total + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=majority_pass -> " + f"{'PASSED' if passed_majority else 'FAILED'} ({passed_count}/{total})") + return ScoreResult(score=overall, reason=reason) + + +class AverageModelsAggregator: + """Mean of scores.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + scores = [s.score or 0.0 for s in per_model] + overall = sum(scores) / len(scores) + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=avg -> mean={overall:.4f}") + return ScoreResult(score=overall, reason=reason) + + +class WeightedAverageModelsAggregator: + """Weighted mean: sum(w*s)/sum(w). Zero total -> 0.0.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + if len(weights) != len(per_model): + raise ValueError(f"weights length {len(weights)} must equal per_model length {len(per_model)}") + total_w = sum(weights) + if total_w <= 0: + overall = 0.0 + else: + overall = sum(w * (s.score or 0.0) for w, s in zip(weights, per_model)) / total_w + base_reason = _format_per_model_reason(per_model, threshold) + reason = f"{base_reason}\naggregator=weighted_avg -> weighted_mean={overall:.4f} (total_w={total_w})" + return ScoreResult(score=overall, reason=reason) + + +class WeightedMajorityModelsAggregator: + """passed_weight*2 > total_weight (strict). 
Score = passed_weight/total_weight.""" + + def aggregate_models( + self, + per_model: list[ScoreResult], + threshold: float, + weights: list[float], + ) -> ScoreResult: + if not per_model: + raise ValueError("per_model must not be empty") + if len(weights) != len(per_model): + raise ValueError(f"weights length {len(weights)} must equal per_model length {len(per_model)}") + total_w = sum(weights) + passed_w = sum(w for w, s in zip(weights, per_model) if (s.score or 0.0) >= threshold) + if total_w <= 0: + overall = 0.0 + passed_majority = False + else: + overall = passed_w / total_w + passed_majority = passed_w * 2 > total_w + reason = (_format_per_model_reason(per_model, threshold) + f"\naggregator=weighted_majority -> " + f"{'PASSED' if passed_majority else 'FAILED'} " + f"(passed_w={passed_w}, total_w={total_w})") + return ScoreResult(score=overall, reason=reason) + + +_BUILTIN_MODELS_AGGREGATORS: dict[str, type] = { + "all_pass": AllPassModelsAggregator, + "any_pass": AnyPassModelsAggregator, + "majority_pass": MajorityPassModelsAggregator, + "avg": AverageModelsAggregator, + "weighted_avg": WeightedAverageModelsAggregator, + "weighted_majority": WeightedMajorityModelsAggregator, +} + + +def get_builtin_models_aggregator(name: str) -> Optional[ModelsAggregator]: + """Return a built-in ModelsAggregator instance by name, or None if unknown.""" + cls = _BUILTIN_MODELS_AGGREGATORS.get(name) + if cls is None: + return None + return cls() + + def _extract_text_from_content(content: Any) -> str: """Extract plain text from Content parts (concatenate part texts).""" if content is None: @@ -617,13 +792,97 @@ def _expand_env(s: str) -> str: return os.path.expandvars(s) +def _create_judge_model(opts: JudgeModelOptions) -> Any: + """Build the underlying LLM model for one judge option. + + Provider routing: + - provider_name empty or "openai" -> OpenAIModel(...) directly. This + matches the framework's standard pattern for OpenAI-compatible + endpoints (see examples/llmagent/) and ensures http_options.extra_body + (e.g. chat_template_kwargs.enable_thinking used by judge `think` field) + is forwarded to the backend. Routing via "openai/" through + ModelRegistry lands on LiteLLMModel whose current implementation + drops extra_body. + - Any other provider_name -> ModelRegistry.create_model("{provider}/{model}") + which routes to LiteLLMModel for multi-provider support. + """ + provider_name = _expand_env(opts.provider_name or "") + model_name = _expand_env(opts.model_name or "") + base_url = _expand_env(opts.base_url or "") + api_key = _expand_env(opts.api_key or "") + extra = dict(opts.extra_fields or {}) + + if not provider_name or provider_name.lower() == "openai": + # Direct OpenAIModel instantiation bypasses ModelRegistry regex routing, + # so any model_name (e.g. "glm-5.1-w4afp8") works against any + # OpenAI-compatible endpoint. + return OpenAIModel( + model_name=model_name, + api_key=api_key, + base_url=base_url or None, + **extra, + ) + + model_str = f"{provider_name}/{model_name}" + return ModelRegistry.create_model( + model_str, + api_key=api_key, + base_url=base_url or "", + **extra, + ) + + # Default judge generation params when not specified in criterion. 
DEFAULT_JUDGE_MAX_TOKENS = 4096 DEFAULT_JUDGE_TEMPERATURE = 0.8 -def _judge_generation_config(gen: dict[str, Any] | None) -> GenerateContentConfig: - """Build GenerateContentConfig from criterion generation_config; use defaults for missing fields.""" +def _merge_extra_body( + http_options: Optional[HttpOptions], + patch: dict[str, Any], +) -> HttpOptions: + """Deep-merge patch into http_options.extra_body at nested-dict granularity. + + - None http_options -> returns new HttpOptions(extra_body=deepcopy(patch)). + - For top-level keys in patch: if both sides have dict, merge recursively (deep-copying + patch values); otherwise patch value wins. + - Other existing top-level keys in http_options.extra_body are preserved. + """ + base = (http_options.extra_body or {}) if http_options is not None else {} + merged: dict[str, Any] = dict(base) + for key, patch_val in patch.items(): + base_val = merged.get(key) + if isinstance(base_val, dict) and isinstance(patch_val, dict): + new_child = dict(base_val) + for subkey, subval in patch_val.items(): + new_child[subkey] = copy.deepcopy(subval) + merged[key] = new_child + else: + merged[key] = copy.deepcopy(patch_val) + if http_options is None: + return HttpOptions(extra_body=merged) + return http_options.model_copy(update={"extra_body": merged}) + + +def _judge_generation_config( + gen: dict[str, Any] | None, + think: Optional[bool], +) -> tuple[GenerateContentConfig, Optional[ThinkingConfig]]: + """Build GenerateContentConfig from criterion generation_config and resolve thinking config. + + Returns (cfg, effective_thinking_config): + - cfg: GenerateContentConfig WITHOUT thinking_config set (LlmAgent rejects it; + thinking_config must be applied via BuiltInPlanner). + - effective_thinking_config: None means caller should not build a planner; + otherwise caller wraps it in BuiltInPlanner. + + Resolution order: + 1. Parse gen for base fields (max_tokens/temperature/top_p/stop/...). + 2. Parse gen["thinking_config"] dict into a candidate ThinkingConfig (not written to cfg). + 3. Parse gen["http_options"] dict into cfg.http_options (if present). + 4. If `think` is not None, override the candidate ThinkingConfig and deep-merge + chat_template_kwargs.enable_thinking into cfg.http_options (preserving siblings). + """ gen = gen or {} cfg = GenerateContentConfig() cfg.max_output_tokens = (gen.get("max_tokens") or gen.get("max_output_tokens") or DEFAULT_JUDGE_MAX_TOKENS) @@ -638,7 +897,43 @@ def _judge_generation_config(gen: dict[str, Any] | None) -> GenerateContentConfi setattr(cfg, "presence_penalty", gen["presence_penalty"]) if "frequency_penalty" in gen and gen["frequency_penalty"] is not None: setattr(cfg, "frequency_penalty", gen["frequency_penalty"]) - return cfg + + # Parse thinking_config dict from generation_config (candidate; may be overridden by `think`). + effective_thinking_config: Optional[ThinkingConfig] = None + tc_dict = gen.get("thinking_config") + if isinstance(tc_dict, dict): + effective_thinking_config = ThinkingConfig(**tc_dict) + + # Parse http_options dict from generation_config, if any. + http_opts_dict = gen.get("http_options") + if isinstance(http_opts_dict, dict): + cfg.http_options = HttpOptions(**http_opts_dict) + + # `think` field overrides both paths when set. 
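+    # Illustrative merge (hypothetical values): with
+    # generation_config={"http_options": {"extra_body": {"trace_id": "x"}}} and
+    # think=False, the resulting extra_body is
+    # {"trace_id": "x", "chat_template_kwargs": {"enable_thinking": False}};
+    # caller-provided top-level keys survive because _merge_extra_body merges
+    # the chat_template_kwargs patch instead of replacing extra_body.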
+ if think is True: + effective_thinking_config = ThinkingConfig( + include_thoughts=True, + thinking_budget=-1, + ) + cfg.http_options = _merge_extra_body( + cfg.http_options, + {"chat_template_kwargs": { + "enable_thinking": True + }}, + ) + elif think is False: + effective_thinking_config = ThinkingConfig( + include_thoughts=False, + thinking_budget=0, + ) + cfg.http_options = _merge_extra_body( + cfg.http_options, + {"chat_template_kwargs": { + "enable_thinking": False + }}, + ) + + return cfg, effective_thinking_config class _JudgeAgent: @@ -651,6 +946,7 @@ def __init__( system_prompt: str, output_schema: Optional[type[PydanticBaseModel]] = None, tools: Optional[list] = None, + planner: Optional[Any] = None, ) -> None: self._agent = LlmAgent( name="judge", @@ -660,6 +956,7 @@ def __init__( add_name_to_instruction=False, output_schema=output_schema, tools=tools or [], + planner=planner, ) self._session_service = InMemorySessionService() @@ -694,9 +991,16 @@ async def get_response(self, user_message: str) -> str: class LLMJudge: - """Builds a judge agent from eval_metric. - Pluggable: messages constructor, response scorer, samples/invocations aggregators. - Defaults used when not provided. + """Builds judge agent(s) from eval_metric. Supports 1..N judge models with cross-model aggregation. + + Pluggable: messages_constructor, response_scorer, samples_aggregator, invocations_aggregator, + models_aggregator, judge_tools. + + models_aggregator resolution order: + 1) explicit constructor argument (if any) + 2) registry-registered ModelsAggregator for metric_name (resolved by caller, e.g. _judge_for_metric) + 3) criterion.models_aggregator string -> built-in 6 names + 4) fallback: all_pass """ def __init__( @@ -707,32 +1011,36 @@ def __init__( response_scorer: Optional[ResponseScorer] = None, samples_aggregator: Optional[SamplesAggregator] = None, invocations_aggregator: Optional[InvocationsAggregator] = None, + models_aggregator: Optional[ModelsAggregator] = None, judge_tools: Optional[list] = None, ) -> None: if not eval_metric: raise ValueError("LLMJudge requires eval_metric") self._eval_metric = eval_metric criterion = get_llm_criterion_from_metric(eval_metric) - if not criterion or not criterion.judge_model: - raise ValueError("eval_metric.criterion.llmJudge with judge_model is required") + if not criterion: + raise ValueError("eval_metric.criterion.llmJudge is required") + judge_models_list = criterion.get_judge_models() + if not judge_models_list: + raise ValueError("eval_metric.criterion.llmJudge requires either judge_model or judge_models") self._criterion = criterion self._metric_name = eval_metric.metric_name or "" - - opts = criterion.judge_model - provider_name = _expand_env(opts.provider_name or "") - model_name = _expand_env(opts.model_name or "") - base_url = _expand_env(opts.base_url or "") - api_key = _expand_env(opts.api_key or "") - model_str = f"{provider_name or 'openai'}/{model_name}" - extra = dict(opts.extra_fields or {}) - model = ModelRegistry.create_model( - model_str, - api_key=api_key, - base_url=base_url or "", - **extra, - ) - cfg = _judge_generation_config(opts.generation_config) - + self._judge_models: list[JudgeModelOptions] = judge_models_list + self._parallel: bool = bool(criterion.parallel) + + # Resolve models_aggregator: explicit > built-in name lookup > error. 
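+        # Note: registry-registered aggregators are not looked up here; the
+        # _judge_for_metric factory in _llm_evaluator.py resolves
+        # LLM_EVALUATOR_REGISTRY.get_models_aggregator(name) and passes it in
+        # as the explicit `models_aggregator` argument, which takes priority.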
+ resolved_models_agg = models_aggregator + if resolved_models_agg is None: + agg_name = criterion.models_aggregator or "all_pass" + built = get_builtin_models_aggregator(agg_name) + if built is None: + raise ValueError(f"models_aggregator {agg_name!r} is not a built-in name; " + f"register it via LLM_EVALUATOR_REGISTRY.register_models_aggregator " + f"before constructing LLMJudge") + resolved_models_agg = built + self._models_aggregator: ModelsAggregator = resolved_models_agg + + # Pick metric-specific system prompt + user template + output schema (unchanged from before). if self._metric_name == "llm_final_response": system_prompt = FINAL_RESPONSE_PROMPT user_template = ("\n" @@ -746,6 +1054,7 @@ def __init__( "\n" "{expected_response}\n" "") + output_schema: Optional[type[PydanticBaseModel]] = FinalResponseOutput elif self._metric_name == "llm_rubric_response": system_prompt = _rubric_system(RUBRIC_RESPONSE_PROMPT) user_template = ("\n" @@ -763,6 +1072,7 @@ def __init__( "\n" "{rubrics}\n" "") + output_schema = RubricJudgeOutput elif self._metric_name == "llm_rubric_knowledge_recall": system_prompt = _rubric_system(RUBRIC_KNOWLEDGE_RECALL_PROMPT) user_template = ("\n" @@ -778,15 +1088,25 @@ def __init__( "\n" "{rubrics}\n" "") + output_schema = RubricJudgeOutput else: raise ValueError(f"Unsupported metric_name for LLMJudge: {self._metric_name!r}") - if self._metric_name == "llm_final_response": - output_schema: Optional[type[PydanticBaseModel]] = FinalResponseOutput - else: - output_schema = RubricJudgeOutput - - self._agent = _JudgeAgent(model, cfg, system_prompt, output_schema=output_schema, tools=judge_tools) + # Build one _JudgeAgent per judge model option, in order. + self._judge_agents: list[_JudgeAgent] = [] + for opts in judge_models_list: + model = _create_judge_model(opts) + cfg, effective_tc = _judge_generation_config(opts.generation_config, opts.think) + planner = (BuiltInPlanner(thinking_config=effective_tc) if effective_tc is not None else None) + self._judge_agents.append( + _JudgeAgent( + model, + cfg, + system_prompt, + output_schema=output_schema, + tools=judge_tools, + planner=planner, + )) self._messages_constructor = messages_constructor or DefaultMessagesConstructor(user_template) self._response_scorer = response_scorer or DefaultResponseScorer() @@ -794,25 +1114,69 @@ def __init__( self._invocations_aggregator = invocations_aggregator or AverageInvocationsAggregator() def get_num_samples(self) -> int: - """Return the number of judge samples to run per invocation (e.g. for majority vote).""" + """Return num_samples for the *first* judge model (legacy single-model API). + + Multi-model judges may use different num_samples per model; callers that need + per-model sample counts should iterate criterion.get_judge_models() directly. + """ return self._criterion.get_num_samples() + async def _run_one_judge( + self, + agent_index: int, + opts: JudgeModelOptions, + user_message: str, + threshold: float, + ) -> "tuple[NamedScoreResult, ScoreResult, bool]": + """Run num_samples calls for one judge model, then SamplesAggregator. + + Returns (named_score, raw_score_result, had_exception). On exception, returns + a soft-failure NamedScoreResult with passed=False, score=0.0, reason=str(exc), + and had_exception=True. 
+ """ + agent = self._judge_agents[agent_index] + n = opts.get_num_samples() + try: + samples: list[ScoreResult] = [] + for _ in range(n): + response_text = await agent.get_response(user_message) + samples.append(self._response_scorer.parse_response(response_text, self._metric_name)) + chosen = self._samples_aggregator.aggregate_samples(samples, threshold) + except Exception as exc: + named = NamedScoreResult( + model_name=opts.model_name or "", + provider_name=opts.provider_name or "", + score=0.0, + reason=str(exc), + rubric_scores=[], + passed=False, + ) + return named, ScoreResult(score=0.0, reason=str(exc)), True + passed = (chosen.score or 0.0) >= threshold + named = NamedScoreResult( + model_name=opts.model_name or "", + provider_name=opts.provider_name or "", + score=chosen.score or 0.0, + reason=chosen.reason or "", + rubric_scores=list(chosen.rubric_scores or []), + passed=passed, + ) + return named, chosen, False + async def evaluate( self, actual_invocations: list[Invocation], expected_invocations: Optional[list[Invocation]], ) -> EvaluationResult: - """Run the judge for each invocation, aggregate samples then invocations, and return EvaluationResult.""" + """Run multi-model judge per invocation, aggregate per-model + per-invocation.""" if expected_invocations is None: expected_invocations = [] if len(actual_invocations) != len(expected_invocations): raise ValueError(f"actual_invocations ({len(actual_invocations)}) and " f"expected_invocations ({len(expected_invocations)}) length mismatch") - num_samples = self.get_num_samples() - if num_samples <= 0: - raise ValueError("num_samples must be greater than 0") threshold = self._eval_metric.threshold + weights = [m.weight for m in self._judge_models] per_invocation_results: list[PerInvocationResult] = [] for i in range(len(actual_invocations)): @@ -826,22 +1190,56 @@ async def evaluate( self._metric_name, ) - samples: list[ScoreResult] = [] - for _ in range(num_samples): - response_text = await self._agent.get_response(user_message) - samples.append(self._response_scorer.parse_response(response_text, self._metric_name)) + # Step 1: each model runs its own samples + SamplesAggregator -> (named, raw, had_exception) + if self._parallel and len(self._judge_models) > 1: + tasks = [ + self._run_one_judge(idx, opts, user_message, threshold) + for idx, opts in enumerate(self._judge_models) + ] + triples = await asyncio.gather(*tasks) + else: + triples = [] + for idx, opts in enumerate(self._judge_models): + triples.append(await self._run_one_judge(idx, opts, user_message, threshold)) + + named_results: list[NamedScoreResult] = [t[0] for t in triples] + score_results: list[ScoreResult] = [t[1] for t in triples] + exceptions: list[bool] = [t[2] for t in triples] + + # Step 2: if every model raised, mark NOT_EVALUATED. 
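+            # A model that raised has already contributed a passed=False, score=0.0
+            # soft-failure entry in step 1 and still participates in aggregation;
+            # only the case where every model raised short-circuits to NOT_EVALUATED.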
+ all_exception = all(exceptions) and len(exceptions) > 0 + + if all_exception: + per_invocation_results.append( + PerInvocationResult( + actual_invocation=actual, + expected_invocation=expected, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + reason="all judge models failed: " + "; ".join(f"{n.model_name}={n.reason}" + for n in named_results), + rubric_scores=None, + per_model_scores=named_results, + )) + continue - chosen = self._samples_aggregator.aggregate_samples(samples, threshold) - status = EvalStatus.PASSED if (chosen.score or 0) >= threshold else EvalStatus.FAILED - rubric_scores = (list(chosen.rubric_scores) if chosen.rubric_scores else None) + # Step 3: cross-model aggregation -> single ScoreResult + invocation_score = self._models_aggregator.aggregate_models( + score_results, + threshold, + weights, + ) + status = (EvalStatus.PASSED if (invocation_score.score or 0.0) >= threshold else EvalStatus.FAILED) + rubric_scores = (list(invocation_score.rubric_scores) if invocation_score.rubric_scores else None) per_invocation_results.append( PerInvocationResult( actual_invocation=actual, expected_invocation=expected, - score=chosen.score, + score=invocation_score.score, eval_status=status, - reason=chosen.reason or None, + reason=invocation_score.reason or None, rubric_scores=rubric_scores, + per_model_scores=named_results, )) overall_score, overall_status = self._invocations_aggregator.aggregate_invocations( diff --git a/trpc_agent_sdk/evaluation/_local_eval_service.py b/trpc_agent_sdk/evaluation/_local_eval_service.py index 5b5afef..2ea1a8d 100644 --- a/trpc_agent_sdk/evaluation/_local_eval_service.py +++ b/trpc_agent_sdk/evaluation/_local_eval_service.py @@ -94,7 +94,7 @@ class LocalEvalService(BaseEvalService): def __init__( self, - root_agent: BaseAgent, + root_agent: Optional[BaseAgent], eval_sets_manager: EvalSetsManager, evaluator_registry: Optional[EvaluatorRegistry] = None, session_service: Optional[BaseSessionService] = None, @@ -109,7 +109,9 @@ def __init__( """Initialize the local evaluation service. Args: - root_agent: The agent to evaluate + root_agent: The agent to evaluate. May be ``None`` only when every + eval case to be processed uses ``eval_mode='trace'``; standard + or mixed modes require a concrete agent. eval_sets_manager: Manager for eval sets storage evaluator_registry: Registry of metric evaluators session_service: Session service for maintaining state @@ -242,6 +244,13 @@ async def evaluate( Yields: EvalCaseResult for each evaluated inference """ + # Fail-fast: validate every (eval_case × metric) pair is semantically + # feasible before running any evaluator. See Bug 3.1 design doc. + self._validate_metric_compat( + inference_results=evaluate_request.inference_results, + evaluate_config=evaluate_request.evaluate_config, + ) + run_ctx: dict[str, Any] = {} start_time = time.monotonic() eval_case_results_list: list[EvalCaseResult] = [] @@ -483,6 +492,93 @@ async def _evaluate_metric( expected_invocations=expected_invocations, ) + def _validate_metric_compat( + self, + inference_results: list[InferenceResult], + evaluate_config: EvaluateConfig, + ) -> None: + """Fail-fast check every (eval_case × metric) pair for semantic feasibility. + + Reference-based metrics (``requires_reference=True``) need a real + expected answer in ``eval_case.conversation``. 
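+        Whether a metric needs a reference is read from the evaluator class's
+        ``requires_reference`` attribute (treated as ``True`` when unset).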
Scenario-3 trace cases + (only ``actual_conversation`` set) have placeholder-only expected + invocations, which would produce silent 0.0 (rouge/final_response) or + phantom 1.0 (tool_trajectory with subset_matching) results. + + This method aggregates ALL incompatible pairs across the request and + raises a single ValueError listing each one, so the user can fix them + in one pass. See Bug 3.1 of the strict-compat design doc. + """ + incompatible: list[tuple[str, str, str]] = [] + for ir in inference_results: + eval_case = self._eval_sets_manager.get_eval_case( + app_name=ir.app_name, + eval_set_id=ir.eval_set_id, + eval_case_id=ir.eval_case_id, + ) + if eval_case is None: + # The per-case evaluate path will raise a clear error later; + # skip here to avoid duplicate / misleading messages. + continue + has_reference = self._case_has_reference(eval_case) + for eval_metric in evaluate_config.eval_metrics: + try: + evaluator_cls = self._evaluator_registry.get_evaluator_class(eval_metric) + except ValueError: + # Unknown metric — let the regular evaluate loop surface the + # registry error so the message stays consistent. + continue + if getattr(evaluator_cls, "requires_reference", True) and not has_reference: + incompatible.append(( + ir.eval_case_id, + eval_metric.metric_name, + "requires reference answer (set in eval_case.conversation)", + )) + + if incompatible: + raise ValueError(self._format_compat_error(incompatible)) + + @staticmethod + def _case_has_reference(eval_case: EvalCase) -> bool: + """True iff ``expected_invocations`` will carry a real reference answer. + + - Non-trace (``eval_mode is None``): ``conversation`` IS the expected → True. + - Scenario 1 (trace + both conversation and actual_conversation): True. + - Scenario 3 (trace + only actual_conversation): expected is a placeholder + built by ``_trace_expecteds_for_eval`` → False. + """ + if eval_case.eval_mode != EvalModeTrace: + return True + return bool(eval_case.conversation and eval_case.actual_conversation) + + @staticmethod + def _format_compat_error(incompatible: list[tuple[str, str, str]]) -> str: + """Aggregate incompatible pairs into a single actionable error message. + + Groups by eval_case_id for readability; always closes with a fix guide. + """ + by_case: dict[str, list[tuple[str, str]]] = {} + for eval_id, metric_name, reason in incompatible: + by_case.setdefault(eval_id, []).append((metric_name, reason)) + + lines = ["evaluator config incompatible with eval_set:", ""] + for eval_id in sorted(by_case): + lines.append(f" eval_case='{eval_id}' (scenario: trace-without-reference):") + for metric_name, reason in by_case[eval_id]: + lines.append(f" - metric '{metric_name}' {reason}") + lines.extend([ + "", + "To fix, choose one:", + " (a) Remove the incompatible metrics from your EvaluateConfig.", + " (b) Use reference-free metrics only: llm_rubric_response, " + "llm_rubric_knowledge_recall.", + " (c) Upgrade eval_cases to scenario-1 by providing BOTH `conversation` " + "(expected) and `actual_conversation` (actual) fields.", + "", + f"Incompatible (metric_name, eval_id) pairs: {len(incompatible)}", + ]) + return "\n".join(lines) + def _generate_final_eval_status(self, overall_eval_metric_results: list[EvalMetricResult]) -> EvalStatus: """Determine final evaluation status from all metrics. 
@@ -539,6 +635,10 @@ async def _perform_inference_single_eval_item( raise ValueError( f"inference eval case (eval_case_id={eval_case.eval_id}, session_id={session_id}): " "actual_conversation is only supported in trace mode") + if root_agent is None: + raise ValueError(f"inference eval case (eval_case_id={eval_case.eval_id}, " + f"session_id={session_id}): a root_agent is required for " + f"standard (non-trace) eval_mode; got root_agent=None") inferences = await self._generate_inferences_from_agent( agent=root_agent, eval_case=eval_case, diff --git a/trpc_agent_sdk/evaluation/_rouge_evaluator.py b/trpc_agent_sdk/evaluation/_rouge_evaluator.py index a6b7ff3..48c720f 100644 --- a/trpc_agent_sdk/evaluation/_rouge_evaluator.py +++ b/trpc_agent_sdk/evaluation/_rouge_evaluator.py @@ -51,6 +51,8 @@ class RougeEvaluator(Evaluator): Score range: [0, 1], where 1 means perfect match. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None, diff --git a/trpc_agent_sdk/evaluation/_trajectory_evaluator.py b/trpc_agent_sdk/evaluation/_trajectory_evaluator.py index e776b6f..82da309 100644 --- a/trpc_agent_sdk/evaluation/_trajectory_evaluator.py +++ b/trpc_agent_sdk/evaluation/_trajectory_evaluator.py @@ -55,6 +55,8 @@ class TrajectoryEvaluator(Evaluator): Without: strict count, order, name and arguments match. """ + requires_reference = True + def __init__( self, threshold: Optional[float] = None,