From b6b682b3f257216d3c65021ba61dca145c3417b4 Mon Sep 17 00:00:00 2001 From: ricknie Date: Wed, 13 May 2026 19:40:32 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20eval=E6=A8=A1=E5=9D=97=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E6=8E=A5=E5=85=A5=E5=A4=96=E9=83=A8agent=E8=AF=84?= =?UTF-8?q?=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TAPD: --story=134277610 --- docs/mkdocs/en/evaluation.md | 240 ++++++--- docs/mkdocs/zh/evaluation.md | 241 ++++++--- .../test_agent_evaluator_call_agent.py | 170 +++++++ tests/evaluation/test_remote_eval_service.py | 231 +++++++++ trpc_agent_sdk/evaluation/__init__.py | 4 + trpc_agent_sdk/evaluation/_agent_evaluator.py | 52 +- .../evaluation/_remote_eval_service.py | 466 ++++++++++++++++++ 7 files changed, 1256 insertions(+), 148 deletions(-) create mode 100644 tests/evaluation/test_agent_evaluator_call_agent.py create mode 100644 tests/evaluation/test_remote_eval_service.py create mode 100644 trpc_agent_sdk/evaluation/_remote_eval_service.py diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index ca51a85..9028b06 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -65,6 +65,7 @@ Triggering evaluation through pytest allows eval test cases to be integrated int | Knowledge Recall Evaluation | Evaluates whether retrieved knowledge in RAG scenarios is sufficient to support the answer | Verify that knowledge base retrieval results cover the key facts in the question | | Multiple Runs and Statistics | Runs the same test case multiple times, computing stability metrics such as pass@k | Evaluate the Agent's pass rate across multiple attempts | | Trace Replay | Skips inference, directly scores using pre-recorded conversation traces | Perform offline evaluation using production logs without consuming inference resources | +| External Agent Evaluation | Evaluate Agents not created by this framework via `call_agent` (HTTP services, CLI, other frameworks) | Run regression tests against an existing Claude Code CLI or remote API | | Callback Hooks | Attach custom logic at 8 lifecycle points during inference/scoring | Instrumentation, logging, sampling, reporting | #### Overall Evaluation Flow @@ -299,7 +300,7 @@ This section explains the components of the evaluation module and their relation | **AgentEvaluator** | The entry point exposed to users, providing `evaluate()` and `get_executer()` | Call it in pytest tests | | **Eval Set (EvalSet)** | Describes "what to test"—scenarios, user inputs, expected outputs | Write `.evalset.json` files | | **Eval Config (EvalConfig)** | Describes "how to judge"—which metrics, thresholds, matching rules | Write `test_config.json` files | -| **Eval Service (LocalEvalService)** | The engine that executes inference and scoring | Automatically created by the framework; usually no action needed | +| **Eval Service (LocalEvalService / RemoteEvalService)** | The engine that executes inference and scoring (local Agent or `call_agent`) | Automatically created by the framework; usually no action needed | | **Evaluator** | The concrete implementation that computes scores per metric | Choose built-in evaluators, or register custom ones | | **Evaluator Registry (EvaluatorRegistry)** | Maintains the mapping from `metric_name` to evaluator type | Register when custom evaluators are needed | | **Evaluation Result (EvaluateResult)** | Holds the structured evaluation results | Obtain and analyze via `get_result()` | @@ -308,12 +309,31 @@ This section explains the 
components of the evaluation module and their relation AgentEvaluator is the entry point and orchestrator of the entire evaluation flow: -1. **Loading Phase**: AgentEvaluator loads the EvalSet from eval set files (`.evalset.json` / `.test.json`), loads the EvalConfig from `test_config.json` in the same directory, and loads the Agent by `agent_module`. -2. **Building the Eval Service**: AgentEvaluator writes the EvalSet into InMemoryEvalSetsManager and creates LocalEvalService (depending on the Manager, UserSimulatorProvider, optional EvalSetResultsManager, Runner, and Callbacks). By default, it uses StaticUserSimulator, which drives inference using user_content from the conversation. Optionally, LocalEvalSetResultsManager can be injected to persist run results to a directory. -3. **Inference Phase**: The eval service drives the Runner for inference based on test cases and conversations in the EvalSet, producing actual Invocation lists (actual tool calls, actual responses). +1. **Loading Phase**: AgentEvaluator loads the EvalSet from eval set files (`.evalset.json` / `.test.json`), loads the EvalConfig from `test_config.json` in the same directory; for local Agent paths, loads the Agent by `agent_module` (can be omitted when using `call_agent` or when all cases use [Trace Mode](#trace-mode)). +2. **Building the Eval Service**: AgentEvaluator writes the EvalSet into InMemoryEvalSetsManager; when `call_agent` is provided, creates RemoteEvalService; otherwise creates LocalEvalService (depending on the Manager, UserSimulatorProvider, optional EvalSetResultsManager, Runner, and Callbacks). +3. **Inference Phase**: The eval service performs turn-by-turn inference based on test cases and conversations in the EvalSet: LocalEvalService drives the Runner to call the Agent; RemoteEvalService calls `call_agent(query)` to obtain each turn's actual response, producing actual Invocation lists. 4. **Scoring Phase**: The eval service obtains evaluators from the EvaluatorRegistry based on the EvalMetric list in the EvalConfig, scores actual vs. expected item by item, and aggregates into EvalCaseResult. 5. **Result Aggregation**: AgentEvaluator determines pass/fail based on results, raises `AssertionError` when any test case falls below the threshold, and optionally persists results as `.evalset_result.json`. +#### AgentEvaluator Parameter List + +`evaluate()` and `get_executer()` accept the same parameters (`evaluate()` internally calls `get_executer()`): + +| Parameter | Type | Description | +| --- | --- | --- | +| eval_dataset_file_path_or_dir | str | Path to eval set file or directory (recursively scans `.evalset.json` / `.test.json`) | +| agent_module | str \| None | Python module path of the Agent created by this framework; mutually exclusive with `call_agent`. 
Not needed when using `call_agent` or when all cases are Trace mode | +| call_agent | CallAgent \| None | Async callable for Agents not created by this framework (`async def(str)->str`); mutually exclusive with `agent_module` / `runner` | +| num_runs | int | Number of runs per eval set, default 1 | +| agent_name | str \| None | Display name of the Agent | +| print_detailed_results | bool | Whether to print per-case detail comparisons, default True | +| eval_result_output_dir | str \| None | Directory for result persistence; omit for in-memory aggregation only | +| runner | Runner \| None | Custom Runner instance; mutually exclusive with `call_agent` | +| case_parallelism | int \| None | Max concurrent cases during inference | +| case_eval_parallelism | int \| None | Max concurrent cases during scoring | +| callbacks | Callbacks \| None | Lifecycle callbacks | +| eval_metrics_file_path_or_dir | str \| None | Shared eval config file path (overrides same-directory `test_config.json`) | + --- ### Eval Set (EvalSet) Writing Guide @@ -458,6 +478,8 @@ Configuration keys support both snake_case (e.g., `metric_name`) and camelCase ( | `llm_rubric_response` | LLMRubricResponseEvaluator | LLM judge scores item by item against rubrics | Need to evaluate response quality across multiple dimensions (correctness, relevance, compliance, etc.) | | `llm_rubric_knowledge_recall` | LLMRubricKnowledgeRecallEvaluator | LLM judge evaluates whether retrieved knowledge is sufficient to support the answer | RAG scenarios; need to verify that retrieved knowledge covers key facts | +> Note: `call_agent` mode does not support `tool_trajectory_avg_score`. When evaluating external/black-box Agents, prefer `final_response_avg_score` or LLM Judge metrics. + **Rubric** refers to evaluation rubrics: in the configuration, `rubrics` is an array listing multiple independently assessable clauses (e.g., "the answer must contain a conclusion," "must be relevant to the question"). The LLM judge gives a pass/fail for each rubric, then aggregates them into the metric's score. #### How to Choose Metrics @@ -819,70 +841,7 @@ LLM response quality with rubrics (llm_rubric_response or llm_rubric_knowledge_r It is recommended to use environment variable placeholders for `api_key` and `base_url` (e.g., `${TRPC_AGENT_API_KEY}`), which are replaced by the execution environment, to avoid writing plaintext in configuration files. -**Multi-model judge (cross-model aggregation)** - -A single LLM-judge metric may use multiple judge models simultaneously and combine their verdicts via `models_aggregator`. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). - -Built-in aggregators: - -| Name | Pass rule | Overall score | -| --- | --- | --- | -| `all_pass` (default) | all models pass | min of per-model scores | -| `any_pass` | any model passes | max of per-model scores | -| `majority_pass` | strict majority passes (`passed*2 > total`) | `passed_count / total` | -| `avg` | mean ≥ threshold | mean of per-model scores | -| `weighted_avg` | weighted mean ≥ threshold | `sum(w*s) / sum(w)` | -| `weighted_majority` | weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | - -If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. 
- -```json -{ - "metrics": [ - { - "metric_name": "llm_final_response", - "threshold": 1, - "criterion": { - "llm_judge": { - "judge_models": [ - { - "model_name": "glm-4.7", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 2.0 - }, - { - "model_name": "gpt-4o", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 1.0 - } - ], - "models_aggregator": "weighted_avg", - "parallel": true - } - } - } - ] -} -``` - -`parallel` controls how multiple judge models are executed: `true` (default) calls all models concurrently, with latency bounded by the slowest model; `false` calls them sequentially in the declared order. Only takes effect when `judge_models` contains more than one model. - -If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is a structured JSON, thinking traces add no value to the final verdict, and disabling thinking significantly reduces token cost and latency. Each judge model has its own independent `think` flag. - -Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name written in the criterion: - -```python -from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult - -def my_aggregator(per_model, threshold, weights): - # per_model: list[ScoreResult]; weights: list[float] - score = sum(s.score or 0.0 for s in per_model) / len(per_model) - return ScoreResult(score=score, reason="custom aggregation") - -LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) -``` +> A single LLM judge metric can also use multiple judge models with aggregated results. See [Advanced Features - Multi-Model Judge (Cross-Model Aggregation)](#multi-model-judge-cross-model-aggregation). #### Custom Criteria @@ -1822,10 +1781,155 @@ async def test_pass_at_k(): For the complete example, see [examples/evaluation/pass_at_k/](../../../examples/evaluation/pass_at_k/). +#### Evaluating Agents Not Created by This Framework (call_agent) + +If the Agent under test is not created or managed by this framework (e.g., deployed behind an HTTP/RPC service, invoked via CLI, or wrapped by another framework), and you cannot provide `agent_module` or `runner`, use the **`call_agent`** parameter instead: pass an async function, and the evaluator will call it each turn to obtain the actual response. The rest of the scoring flow remains unchanged. + +**Configuration** + +Pass `call_agent=your_async_fn` in **AgentEvaluator.evaluate()** or **get_executer()**, without passing `agent_module` or `runner`. The signature must be `async def call_agent(query: str) -> str`. + +**Applicable Scenarios** + +Evaluating any callable that cannot be instantiated as this framework's `BaseAgent`: HTTP/RPC remote services, CLI Agents, other frameworks (LangChain / AutoGen / custom), etc. 
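+
+For example, if the Agent sits behind an HTTP service, the adapter is just a thin async client. A minimal sketch using `httpx` (the endpoint URL and JSON schema are illustrative assumptions; adjust them to your service's actual API):
+
+```python
+import httpx
+
+
+async def call_agent(query: str) -> str:
+    """Hypothetical HTTP adapter: POST the query, return the text reply."""
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        resp = await client.post(
+            "http://localhost:8080/chat",  # assumed endpoint
+            json={"query": query},  # assumed request schema
+        )
+        resp.raise_for_status()
+        return resp.json()["answer"]  # assumed response field
+```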
+ +**Constraints** + +- `call_agent` must be async (passing a sync function raises `ValueError`) +- `call_agent` is mutually exclusive with `agent_module` / `runner` (passing both raises `ValueError`) +- `call_agent` mode is mutually exclusive with Trace mode (eval set containing trace cases raises `ValueError`) +- `call_agent` mode does not support `tool_trajectory_avg_score` (raises `ValueError`); use `final_response_avg_score`, `llm_final_response`, or `llm_rubric_response` instead +- Multi-turn cases call `call_agent` sequentially per turn; each call corresponds to one `Invocation` + +**Example**: Using Claude Code CLI as an external Agent + +```python +import asyncio +import os +from asyncio.subprocess import PIPE + +from trpc_agent_sdk.evaluation import AgentEvaluator + + +async def call_agent(query: str) -> str: + """Call Claude Code CLI and return its text output.""" + cli_bin = os.getenv("CLAUDE_CODE_BIN", "claude") + cli_args = [cli_bin, "-p", query] + + model_name = os.getenv("CLAUDE_CODE_MODEL") + if model_name: + cli_args.extend(["--model", model_name]) + + proc = await asyncio.create_subprocess_exec(*cli_args, stdout=PIPE, stderr=PIPE) + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise RuntimeError(stderr.decode("utf-8", errors="ignore").strip()) + + output_text = stdout.decode("utf-8", errors="ignore").strip() + for line in output_text.splitlines(): + if line.strip(): + return line.strip() + return "" + + +# Option A: pass/fail only +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) + +# Option B: structured results +executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) +await executer.evaluate() +result = executer.get_result() # EvaluateResult +``` + +> The example uses `claude` as the default command. If your executable name differs (e.g., `trpc-claudecode` or a custom wrapper), set the `CLAUDE_CODE_BIN` environment variable accordingly. For HTTP service scenarios, simply replace the `call_agent` function body with `aiohttp` / `httpx` calls while keeping the signature `async def call_agent(query: str) -> str`. + +#### Multi-Model Judge (Cross-Model Aggregation) + +A single LLM judge metric can use multiple judge models simultaneously, aggregating their verdicts via `models_aggregator` to reduce single-model variance. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). + +**Configuration** + +In `test_config.json`, for any LLM judge metric's `criterion.llm_judge`, replace `judge_model` with `judge_models` (array), and set `models_aggregator` to choose the aggregation strategy. `parallel` controls execution: `true` (default) calls all models concurrently; `false` calls them sequentially. + +**Applicable Scenarios** + +When higher confidence in judge verdicts is needed (e.g., safety compliance, medical scenarios), or when comparing judgments across different models. 
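+
+As a quick sanity check against the aggregator table below: with two judge models weighted `w = [2.0, 1.0]` and per-model scores `s = [1.0, 0.5]`, `weighted_avg` yields `(2.0 * 1.0 + 1.0 * 0.5) / 3.0 ≈ 0.83`, whereas `all_pass` reports `min(1.0, 0.5) = 0.5` and fails a threshold of 1.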
+ +**Built-in Aggregators** + +| Name | Pass Rule | Overall Score | +| --- | --- | --- | +| `all_pass` (default) | All models pass | min of per-model scores | +| `any_pass` | Any model passes | max of per-model scores | +| `majority_pass` | Strict majority passes (`passed*2 > total`) | `passed_count / total` | +| `avg` | Mean ≥ threshold | mean of per-model scores | +| `weighted_avg` | Weighted mean ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | Weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | + +If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. + +**Example**: Two judge models with weighted average aggregation + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is structured JSON, and thinking traces add no value to the final verdict. Disabling thinking significantly reduces token cost and latency. + +**Custom Aggregators** + +Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name in the criterion: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Trace Mode In default mode, the eval service actually calls the Agent for inference. If you already have pre-recorded conversation traces (e.g., production logs, historical sessions) and want to only "score" without repeating inference, you can use **Trace mode**: set **eval_mode: "trace"** on the case and provide **actual_conversation**; the eval service will skip inference and directly use that trace for scoring. +> Note: Trace mode and `call_agent` mode are mutually exclusive; when `call_agent` is provided and the eval set contains trace cases, the framework raises `ValueError` at startup. + **Configuration Methods** - Set **eval_mode**: `"trace"` on the **EvalCase**. 
diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 564e0e0..68a9eed 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -65,6 +65,7 @@ tRPC-Agent 评测模块是一套**自动化 Agent 质量检验工具**。它让 | 知识召回评估 | 评估 RAG 场景下检索到的知识是否足以支撑回答 | 验证知识库检索结果覆盖了问题中的关键事实 | | 多轮运行与统计 | 同一用例跑多次,计算 pass@k 等稳定性指标 | 评估 Agent 在多次尝试中的通过率 | | Trace 回放 | 跳过推理,直接用录制好的对话轨迹打分 | 用线上日志做离线评估,不消耗推理资源 | +| 外部 Agent 评测 | 通过 `call_agent` 评测非本框架创建的 Agent(HTTP 服务、CLI、其他框架) | 对已有 Claude Code CLI 或远程 API 做回归评测 | | 回调钩子 | 在推理/打分的 8 个生命周期节点挂载自定义逻辑 | 打点、日志、采样、上报 | #### 评测整体流程 @@ -283,6 +284,7 @@ pytest test_quickstart.py -v --tb=short -s - **有用例未达阈值**:框架会抛出 `AssertionError`,失败摘要以 JSON 形式包含在错误信息中。 - **结果落盘**:若调用时传入 `eval_result_output_dir`,当次评测结果会写入该目录下的 `.evalset_result.json` 文件(详见[评测结果](#评测结果)一节)。 + --- ### 核心概念 @@ -296,7 +298,7 @@ pytest test_quickstart.py -v --tb=short -s | **AgentEvaluator** | 对用户暴露的入口,提供 `evaluate()` 与 `get_executer()` | 在 pytest 测试中调用它 | | **评测集(EvalSet)** | 描述"测什么"——场景、用户输入、预期输出 | 编写 `.evalset.json` 文件 | | **评测配置(EvalConfig)** | 描述"怎么判"——用哪些指标、阈值、匹配规则 | 编写 `test_config.json` 文件 | -| **评估服务(LocalEvalService)** | 执行推理与打分的引擎 | 框架自动创建,通常无需关心 | +| **评估服务(LocalEvalService / RemoteEvalService)** | 执行推理与打分的引擎(本地 Agent 或 `call_agent`) | 框架自动创建,通常无需关心 | | **评估器(Evaluator)** | 按指标计算分数的具体实现 | 选择内置评估器,或注册自定义 | | **评估器注册表(EvaluatorRegistry)** | 维护 `metric_name` → 评估器类型的映射 | 需要自定义评估器时注册 | | **评测结果(EvaluateResult)** | 承载评测的结构化结果 | 通过 `get_result()` 获取并分析 | @@ -305,12 +307,31 @@ pytest test_quickstart.py -v --tb=short -s AgentEvaluator 是整个评测流程的入口和编排者: -1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent(若整集为 [Trace 模式](#trace-模式),此步可省略)。 -2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager,创建 LocalEvalService(依赖该 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。默认使用 StaticUserSimulator,按 conversation 的 user_content 驱动推理。可选注入 LocalEvalSetResultsManager 将运行结果写入目录。 -3. **推理阶段**:评估服务按 EvalSet 中的用例与 conversation 驱动 Runner 推理,得到实际 Invocation 列表(实际工具调用、实际回复)。 +1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig;若走本地 Agent 路径,按 `agent_module` 加载 Agent(使用 `call_agent` 或整集为 [Trace 模式](#trace-模式) 时,此步可省略)。 +2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager;传 `call_agent` 时创建 RemoteEvalService,否则创建 LocalEvalService(依赖 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。 +3. **推理阶段**:评估服务按 EvalSet 的用例与 conversation 逐轮推理:LocalEvalService 通过 Runner 调 Agent;RemoteEvalService 通过 `call_agent(query)` 获取每轮实际回复,得到实际 Invocation 列表。 4. **打分阶段**:评估服务根据 EvalConfig 中的 EvalMetric 列表,从 EvaluatorRegistry 获取各评估器,对实际与预期逐项打分并汇总为 EvalCaseResult。 5. 
**结果汇总**:AgentEvaluator 根据结果判定通过/失败,有用例未达阈值时抛出 `AssertionError`,可选将结果落盘为 `.evalset_result.json`。 +#### AgentEvaluator 参数列表 + +`evaluate()` 与 `get_executer()` 接受相同的参数(`evaluate()` 内部调用 `get_executer()`): + +| 参数 | 类型 | 说明 | +| --- | --- | --- | +| eval_dataset_file_path_or_dir | str | 评测集文件或目录路径(递归扫描 `.evalset.json` / `.test.json`) | +| agent_module | str \| None | 本框架 Agent 所在 Python 模块路径;与 `call_agent` 互斥。传 `call_agent` 时不需要;全部 case 为 Trace 模式时也不需要 | +| call_agent | CallAgent \| None | 非本框架 Agent 的异步可调用对象(`async def(str)->str`);与 `agent_module` / `runner` 互斥 | +| num_runs | int | 每个评测集运行次数,默认 1 | +| agent_name | str \| None | Agent 显示名称 | +| print_detailed_results | bool | 是否打印每个用例的详细对比信息,默认 True | +| eval_result_output_dir | str \| None | 结果落盘目录;不传则仅内存聚合 | +| runner | Runner \| None | 自定义 Runner 实例;与 `call_agent` 互斥 | +| case_parallelism | int \| None | 推理阶段最大并发用例数 | +| case_eval_parallelism | int \| None | 打分阶段最大并发用例数 | +| callbacks | Callbacks \| None | 生命周期回调 | +| eval_metrics_file_path_or_dir | str \| None | 共享评测配置文件路径(覆盖同目录 `test_config.json`) | + --- ### 评测集(EvalSet)编写指南 @@ -455,6 +476,8 @@ Trace 模式的配置详见[高级功能 - Trace 模式](#trace-模式)。 | `llm_rubric_response` | LLMRubricResponseEvaluator | LLM 裁判按评估细则逐项打分 | 需要从多个维度(正确性、相关性、合规性等)评估回复质量 | | `llm_rubric_knowledge_recall` | LLMRubricKnowledgeRecallEvaluator | LLM 裁判评估知识检索结果是否足以支撑回答 | RAG 场景,需验证检索到的知识覆盖了关键事实 | +> 注意:`call_agent` 模式不支持 `tool_trajectory_avg_score`。评测外部黑盒 Agent 时,建议优先使用 `final_response_avg_score` 或 LLM Judge 类指标。 + **Rubric** 指评估细则:在配置中以 `rubrics` 数组列出多条可独立判定的条款(如「回答须包含结论」「须与问题相关」),LLM 裁判对每条细则给出通过与否,再汇总为该项指标的得分。 #### 如何选择指标 @@ -816,70 +839,7 @@ LLM 最终响应评判(仅需 judge_model): 建议 `api_key`、`base_url` 用环境变量占位(如 `${TRPC_AGENT_API_KEY}`),由执行环境替换,避免明文写入配置文件。 -**多裁判模型(跨模型聚合)** - -同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 - -内置聚合器: - -| 名称 | 通过规则 | 总分 | -| --- | --- | --- | -| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | -| `any_pass` | 任一模型通过 | 各模型得分的最大值 | -| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | -| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | -| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | -| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | - -若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 - -```json -{ - "metrics": [ - { - "metric_name": "llm_final_response", - "threshold": 1, - "criterion": { - "llm_judge": { - "judge_models": [ - { - "model_name": "glm-4.7", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 2.0 - }, - { - "model_name": "gpt-4o", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 1.0 - } - ], - "models_aggregator": "weighted_avg", - "parallel": true - } - } - } - ] -} -``` - -`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,耗时取决于最慢的模型;`false` 按声明顺序串行调用。仅在 `judge_models` 有多个模型时生效。 - -若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。每个裁判模型的 `think` 独立设置。 - -也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: - -```python -from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult - -def my_aggregator(per_model, threshold, weights): - # per_model: list[ScoreResult];weights: list[float] - score = sum(s.score or 0.0 for s in per_model) / len(per_model) - return ScoreResult(score=score, reason="custom aggregation") - 
-LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) -``` +> 同一个 LLM 裁判指标还可以同时使用多个裁判模型并聚合结果,详见[高级功能 - 多裁判模型(跨模型聚合)](#多裁判模型跨模型聚合)。 #### 自定义准则 @@ -1819,10 +1779,155 @@ async def test_pass_at_k(): 完整示例见 [examples/evaluation/pass_at_k/](../../../examples/evaluation/pass_at_k/)。 +#### 评测非本框架创建的 Agent(call_agent) + +若被测 Agent 不是通过本框架创建和管理的(例如部署在 HTTP/RPC 服务后面、通过 CLI 调用、或使用其他框架封装),无法提供 `agent_module` 或 `runner`,可改用 **`call_agent`** 参数:传入一个异步函数,evaluator 会在每轮对话中调用它获取实际回复,其余打分流程不变。 + +**配置方式** + +在 **AgentEvaluator.evaluate()** 或 **get_executer()** 中传入 `call_agent=your_async_fn`,不传 `agent_module` 和 `runner`。`call_agent` 的签名必须是 `async def call_agent(query: str) -> str`。 + +**适用场景** + +评测任何无法实例化为本框架 `BaseAgent` 的可调用对象:HTTP/RPC 远程服务、CLI Agent、其他框架(LangChain / AutoGen / 自研)封装的黑盒 Agent 等。 + +**约束** + +- `call_agent` 必须是异步函数(传入同步函数会报 `ValueError`) +- `call_agent` 与 `agent_module` / `runner` 互斥(同时传入会报 `ValueError`) +- `call_agent` 模式与 Trace 模式互斥(evalset 含 trace case 会报 `ValueError`) +- `call_agent` 模式不支持 `tool_trajectory_avg_score`(会报 `ValueError`);建议使用 `final_response_avg_score`、`llm_final_response` 或 `llm_rubric_response` +- 多轮 case 会按轮次依次调用 `call_agent`;每次调用对应一个 `Invocation` + +**示例**:以 Claude Code CLI 为例,将其封装为 `call_agent` 并接入评测 + +```python +import asyncio +import os +from asyncio.subprocess import PIPE + +from trpc_agent_sdk.evaluation import AgentEvaluator + + +async def call_agent(query: str) -> str: + """调用 Claude Code CLI,返回其文本输出。""" + cli_bin = os.getenv("CLAUDE_CODE_BIN", "claude") + cli_args = [cli_bin, "-p", query] + + model_name = os.getenv("CLAUDE_CODE_MODEL") + if model_name: + cli_args.extend(["--model", model_name]) + + proc = await asyncio.create_subprocess_exec(*cli_args, stdout=PIPE, stderr=PIPE) + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise RuntimeError(stderr.decode("utf-8", errors="ignore").strip()) + + output_text = stdout.decode("utf-8", errors="ignore").strip() + for line in output_text.splitlines(): + if line.strip(): + return line.strip() + return "" + + +# 方式 A:只关心 pass/fail +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) + +# 方式 B:需要结构化结果 +executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) +await executer.evaluate() +result = executer.get_result() # EvaluateResult +``` + +> 示例中默认命令是 `claude`。如果你环境里的可执行文件名不同(例如 `trpc-claudecode` 或自定义命令),将 `CLAUDE_CODE_BIN` 环境变量改为对应命令即可。对于 HTTP 服务场景,只需把 `call_agent` 函数体改为 `aiohttp` / `httpx` 调用,签名保持 `async def call_agent(query: str) -> str` 不变。 + +#### 多裁判模型(跨模型聚合) + +同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果,降低单模型裁判的波动。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 + +**配置方式** + +在 `test_config.json` 的 LLM 裁判类指标 `criterion.llm_judge` 中,将 `judge_model` 替换为 `judge_models`(数组),并设置 `models_aggregator` 选择聚合策略。`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,`false` 串行调用。 + +**适用场景** + +对评判结果要求更高置信度(如安全合规、医疗场景),或希望对比不同裁判模型的判定差异。 + +**内置聚合器** + +| 名称 | 通过规则 | 总分 | +| --- | --- | --- | +| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | +| `any_pass` | 任一模型通过 | 各模型得分的最大值 | +| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | +| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | +| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | + 
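+举例：两个裁判模型权重 `w = [2.0, 1.0]`、得分 `s = [1.0, 0.5]` 时，`weighted_avg` 的总分为 `(2.0 * 1.0 + 1.0 * 0.5) / 3.0 ≈ 0.83`；而 `all_pass` 取最小值 `0.5`，在 threshold 为 1 时判为不通过。
+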
+若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 + +**示例**:两个裁判模型按加权平均聚合 + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。 + +**自定义聚合器** + +也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Trace 模式 默认模式下,评估服务会真实调用 Agent 做推理。若你已有录制好的对话轨迹(如线上日志、历史会话),希望只做「打分」、不重复推理,可使用 **Trace 模式**:在用例上设置 **eval_mode: "trace"** 并提供 **actual_conversation**,评估服务会跳过推理,直接使用该轨迹参与打分。 +> 注意:Trace 模式与 `call_agent` 模式互斥;传入 `call_agent` 且评测集中包含 trace case 时,框架会在启动期抛出 `ValueError`。 + **配置方式** - 在 **EvalCase** 上设置 **eval_mode**: `"trace"`。 diff --git a/tests/evaluation/test_agent_evaluator_call_agent.py b/tests/evaluation/test_agent_evaluator_call_agent.py new file mode 100644 index 0000000..ca7e6a6 --- /dev/null +++ b/tests/evaluation/test_agent_evaluator_call_agent.py @@ -0,0 +1,170 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""TDD tests for AgentEvaluator call_agent routing.""" + +from __future__ import annotations + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import AgentEvaluator +from trpc_agent_sdk.evaluation import CallbackResult +from trpc_agent_sdk.evaluation import Callbacks +from trpc_agent_sdk.evaluation import EvalCase +from trpc_agent_sdk.evaluation import EvalConfig +from trpc_agent_sdk.evaluation import EvalSet +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _content(text: str) -> Content: + return Content(parts=[Part(text=text)]) + + +def _invocation(user: str, expected: str | None = None) -> Invocation: + return Invocation( + invocation_id="i", + user_content=_content(user), + final_response=_content(expected) if expected is not None else None, + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_with_call_agent_minimal(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + failed_summary, details, result_lines, eval_results = await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + print_detailed_results=False, + ) + + assert failed_summary is None + assert details == [] + assert result_lines + assert "c1" in eval_results + + +@pytest.mark.asyncio +async def test_call_agent_with_agent_module_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="mutually exclusive"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + agent_module="fake.module", + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_with_runner_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="mutually exclusive"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + runner=object(), # type: ignore[arg-type] + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_with_trace_case_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[ + EvalCase( + eval_id="trace_case", + eval_mode="trace", + actual_conversation=[_invocation("hello", "world")], + ) + ], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="trace_case"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_callbacks_e2e(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + points: list[str] = [] + + def _cb(name: str): + async 
def _fn(_ctx: dict, _args: object): + points.append(name) + return CallbackResult(context={"point": name}) + return _fn + + callbacks = Callbacks() + callbacks.register_before_inference_set("t", _cb("before_inference_set")) + callbacks.register_after_inference_set("t", _cb("after_inference_set")) + callbacks.register_before_inference_case("t", _cb("before_inference_case")) + callbacks.register_after_inference_case("t", _cb("after_inference_case")) + callbacks.register_before_evaluate_set("t", _cb("before_evaluate_set")) + callbacks.register_after_evaluate_set("t", _cb("after_evaluate_set")) + callbacks.register_before_evaluate_case("t", _cb("before_evaluate_case")) + callbacks.register_after_evaluate_case("t", _cb("after_evaluate_case")) + + async def call_agent(query: str) -> str: + return "world" + + await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + callbacks=callbacks, + ) + + assert { + "before_inference_set", + "after_inference_set", + "before_inference_case", + "after_inference_case", + "before_evaluate_set", + "after_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + }.issubset(set(points)) diff --git a/tests/evaluation/test_remote_eval_service.py b/tests/evaluation/test_remote_eval_service.py new file mode 100644 index 0000000..77d0318 --- /dev/null +++ b/tests/evaluation/test_remote_eval_service.py @@ -0,0 +1,231 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""TDD tests for RemoteEvalService.""" + +from __future__ import annotations + +import asyncio + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import CallbackResult +from trpc_agent_sdk.evaluation import Callbacks +from trpc_agent_sdk.evaluation import EvalCase +from trpc_agent_sdk.evaluation import EvalMetric +from trpc_agent_sdk.evaluation import EvalSet +from trpc_agent_sdk.evaluation import InMemoryEvalSetsManager +from trpc_agent_sdk.evaluation import InferenceConfig +from trpc_agent_sdk.evaluation import InferenceRequest +from trpc_agent_sdk.evaluation import InferenceStatus +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.evaluation import EvaluateConfig +from trpc_agent_sdk.evaluation import EvaluateRequest +from trpc_agent_sdk.evaluation import EvalStatus +from trpc_agent_sdk.evaluation._remote_eval_service import RemoteEvalService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _content(text: str) -> Content: + return Content(parts=[Part(text=text)]) + + +def _invocation(user: str, expected: str | None = None) -> Invocation: + return Invocation( + invocation_id="i", + user_content=_content(user), + final_response=_content(expected) if expected is not None else None, + ) + + +def _make_manager(eval_set: EvalSet, app_name: str = "app") -> InMemoryEvalSetsManager: + mgr = InMemoryEvalSetsManager() + mgr.create_eval_set(app_name=app_name, eval_set_id=eval_set.eval_set_id) + for case in eval_set.eval_cases: + mgr.add_eval_case(app_name=app_name, eval_set_id=eval_set.eval_set_id, eval_case=case) + return mgr + + +@pytest.mark.asyncio +async def test_perform_inference_async_callable_one_turn(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def 
call_agent(query: str) -> str: + assert query == "hello" + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=2)) + + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 1 + assert results[0].status == InferenceStatus.SUCCESS + assert results[0].inferences is not None + assert results[0].inferences[0].final_response is not None + assert results[0].inferences[0].final_response.parts[0].text == "world" + + +def test_reject_sync_callable_raises_value_error(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + def call_agent(query: str) -> str: + return query + + with pytest.raises(ValueError, match="async function"): + RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + + +@pytest.mark.asyncio +async def test_reject_trace_cases_raises_value_error(): + trace_case = EvalCase( + eval_id="trace_case", + eval_mode="trace", + actual_conversation=[_invocation("u", "a")], + ) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[trace_case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return query + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + + with pytest.raises(ValueError, match="trace_case"): + _ = [r async for r in service.perform_inference(req)] + + +@pytest.mark.asyncio +async def test_reject_tool_trajectory_metric_raises_value_error(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + evaluate_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)], + ), + ) + + with pytest.raises(ValueError, match="tool_trajectory_avg_score"): + _ = [r async for r in service.evaluate(evaluate_req)] + + +@pytest.mark.asyncio +async def test_case_fail_soft_when_call_agent_raises(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + raise RuntimeError("boom") + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 1 + assert results[0].status == InferenceStatus.FAILURE + assert results[0].error_message == "boom" + + +@pytest.mark.asyncio +async def test_case_parallel_turn_serial(): + case1 = EvalCase( + eval_id="c1", + conversation=[_invocation("q1", "a1"), _invocation("q2", "a2")], + ) + case2 = EvalCase( + eval_id="c2", + conversation=[_invocation("x1", "y1"), _invocation("x2", "y2")], + ) + 
eval_set = EvalSet(eval_set_id="s1", eval_cases=[case1, case2]) + mgr = _make_manager(eval_set) + call_order: list[str] = [] + lock = asyncio.Lock() + + async def call_agent(query: str) -> str: + async with lock: + call_order.append(query) + await asyncio.sleep(0.01) + return f"resp:{query}" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=2)) + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 2 + # Per-case should remain serial; globally interleaving is allowed. + c1_order = [q for q in call_order if q in {"q1", "q2"}] + c2_order = [q for q in call_order if q in {"x1", "x2"}] + assert c1_order == ["q1", "q2"] + assert c2_order == ["x1", "x2"] + + +@pytest.mark.asyncio +async def test_callbacks_all_nodes_called(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + points: list[str] = [] + + def _cb(name: str): + async def _fn(_ctx: dict, _args: object): + points.append(name) + return CallbackResult(context={"point": name}) + return _fn + + callbacks = Callbacks() + callbacks.register_before_inference_set("t", _cb("before_inference_set")) + callbacks.register_after_inference_set("t", _cb("after_inference_set")) + callbacks.register_before_inference_case("t", _cb("before_inference_case")) + callbacks.register_after_inference_case("t", _cb("after_inference_case")) + callbacks.register_before_evaluate_set("t", _cb("before_evaluate_set")) + callbacks.register_after_evaluate_set("t", _cb("after_evaluate_set")) + callbacks.register_before_evaluate_case("t", _cb("before_evaluate_case")) + callbacks.register_after_evaluate_case("t", _cb("after_evaluate_case")) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr, callbacks=callbacks) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + eval_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="final_response_avg_score", threshold=1.0)], + ), + ) + eval_results = [r async for r in service.evaluate(eval_req)] + + assert len(eval_results) == 1 + assert eval_results[0].final_eval_status == EvalStatus.PASSED + assert { + "before_inference_set", + "after_inference_set", + "before_inference_case", + "after_inference_case", + "before_evaluate_set", + "after_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + }.issubset(set(points)) diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 1d4eb20..4c87990 100644 --- a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -166,6 +166,8 @@ from ._llm_judge import WeightedMajorityModelsAggregator from ._llm_judge import get_builtin_models_aggregator from ._local_eval_service import LocalEvalService +from ._remote_eval_service import CallAgent +from ._remote_eval_service import RemoteEvalService from ._local_eval_set_results_manager import LocalEvalSetResultsManager from ._local_eval_sets_manager import LocalEvalSetsManager from ._local_eval_sets_manager import load_eval_set_from_file @@ -297,6 +299,8 @@ "ResponseScorerFn", "SamplesAggregatorFn", 
"LocalEvalService", + "RemoteEvalService", + "CallAgent", "RougeEvaluator", "StaticUserSimulator", "TrajectoryEvaluator", diff --git a/trpc_agent_sdk/evaluation/_agent_evaluator.py b/trpc_agent_sdk/evaluation/_agent_evaluator.py index 09e1ed3..f4d1da6 100644 --- a/trpc_agent_sdk/evaluation/_agent_evaluator.py +++ b/trpc_agent_sdk/evaluation/_agent_evaluator.py @@ -49,6 +49,8 @@ from trpc_agent_sdk.agents import BaseAgent from ._local_eval_service import LocalEvalService +from ._remote_eval_service import CallAgent +from ._remote_eval_service import RemoteEvalService from . import _utils from ._eval_callbacks import Callbacks from ._eval_case import EvalModeTrace @@ -89,6 +91,7 @@ def __init__( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -100,6 +103,7 @@ def __init__( eval_metrics_file_path_or_dir: Optional[str] = None, ): self._agent_module = agent_module + self._call_agent = call_agent self._eval_dataset_file_path_or_dir = eval_dataset_file_path_or_dir self._num_runs = num_runs self._agent_name = agent_name @@ -115,6 +119,7 @@ def __init__( async def _run(self) -> None: agent_module = self._agent_module + call_agent = self._call_agent eval_dataset_file_path_or_dir = self._eval_dataset_file_path_or_dir num_runs = self._num_runs agent_name = self._agent_name @@ -163,6 +168,7 @@ async def _run(self) -> None: await AgentEvaluator.evaluate_eval_set( eval_set, agent_module=agent_module, + call_agent=call_agent, eval_config=eval_config, num_runs=num_runs_for_set, agent_name=agent_name, @@ -187,7 +193,8 @@ async def _run(self) -> None: _RESULT_HANDLER.print_evaluation_report( all_details=all_details, all_results=all_results, - display_agent_name=agent_name or agent_module or "trace-only", + display_agent_name=agent_name or agent_module + or ("call-agent" if call_agent is not None else "trace-only"), num_runs=num_runs_for_set, ) self._result = EvaluateResult(results_by_eval_set_id=results_by_eval_set_id) @@ -282,6 +289,7 @@ async def evaluate( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -318,6 +326,7 @@ async def evaluate( executer = AgentEvaluator.get_executer( eval_dataset_file_path_or_dir, agent_module=agent_module, + call_agent=call_agent, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -335,6 +344,7 @@ def get_executer( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -374,6 +384,7 @@ def get_executer( return _EvalExecuter( eval_dataset_file_path_or_dir, agent_module=agent_module, + call_agent=call_agent, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -431,6 +442,7 @@ async def evaluate_eval_set( eval_set: EvalSet, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, @@ -476,15 +488,20 @@ async def evaluate_eval_set( if eval_config is None: raise ValueError("`eval_config` is required.") + if call_agent is not None and agent_module is not None: + raise ValueError("call_agent 
is mutually exclusive with agent_module.") + if call_agent is not None and runner is not None: + raise ValueError("call_agent is mutually exclusive with runner.") + trace_only = AgentEvaluator._is_trace_only(eval_set) - if agent_module is None and not trace_only: + if call_agent is None and agent_module is None and not trace_only: non_trace_ids = [case.eval_id for case in eval_set.eval_cases if case.eval_mode != EvalModeTrace] raise ValueError("`agent_module` is required unless every case in eval_set uses " "eval_mode='trace'. Non-trace case ids: " f"{non_trace_ids}") agent_for_eval: Optional[BaseAgent] = None - if agent_module is not None: + if agent_module is not None and call_agent is None: agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) eval_metrics = eval_config.get_eval_metrics() @@ -502,11 +519,12 @@ async def evaluate_eval_set( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + call_agent=call_agent, ) # Step 2: Post-process the results failures: list[str] = [] - display_agent_name = agent_name or agent_module or "trace-only" + display_agent_name = agent_name or agent_module or ("call-agent" if call_agent is not None else "trace-only") details_lines: list[str] = [] result_lines: list[str] = [] @@ -771,6 +789,7 @@ async def _get_eval_results_by_eval_id( eval_metrics: list, num_runs: int, user_simulator_provider, + call_agent: Optional[CallAgent] = None, eval_set_results_manager: Optional[Any] = None, runner: Optional[Runner] = None, case_parallelism: Optional[int] = None, @@ -811,14 +830,23 @@ async def _get_eval_results_by_eval_id( # app_name: evalset.app_name or configured default (case session_input.app_name overrides per case) request_app_name = eval_set.app_name or DEFAULT_EVAL_APP_NAME - eval_service = LocalEvalService( - root_agent=agent_for_eval, - eval_sets_manager=AgentEvaluator._get_eval_sets_manager(app_name=request_app_name, eval_set=eval_set), - user_simulator_provider=user_simulator_provider, - eval_set_results_manager=eval_set_results_manager, - runner=runner, - callbacks=callbacks, - ) + eval_sets_manager = AgentEvaluator._get_eval_sets_manager(app_name=request_app_name, eval_set=eval_set) + if call_agent is not None: + eval_service = RemoteEvalService( + call_agent=call_agent, + eval_sets_manager=eval_sets_manager, + eval_set_results_manager=eval_set_results_manager, + callbacks=callbacks, + ) + else: + eval_service = LocalEvalService( + root_agent=agent_for_eval, + eval_sets_manager=eval_sets_manager, + user_simulator_provider=user_simulator_provider, + eval_set_results_manager=eval_set_results_manager, + runner=runner, + callbacks=callbacks, + ) inference_config = (InferenceConfig( parallelism=case_parallelism) if case_parallelism is not None else InferenceConfig()) diff --git a/trpc_agent_sdk/evaluation/_remote_eval_service.py b/trpc_agent_sdk/evaluation/_remote_eval_service.py new file mode 100644 index 0000000..25a4319 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_remote_eval_service.py @@ -0,0 +1,466 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Remote (black-box) eval service driven by async call_agent(query)->str.""" + +from __future__ import annotations + +import asyncio +import inspect +import time +import uuid +from typing import Any +from typing import AsyncGenerator +from typing import Awaitable +from typing import Callable +from typing import Optional +from typing_extensions import override + +from trpc_agent_sdk.log import error as log_error +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +from ._eval_callbacks import Callbacks +from ._eval_callbacks import CallbacksRunner +from ._eval_callbacks import EvalSetRunResult +from ._eval_case import EvalCase +from ._eval_case import EvalModeTrace +from ._eval_case import Invocation +from ._eval_metrics import EvalMetric +from ._eval_metrics import EvalStatus +from ._eval_result import EvalCaseResult +from ._eval_result import EvalMetricResult +from ._eval_result import EvalMetricResultDetails +from ._eval_result import EvalMetricResultPerInvocation +from ._eval_result import EvaluationResult +from ._eval_result import PerInvocationResult +from ._eval_service_base import BaseEvalService +from ._eval_service_base import EvaluateConfig +from ._eval_service_base import EvaluateRequest +from ._eval_service_base import InferenceRequest +from ._eval_service_base import InferenceResult +from ._eval_service_base import InferenceStatus +from ._eval_set_results_manager_base import EvalSetResultsManager +from ._eval_sets_manager_base import EvalSetsManager +from ._evaluator_registry import EVALUATOR_REGISTRY +from ._evaluator_registry import EvaluatorRegistry + +CallAgent = Callable[[str], Awaitable[str]] +REMOTE_EVAL_INCOMPATIBLE_METRICS: frozenset[str] = frozenset({ + "tool_trajectory_avg_score", +}) +EVAL_SESSION_ID_PREFIX = "___remote_eval___session___" + + +def _get_session_id() -> str: + return f"{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}" + + +class RemoteEvalService(BaseEvalService): + """Eval service for remote/black-box agents via call_agent.""" + + def __init__( + self, + call_agent: CallAgent, + eval_sets_manager: EvalSetsManager, + evaluator_registry: Optional[EvaluatorRegistry] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, + session_id_supplier: Callable[[], str] = _get_session_id, + callbacks: Optional[Callbacks] = None, + ): + self._validate_call_agent_is_async(call_agent) + self._call_agent = call_agent + self._eval_sets_manager = eval_sets_manager + self._evaluator_registry = evaluator_registry or EVALUATOR_REGISTRY + self._eval_set_results_manager = eval_set_results_manager + self._session_id_supplier = session_id_supplier + self._callbacks_runner = CallbacksRunner(callbacks or Callbacks()) + + @staticmethod + def _validate_call_agent_is_async(call_agent: Any) -> None: + if not callable(call_agent): + raise ValueError("call_agent must be callable.") + if not inspect.iscoroutinefunction(call_agent): + raise ValueError("call_agent must be an async function: async def call_agent(query: str) -> str") + + @staticmethod + def _user_content_to_str(content: Content) -> str: + parts = getattr(content, "parts", []) or [] + chunks: list[str] = [] + for part in parts: + text = getattr(part, "text", None) + if isinstance(text, str): + chunks.append(text) + return "".join(chunks) + + @staticmethod + def _reject_trace_cases(eval_cases: list[EvalCase]) -> None: + trace_ids = [case.eval_id for case in eval_cases if case.eval_mode == EvalModeTrace] + if trace_ids: + raise ValueError(f"call_agent mode is incompatible 
with trace cases: {trace_ids}") + + @override + async def perform_inference( + self, + inference_request: InferenceRequest, + ) -> AsyncGenerator[InferenceResult, None]: + eval_set = self._eval_sets_manager.get_eval_set( + app_name=inference_request.app_name, + eval_set_id=inference_request.eval_set_id, + ) + if not eval_set: + raise ValueError(f"Eval set with id {inference_request.eval_set_id} not found for app " + f"{inference_request.app_name}") + + eval_cases = eval_set.eval_cases + if inference_request.eval_case_ids: + eval_cases = [c for c in eval_cases if c.eval_id in inference_request.eval_case_ids] + self._reject_trace_cases(eval_cases) + + run_ctx: dict[str, Any] = {} + start_time = time.monotonic() + inference_results_list: list[InferenceResult] = [] + set_error: Optional[Exception] = None + await self._callbacks_runner.run_before_inference_set(inference_request, run_ctx) + semaphore = asyncio.Semaphore(value=inference_request.inference_config.parallelism) + + async def run_one(eval_case: EvalCase) -> InferenceResult: + case_ctx = run_ctx.copy() + session_id = self._session_id_supplier() + await self._callbacks_runner.run_before_inference_case( + inference_request, + eval_case.eval_id, + session_id, + case_ctx, + ) + case_start = time.monotonic() + async with semaphore: + result = await self._perform_inference_single_eval_item( + app_name=inference_request.app_name, + eval_set_id=inference_request.eval_set_id, + eval_case=eval_case, + session_id=session_id, + ) + await self._callbacks_runner.run_after_inference_case( + inference_request, + result, + None, + case_start, + case_ctx, + ) + return result + + try: + tasks = [run_one(eval_case) for eval_case in eval_cases] + for coro in asyncio.as_completed(tasks): + inference_result = await coro + inference_results_list.append(inference_result) + yield inference_result + except Exception as e: + set_error = e + raise + finally: + await self._callbacks_runner.run_after_inference_set( + inference_request, + inference_results_list, + set_error, + start_time, + run_ctx, + ) + + async def _perform_inference_single_eval_item( + self, + app_name: str, + eval_set_id: str, + eval_case: EvalCase, + session_id: Optional[str] = None, + ) -> InferenceResult: + if session_id is None: + session_id = self._session_id_supplier() + inference_result = InferenceResult( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_id=eval_case.eval_id, + session_id=session_id, + ) + try: + if not eval_case.conversation: + raise ValueError(f"inference eval case (eval_case_id={eval_case.eval_id}, session_id={session_id}): " + "conversation is required in call_agent mode") + inferences: list[Invocation] = [] + for source_invocation in eval_case.conversation: + query = self._user_content_to_str(source_invocation.user_content) + response_text = await self._call_agent(query) + inferences.append( + Invocation( + invocation_id=source_invocation.invocation_id, + user_content=source_invocation.user_content, + final_response=Content(parts=[Part(text=response_text)]), + intermediate_data=None, + creation_timestamp=time.time(), + )) + inference_result.inferences = inferences + inference_result.status = InferenceStatus.SUCCESS + return inference_result + except Exception as ex: # pylint: disable=broad-except + log_error( + "Inference failed for eval case `%s` with error %s.", + eval_case.eval_id, + ex, + exc_info=True, + ) + inference_result.status = InferenceStatus.FAILURE + inference_result.error_message = str(ex) + return inference_result + + def 
_validate_remote_metric_compat(self, evaluate_config: EvaluateConfig) -> None: + incompatible = sorted({ + metric.metric_name + for metric in evaluate_config.eval_metrics if metric.metric_name in REMOTE_EVAL_INCOMPATIBLE_METRICS + }) + if incompatible: + raise ValueError("call_agent mode does not support metrics: " + f"{incompatible}. Please remove them from EvalConfig.") + + @override + async def evaluate( + self, + evaluate_request: EvaluateRequest, + ) -> AsyncGenerator[EvalCaseResult, None]: + self._validate_remote_metric_compat(evaluate_request.evaluate_config) + run_ctx: dict[str, Any] = {} + start_time = time.monotonic() + eval_case_results_list: list[EvalCaseResult] = [] + set_error: Optional[Exception] = None + ir0 = evaluate_request.inference_results[0] if evaluate_request.inference_results else None + app_name = ir0.app_name if ir0 else "" + eval_set_id = ir0.eval_set_id if ir0 else "" + await self._callbacks_runner.run_before_evaluate_set(evaluate_request, run_ctx) + semaphore = asyncio.Semaphore(value=evaluate_request.evaluate_config.parallelism) + + async def run_one_eval(inference_result: InferenceResult) -> tuple[InferenceResult, EvalCaseResult]: + case_ctx = run_ctx.copy() + await self._callbacks_runner.run_before_evaluate_case( + evaluate_request, + inference_result.eval_case_id, + case_ctx, + ) + case_start = time.monotonic() + async with semaphore: + inference_result, eval_case_result = await self._evaluate_single_inference_result( + inference_result=inference_result, + evaluate_config=evaluate_request.evaluate_config, + ) + await self._callbacks_runner.run_after_evaluate_case( + evaluate_request, + inference_result, + eval_case_result, + None, + case_start, + case_ctx, + ) + return (inference_result, eval_case_result) + + try: + tasks = [run_one_eval(ir) for ir in evaluate_request.inference_results] + for coro in asyncio.as_completed(tasks): + _, eval_case_result = await coro + eval_case_results_list.append(eval_case_result) + yield eval_case_result + if self._eval_set_results_manager and eval_case_results_list and app_name: + sorted_results = sorted(eval_case_results_list, key=lambda r: (r.run_id or 0, r.eval_id)) + self._eval_set_results_manager.save_eval_set_result( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_results=sorted_results, + ) + except Exception as e: + set_error = e + raise + finally: + await self._callbacks_runner.run_after_evaluate_set( + evaluate_request, + EvalSetRunResult( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_results=eval_case_results_list, + ), + set_error, + start_time, + run_ctx, + ) + + async def _evaluate_single_inference_result( + self, + inference_result: InferenceResult, + evaluate_config: EvaluateConfig, + ) -> tuple[InferenceResult, EvalCaseResult]: + eval_case = self._eval_sets_manager.get_eval_case( + app_name=inference_result.app_name, + eval_set_id=inference_result.eval_set_id, + eval_case_id=inference_result.eval_case_id, + ) + if eval_case is None: + raise ValueError(f"Eval case with id {inference_result.eval_case_id} not found for " + f"app {inference_result.app_name} and eval set {inference_result.eval_set_id}.") + + expected_invocations = self._build_expected_invocations_for_eval(eval_case) + eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation] = [] + overall_eval_metric_results: list[EvalMetricResult] = [] + + if inference_result.inferences: + for idx, actual in enumerate(inference_result.inferences): + expected = None + if expected_invocations and idx < 
len(expected_invocations): + expected = expected_invocations[idx] + eval_metric_result_per_invocation.append( + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=expected, + eval_metric_results=[], + )) + + case_error_message: Optional[str] = inference_result.error_message + if inference_result.status == InferenceStatus.FAILURE: + for eval_metric in evaluate_config.eval_metrics: + overall_eval_metric_results.append( + EvalMetricResult( + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + )) + for invocation in eval_metric_result_per_invocation: + invocation.eval_metric_results.append( + EvalMetricResult( + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + )) + return ( + inference_result, + EvalCaseResult( + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + run_id=getattr(inference_result, "run_id", None), + final_eval_status=EvalStatus.NOT_EVALUATED, + error_message=case_error_message, + overall_eval_metric_results=overall_eval_metric_results, + eval_metric_result_per_invocation=eval_metric_result_per_invocation, + session_id=inference_result.session_id or "", + session_details=None, + user_id=None, + ), + ) + + for eval_metric in evaluate_config.eval_metrics: + try: + evaluation_result = await self._evaluate_metric( + eval_metric=eval_metric, + actual_invocations=inference_result.inferences or [], + expected_invocations=expected_invocations, + ) + except Exception as e: # pylint: disable=broad-except + if case_error_message is None: + case_error_message = str(e) + log_error( + "Metric evaluation failed for metric `%s` for eval case id '%s' with following error `%s`", + eval_metric.metric_name, + inference_result.eval_case_id, + e, + exc_info=True, + ) + evaluation_result = EvaluationResult(overall_eval_status=EvalStatus.NOT_EVALUATED) + + reasons = [pr.reason for pr in evaluation_result.per_invocation_results if pr.reason is not None] + rubric_scores: list[Any] = [] + for pr in evaluation_result.per_invocation_results: + if pr.rubric_scores: + rubric_scores.extend(pr.rubric_scores) + overall_reason = ";".join(reasons) if reasons else None + overall_rubric = rubric_scores if rubric_scores else None + overall_eval_metric_results.append( + EvalMetricResult( + score=evaluation_result.overall_score, + eval_status=evaluation_result.overall_eval_status, + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + details=EvalMetricResultDetails( + reason=overall_reason, + score=evaluation_result.overall_score, + rubric_scores=overall_rubric, + ) if (overall_reason is not None or overall_rubric is not None) else None, + )) + + for idx, invocation in enumerate(eval_metric_result_per_invocation): + if idx < len(evaluation_result.per_invocation_results): + invocation_result = evaluation_result.per_invocation_results[idx] + else: + invocation_result = PerInvocationResult(actual_invocation=invocation.actual_invocation) + invocation.eval_metric_results.append( + EvalMetricResult( + score=invocation_result.score, + eval_status=invocation_result.eval_status, + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + details=EvalMetricResultDetails( + reason=invocation_result.reason, + score=invocation_result.score, + 
rubric_scores=invocation_result.rubric_scores, + ) if + (invocation_result.reason is not None or invocation_result.rubric_scores is not None) else None, + )) + + eval_case_result = EvalCaseResult( + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + run_id=getattr(inference_result, "run_id", None), + final_eval_status=self._generate_final_eval_status(overall_eval_metric_results), + error_message=case_error_message, + overall_eval_metric_results=overall_eval_metric_results, + eval_metric_result_per_invocation=eval_metric_result_per_invocation, + session_id=inference_result.session_id or "", + session_details=None, + user_id=None, + ) + return (inference_result, eval_case_result) + + async def _evaluate_metric( + self, + eval_metric: EvalMetric, + actual_invocations: list[Invocation], + expected_invocations: Optional[list[Invocation]], + ) -> EvaluationResult: + evaluator = self._evaluator_registry.get_evaluator(eval_metric) + if inspect.iscoroutinefunction(evaluator.evaluate_invocations): + return await evaluator.evaluate_invocations( + actual_invocations=actual_invocations, + expected_invocations=expected_invocations, + ) + return evaluator.evaluate_invocations( + actual_invocations=actual_invocations, + expected_invocations=expected_invocations, + ) + + @staticmethod + def _build_expected_invocations_for_eval(eval_case: EvalCase) -> Optional[list[Invocation]]: + if eval_case.conversation: + return list(eval_case.conversation) + return None + + @staticmethod + def _generate_final_eval_status(overall_eval_metric_results: list[EvalMetricResult]) -> EvalStatus: + final_eval_status = EvalStatus.NOT_EVALUATED + for result in overall_eval_metric_results: + if result.eval_status == EvalStatus.PASSED: + final_eval_status = EvalStatus.PASSED + elif result.eval_status == EvalStatus.FAILED: + return EvalStatus.FAILED + return final_eval_status
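Usage note for reviewers (not part of the diff above): a minimal pytest-style sketch of reaching this service through the documented call_agent parameter. The endpoint URL, response shape, dataset path, and the use of httpx and pytest-asyncio below are illustrative assumptions; only AgentEvaluator.evaluate, eval_dataset_file_path_or_dir, and call_agent come from this patch.

    import httpx
    import pytest

    from trpc_agent_sdk.evaluation import AgentEvaluator


    async def call_agent(query: str) -> str:
        # Bridge to any black-box agent: here, a hypothetical HTTP chat endpoint.
        # Any async str -> str adapter works (CLI subprocess, another framework, ...).
        async with httpx.AsyncClient() as client:
            resp = await client.post("http://localhost:8080/chat", json={"query": query})
            resp.raise_for_status()
            return resp.json()["answer"]


    @pytest.mark.asyncio
    async def test_remote_agent_eval():
        # call_agent is mutually exclusive with agent_module; RemoteEvalService
        # rejects trace-mode cases and the tool_trajectory_avg_score metric.
        await AgentEvaluator.evaluate(
            eval_dataset_file_path_or_dir="tests/fixtures/sample.evalset.json",
            call_agent=call_agent,
        )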