From b6b682b3f257216d3c65021ba61dca145c3417b4 Mon Sep 17 00:00:00 2001 From: ricknie Date: Wed, 13 May 2026 19:40:32 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20eval=E6=A8=A1=E5=9D=97=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E6=8E=A5=E5=85=A5=E5=A4=96=E9=83=A8agent=E8=AF=84?= =?UTF-8?q?=E4=BC=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TAPD: --story=134277610 --- docs/mkdocs/en/evaluation.md | 240 ++++++--- docs/mkdocs/zh/evaluation.md | 241 ++++++--- .../test_agent_evaluator_call_agent.py | 170 +++++++ tests/evaluation/test_remote_eval_service.py | 231 +++++++++ trpc_agent_sdk/evaluation/__init__.py | 4 + trpc_agent_sdk/evaluation/_agent_evaluator.py | 52 +- .../evaluation/_remote_eval_service.py | 466 ++++++++++++++++++ 7 files changed, 1256 insertions(+), 148 deletions(-) create mode 100644 tests/evaluation/test_agent_evaluator_call_agent.py create mode 100644 tests/evaluation/test_remote_eval_service.py create mode 100644 trpc_agent_sdk/evaluation/_remote_eval_service.py diff --git a/docs/mkdocs/en/evaluation.md b/docs/mkdocs/en/evaluation.md index ca51a85..9028b06 100644 --- a/docs/mkdocs/en/evaluation.md +++ b/docs/mkdocs/en/evaluation.md @@ -65,6 +65,7 @@ Triggering evaluation through pytest allows eval test cases to be integrated int | Knowledge Recall Evaluation | Evaluates whether retrieved knowledge in RAG scenarios is sufficient to support the answer | Verify that knowledge base retrieval results cover the key facts in the question | | Multiple Runs and Statistics | Runs the same test case multiple times, computing stability metrics such as pass@k | Evaluate the Agent's pass rate across multiple attempts | | Trace Replay | Skips inference, directly scores using pre-recorded conversation traces | Perform offline evaluation using production logs without consuming inference resources | +| External Agent Evaluation | Evaluate Agents not created by this framework via `call_agent` (HTTP services, CLI, other frameworks) | Run regression tests against an existing Claude Code CLI or remote API | | Callback Hooks | Attach custom logic at 8 lifecycle points during inference/scoring | Instrumentation, logging, sampling, reporting | #### Overall Evaluation Flow @@ -299,7 +300,7 @@ This section explains the components of the evaluation module and their relation | **AgentEvaluator** | The entry point exposed to users, providing `evaluate()` and `get_executer()` | Call it in pytest tests | | **Eval Set (EvalSet)** | Describes "what to test"—scenarios, user inputs, expected outputs | Write `.evalset.json` files | | **Eval Config (EvalConfig)** | Describes "how to judge"—which metrics, thresholds, matching rules | Write `test_config.json` files | -| **Eval Service (LocalEvalService)** | The engine that executes inference and scoring | Automatically created by the framework; usually no action needed | +| **Eval Service (LocalEvalService / RemoteEvalService)** | The engine that executes inference and scoring (local Agent or `call_agent`) | Automatically created by the framework; usually no action needed | | **Evaluator** | The concrete implementation that computes scores per metric | Choose built-in evaluators, or register custom ones | | **Evaluator Registry (EvaluatorRegistry)** | Maintains the mapping from `metric_name` to evaluator type | Register when custom evaluators are needed | | **Evaluation Result (EvaluateResult)** | Holds the structured evaluation results | Obtain and analyze via `get_result()` | @@ -308,12 +309,31 @@ This section explains the 
components of the evaluation module and their relation AgentEvaluator is the entry point and orchestrator of the entire evaluation flow: -1. **Loading Phase**: AgentEvaluator loads the EvalSet from eval set files (`.evalset.json` / `.test.json`), loads the EvalConfig from `test_config.json` in the same directory, and loads the Agent by `agent_module`. -2. **Building the Eval Service**: AgentEvaluator writes the EvalSet into InMemoryEvalSetsManager and creates LocalEvalService (depending on the Manager, UserSimulatorProvider, optional EvalSetResultsManager, Runner, and Callbacks). By default, it uses StaticUserSimulator, which drives inference using user_content from the conversation. Optionally, LocalEvalSetResultsManager can be injected to persist run results to a directory. -3. **Inference Phase**: The eval service drives the Runner for inference based on test cases and conversations in the EvalSet, producing actual Invocation lists (actual tool calls, actual responses). +1. **Loading Phase**: AgentEvaluator loads the EvalSet from eval set files (`.evalset.json` / `.test.json`), loads the EvalConfig from `test_config.json` in the same directory; for local Agent paths, loads the Agent by `agent_module` (can be omitted when using `call_agent` or when all cases use [Trace Mode](#trace-mode)). +2. **Building the Eval Service**: AgentEvaluator writes the EvalSet into InMemoryEvalSetsManager; when `call_agent` is provided, creates RemoteEvalService; otherwise creates LocalEvalService (depending on the Manager, UserSimulatorProvider, optional EvalSetResultsManager, Runner, and Callbacks). +3. **Inference Phase**: The eval service performs turn-by-turn inference based on test cases and conversations in the EvalSet: LocalEvalService drives the Runner to call the Agent; RemoteEvalService calls `call_agent(query)` to obtain each turn's actual response, producing actual Invocation lists. 4. **Scoring Phase**: The eval service obtains evaluators from the EvaluatorRegistry based on the EvalMetric list in the EvalConfig, scores actual vs. expected item by item, and aggregates into EvalCaseResult. 5. **Result Aggregation**: AgentEvaluator determines pass/fail based on results, raises `AssertionError` when any test case falls below the threshold, and optionally persists results as `.evalset_result.json`. +#### AgentEvaluator Parameter List + +`evaluate()` and `get_executer()` accept the same parameters (`evaluate()` internally calls `get_executer()`): + +| Parameter | Type | Description | +| --- | --- | --- | +| eval_dataset_file_path_or_dir | str | Path to eval set file or directory (recursively scans `.evalset.json` / `.test.json`) | +| agent_module | str \| None | Python module path of the Agent created by this framework; mutually exclusive with `call_agent`. 
Not needed when using `call_agent` or when all cases are Trace mode | +| call_agent | CallAgent \| None | Async callable for Agents not created by this framework (`async def(str)->str`); mutually exclusive with `agent_module` / `runner` | +| num_runs | int | Number of runs per eval set, default 1 | +| agent_name | str \| None | Display name of the Agent | +| print_detailed_results | bool | Whether to print per-case detail comparisons, default True | +| eval_result_output_dir | str \| None | Directory for result persistence; omit for in-memory aggregation only | +| runner | Runner \| None | Custom Runner instance; mutually exclusive with `call_agent` | +| case_parallelism | int \| None | Max concurrent cases during inference | +| case_eval_parallelism | int \| None | Max concurrent cases during scoring | +| callbacks | Callbacks \| None | Lifecycle callbacks | +| eval_metrics_file_path_or_dir | str \| None | Shared eval config file path (overrides same-directory `test_config.json`) | + --- ### Eval Set (EvalSet) Writing Guide @@ -458,6 +478,8 @@ Configuration keys support both snake_case (e.g., `metric_name`) and camelCase ( | `llm_rubric_response` | LLMRubricResponseEvaluator | LLM judge scores item by item against rubrics | Need to evaluate response quality across multiple dimensions (correctness, relevance, compliance, etc.) | | `llm_rubric_knowledge_recall` | LLMRubricKnowledgeRecallEvaluator | LLM judge evaluates whether retrieved knowledge is sufficient to support the answer | RAG scenarios; need to verify that retrieved knowledge covers key facts | +> Note: `call_agent` mode does not support `tool_trajectory_avg_score`. When evaluating external/black-box Agents, prefer `final_response_avg_score` or LLM Judge metrics. + **Rubric** refers to evaluation rubrics: in the configuration, `rubrics` is an array listing multiple independently assessable clauses (e.g., "the answer must contain a conclusion," "must be relevant to the question"). The LLM judge gives a pass/fail for each rubric, then aggregates them into the metric's score. #### How to Choose Metrics @@ -819,70 +841,7 @@ LLM response quality with rubrics (llm_rubric_response or llm_rubric_knowledge_r It is recommended to use environment variable placeholders for `api_key` and `base_url` (e.g., `${TRPC_AGENT_API_KEY}`), which are replaced by the execution environment, to avoid writing plaintext in configuration files. -**Multi-model judge (cross-model aggregation)** - -A single LLM-judge metric may use multiple judge models simultaneously and combine their verdicts via `models_aggregator`. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). - -Built-in aggregators: - -| Name | Pass rule | Overall score | -| --- | --- | --- | -| `all_pass` (default) | all models pass | min of per-model scores | -| `any_pass` | any model passes | max of per-model scores | -| `majority_pass` | strict majority passes (`passed*2 > total`) | `passed_count / total` | -| `avg` | mean ≥ threshold | mean of per-model scores | -| `weighted_avg` | weighted mean ≥ threshold | `sum(w*s) / sum(w)` | -| `weighted_majority` | weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | - -If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. 
- -```json -{ - "metrics": [ - { - "metric_name": "llm_final_response", - "threshold": 1, - "criterion": { - "llm_judge": { - "judge_models": [ - { - "model_name": "glm-4.7", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 2.0 - }, - { - "model_name": "gpt-4o", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 1.0 - } - ], - "models_aggregator": "weighted_avg", - "parallel": true - } - } - } - ] -} -``` - -`parallel` controls how multiple judge models are executed: `true` (default) calls all models concurrently, with latency bounded by the slowest model; `false` calls them sequentially in the declared order. Only takes effect when `judge_models` contains more than one model. - -If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is a structured JSON, thinking traces add no value to the final verdict, and disabling thinking significantly reduces token cost and latency. Each judge model has its own independent `think` flag. - -Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name written in the criterion: - -```python -from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult - -def my_aggregator(per_model, threshold, weights): - # per_model: list[ScoreResult]; weights: list[float] - score = sum(s.score or 0.0 for s in per_model) / len(per_model) - return ScoreResult(score=score, reason="custom aggregation") - -LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) -``` +> A single LLM judge metric can also use multiple judge models with aggregated results. See [Advanced Features - Multi-Model Judge (Cross-Model Aggregation)](#multi-model-judge-cross-model-aggregation). #### Custom Criteria @@ -1822,10 +1781,155 @@ async def test_pass_at_k(): For the complete example, see [examples/evaluation/pass_at_k/](../../../examples/evaluation/pass_at_k/). +#### Evaluating Agents Not Created by This Framework (call_agent) + +If the Agent under test is not created or managed by this framework (e.g., deployed behind an HTTP/RPC service, invoked via CLI, or wrapped by another framework), and you cannot provide `agent_module` or `runner`, use the **`call_agent`** parameter instead: pass an async function, and the evaluator will call it each turn to obtain the actual response. The rest of the scoring flow remains unchanged. + +**Configuration** + +Pass `call_agent=your_async_fn` in **AgentEvaluator.evaluate()** or **get_executer()**, without passing `agent_module` or `runner`. The signature must be `async def call_agent(query: str) -> str`. + +**Applicable Scenarios** + +Evaluating any callable that cannot be instantiated as this framework's `BaseAgent`: HTTP/RPC remote services, CLI Agents, other frameworks (LangChain / AutoGen / custom), etc. 
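+
+For example, if the Agent sits behind an HTTP service, the adapter is just a thin async client. A minimal sketch using `httpx` (the endpoint URL and JSON schema are illustrative assumptions; adjust them to your service's actual API):
+
+```python
+import httpx
+
+
+async def call_agent(query: str) -> str:
+    """Hypothetical HTTP adapter: POST the query, return the text reply."""
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        resp = await client.post(
+            "http://localhost:8080/chat",  # assumed endpoint
+            json={"query": query},  # assumed request schema
+        )
+        resp.raise_for_status()
+        return resp.json()["answer"]  # assumed response field
+```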
+ +**Constraints** + +- `call_agent` must be async (passing a sync function raises `ValueError`) +- `call_agent` is mutually exclusive with `agent_module` / `runner` (passing both raises `ValueError`) +- `call_agent` mode is mutually exclusive with Trace mode (eval set containing trace cases raises `ValueError`) +- `call_agent` mode does not support `tool_trajectory_avg_score` (raises `ValueError`); use `final_response_avg_score`, `llm_final_response`, or `llm_rubric_response` instead +- Multi-turn cases call `call_agent` sequentially per turn; each call corresponds to one `Invocation` + +**Example**: Using Claude Code CLI as an external Agent + +```python +import asyncio +import os +from asyncio.subprocess import PIPE + +from trpc_agent_sdk.evaluation import AgentEvaluator + + +async def call_agent(query: str) -> str: + """Call Claude Code CLI and return its text output.""" + cli_bin = os.getenv("CLAUDE_CODE_BIN", "claude") + cli_args = [cli_bin, "-p", query] + + model_name = os.getenv("CLAUDE_CODE_MODEL") + if model_name: + cli_args.extend(["--model", model_name]) + + proc = await asyncio.create_subprocess_exec(*cli_args, stdout=PIPE, stderr=PIPE) + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise RuntimeError(stderr.decode("utf-8", errors="ignore").strip()) + + output_text = stdout.decode("utf-8", errors="ignore").strip() + for line in output_text.splitlines(): + if line.strip(): + return line.strip() + return "" + + +# Option A: pass/fail only +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) + +# Option B: structured results +executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) +await executer.evaluate() +result = executer.get_result() # EvaluateResult +``` + +> The example uses `claude` as the default command. If your executable name differs (e.g., `trpc-claudecode` or a custom wrapper), set the `CLAUDE_CODE_BIN` environment variable accordingly. For HTTP service scenarios, simply replace the `call_agent` function body with `aiohttp` / `httpx` calls while keeping the signature `async def call_agent(query: str) -> str`. + +#### Multi-Model Judge (Cross-Model Aggregation) + +A single LLM judge metric can use multiple judge models simultaneously, aggregating their verdicts via `models_aggregator` to reduce single-model variance. Use `judge_models` instead of `judge_model`; the two fields are mutually exclusive. Per-model details are available on `PerInvocationResult.per_model_scores` (a list of `NamedScoreResult`). + +**Configuration** + +In `test_config.json`, for any LLM judge metric's `criterion.llm_judge`, replace `judge_model` with `judge_models` (array), and set `models_aggregator` to choose the aggregation strategy. `parallel` controls execution: `true` (default) calls all models concurrently; `false` calls them sequentially. + +**Applicable Scenarios** + +When higher confidence in judge verdicts is needed (e.g., safety compliance, medical scenarios), or when comparing judgments across different models. 
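+
+As a quick sanity check against the aggregator table below: with two judge models weighted `w = [2.0, 1.0]` and per-model scores `s = [1.0, 0.5]`, `weighted_avg` yields `(2.0 * 1.0 + 1.0 * 0.5) / 3.0 ≈ 0.83`, whereas `all_pass` reports `min(1.0, 0.5) = 0.5` and fails a threshold of 1.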
+ +**Built-in Aggregators** + +| Name | Pass Rule | Overall Score | +| --- | --- | --- | +| `all_pass` (default) | All models pass | min of per-model scores | +| `any_pass` | Any model passes | max of per-model scores | +| `majority_pass` | Strict majority passes (`passed*2 > total`) | `passed_count / total` | +| `avg` | Mean ≥ threshold | mean of per-model scores | +| `weighted_avg` | Weighted mean ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | Weighted-passed share ≥ 0.5 | `sum(w where passed) / sum(w)` | + +If a single judge model raises during execution, that model is counted as a non-passing vote; if every model raises, the invocation is reported as `NOT_EVALUATED`. + +**Example**: Two judge models with weighted average aggregation + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +If a judge model has thinking enabled by default, consider setting `"think": false` on its `JudgeModelOptions`: the judge output is structured JSON, and thinking traces add no value to the final verdict. Disabling thinking significantly reduces token cost and latency. + +**Custom Aggregators** + +Custom aggregators can be registered at runtime and take precedence over the `models_aggregator` name in the criterion: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Trace Mode In default mode, the eval service actually calls the Agent for inference. If you already have pre-recorded conversation traces (e.g., production logs, historical sessions) and want to only "score" without repeating inference, you can use **Trace mode**: set **eval_mode: "trace"** on the case and provide **actual_conversation**; the eval service will skip inference and directly use that trace for scoring. +> Note: Trace mode and `call_agent` mode are mutually exclusive; when `call_agent` is provided and the eval set contains trace cases, the framework raises `ValueError` at startup. + **Configuration Methods** - Set **eval_mode**: `"trace"` on the **EvalCase**. 
diff --git a/docs/mkdocs/zh/evaluation.md b/docs/mkdocs/zh/evaluation.md index 564e0e0..68a9eed 100644 --- a/docs/mkdocs/zh/evaluation.md +++ b/docs/mkdocs/zh/evaluation.md @@ -65,6 +65,7 @@ tRPC-Agent 评测模块是一套**自动化 Agent 质量检验工具**。它让 | 知识召回评估 | 评估 RAG 场景下检索到的知识是否足以支撑回答 | 验证知识库检索结果覆盖了问题中的关键事实 | | 多轮运行与统计 | 同一用例跑多次,计算 pass@k 等稳定性指标 | 评估 Agent 在多次尝试中的通过率 | | Trace 回放 | 跳过推理,直接用录制好的对话轨迹打分 | 用线上日志做离线评估,不消耗推理资源 | +| 外部 Agent 评测 | 通过 `call_agent` 评测非本框架创建的 Agent(HTTP 服务、CLI、其他框架) | 对已有 Claude Code CLI 或远程 API 做回归评测 | | 回调钩子 | 在推理/打分的 8 个生命周期节点挂载自定义逻辑 | 打点、日志、采样、上报 | #### 评测整体流程 @@ -283,6 +284,7 @@ pytest test_quickstart.py -v --tb=short -s - **有用例未达阈值**:框架会抛出 `AssertionError`,失败摘要以 JSON 形式包含在错误信息中。 - **结果落盘**:若调用时传入 `eval_result_output_dir`,当次评测结果会写入该目录下的 `.evalset_result.json` 文件(详见[评测结果](#评测结果)一节)。 + --- ### 核心概念 @@ -296,7 +298,7 @@ pytest test_quickstart.py -v --tb=short -s | **AgentEvaluator** | 对用户暴露的入口,提供 `evaluate()` 与 `get_executer()` | 在 pytest 测试中调用它 | | **评测集(EvalSet)** | 描述"测什么"——场景、用户输入、预期输出 | 编写 `.evalset.json` 文件 | | **评测配置(EvalConfig)** | 描述"怎么判"——用哪些指标、阈值、匹配规则 | 编写 `test_config.json` 文件 | -| **评估服务(LocalEvalService)** | 执行推理与打分的引擎 | 框架自动创建,通常无需关心 | +| **评估服务(LocalEvalService / RemoteEvalService)** | 执行推理与打分的引擎(本地 Agent 或 `call_agent`) | 框架自动创建,通常无需关心 | | **评估器(Evaluator)** | 按指标计算分数的具体实现 | 选择内置评估器,或注册自定义 | | **评估器注册表(EvaluatorRegistry)** | 维护 `metric_name` → 评估器类型的映射 | 需要自定义评估器时注册 | | **评测结果(EvaluateResult)** | 承载评测的结构化结果 | 通过 `get_result()` 获取并分析 | @@ -305,12 +307,31 @@ pytest test_quickstart.py -v --tb=short -s AgentEvaluator 是整个评测流程的入口和编排者: -1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig,按 `agent_module` 加载 Agent(若整集为 [Trace 模式](#trace-模式),此步可省略)。 -2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager,创建 LocalEvalService(依赖该 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。默认使用 StaticUserSimulator,按 conversation 的 user_content 驱动推理。可选注入 LocalEvalSetResultsManager 将运行结果写入目录。 -3. **推理阶段**:评估服务按 EvalSet 中的用例与 conversation 驱动 Runner 推理,得到实际 Invocation 列表(实际工具调用、实际回复)。 +1. **加载阶段**:AgentEvaluator 从评测集文件(`.evalset.json` / `.test.json`)加载 EvalSet,从同目录的 `test_config.json` 加载 EvalConfig;若走本地 Agent 路径,按 `agent_module` 加载 Agent(使用 `call_agent` 或整集为 [Trace 模式](#trace-模式) 时,此步可省略)。 +2. **构建评估服务**:AgentEvaluator 将 EvalSet 写入 InMemoryEvalSetsManager;传 `call_agent` 时创建 RemoteEvalService,否则创建 LocalEvalService(依赖 Manager、UserSimulatorProvider、可选 EvalSetResultsManager、Runner、Callbacks)。 +3. **推理阶段**:评估服务按 EvalSet 的用例与 conversation 逐轮推理:LocalEvalService 通过 Runner 调 Agent;RemoteEvalService 通过 `call_agent(query)` 获取每轮实际回复,得到实际 Invocation 列表。 4. **打分阶段**:评估服务根据 EvalConfig 中的 EvalMetric 列表,从 EvaluatorRegistry 获取各评估器,对实际与预期逐项打分并汇总为 EvalCaseResult。 5. 
**结果汇总**:AgentEvaluator 根据结果判定通过/失败,有用例未达阈值时抛出 `AssertionError`,可选将结果落盘为 `.evalset_result.json`。 +#### AgentEvaluator 参数列表 + +`evaluate()` 与 `get_executer()` 接受相同的参数(`evaluate()` 内部调用 `get_executer()`): + +| 参数 | 类型 | 说明 | +| --- | --- | --- | +| eval_dataset_file_path_or_dir | str | 评测集文件或目录路径(递归扫描 `.evalset.json` / `.test.json`) | +| agent_module | str \| None | 本框架 Agent 所在 Python 模块路径;与 `call_agent` 互斥。传 `call_agent` 时不需要;全部 case 为 Trace 模式时也不需要 | +| call_agent | CallAgent \| None | 非本框架 Agent 的异步可调用对象(`async def(str)->str`);与 `agent_module` / `runner` 互斥 | +| num_runs | int | 每个评测集运行次数,默认 1 | +| agent_name | str \| None | Agent 显示名称 | +| print_detailed_results | bool | 是否打印每个用例的详细对比信息,默认 True | +| eval_result_output_dir | str \| None | 结果落盘目录;不传则仅内存聚合 | +| runner | Runner \| None | 自定义 Runner 实例;与 `call_agent` 互斥 | +| case_parallelism | int \| None | 推理阶段最大并发用例数 | +| case_eval_parallelism | int \| None | 打分阶段最大并发用例数 | +| callbacks | Callbacks \| None | 生命周期回调 | +| eval_metrics_file_path_or_dir | str \| None | 共享评测配置文件路径(覆盖同目录 `test_config.json`) | + --- ### 评测集(EvalSet)编写指南 @@ -455,6 +476,8 @@ Trace 模式的配置详见[高级功能 - Trace 模式](#trace-模式)。 | `llm_rubric_response` | LLMRubricResponseEvaluator | LLM 裁判按评估细则逐项打分 | 需要从多个维度(正确性、相关性、合规性等)评估回复质量 | | `llm_rubric_knowledge_recall` | LLMRubricKnowledgeRecallEvaluator | LLM 裁判评估知识检索结果是否足以支撑回答 | RAG 场景,需验证检索到的知识覆盖了关键事实 | +> 注意:`call_agent` 模式不支持 `tool_trajectory_avg_score`。评测外部黑盒 Agent 时,建议优先使用 `final_response_avg_score` 或 LLM Judge 类指标。 + **Rubric** 指评估细则:在配置中以 `rubrics` 数组列出多条可独立判定的条款(如「回答须包含结论」「须与问题相关」),LLM 裁判对每条细则给出通过与否,再汇总为该项指标的得分。 #### 如何选择指标 @@ -816,70 +839,7 @@ LLM 最终响应评判(仅需 judge_model): 建议 `api_key`、`base_url` 用环境变量占位(如 `${TRPC_AGENT_API_KEY}`),由执行环境替换,避免明文写入配置文件。 -**多裁判模型(跨模型聚合)** - -同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 - -内置聚合器: - -| 名称 | 通过规则 | 总分 | -| --- | --- | --- | -| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | -| `any_pass` | 任一模型通过 | 各模型得分的最大值 | -| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | -| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | -| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | -| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | - -若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 - -```json -{ - "metrics": [ - { - "metric_name": "llm_final_response", - "threshold": 1, - "criterion": { - "llm_judge": { - "judge_models": [ - { - "model_name": "glm-4.7", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 2.0 - }, - { - "model_name": "gpt-4o", - "api_key": "${TRPC_AGENT_API_KEY}", - "base_url": "${TRPC_AGENT_BASE_URL}", - "weight": 1.0 - } - ], - "models_aggregator": "weighted_avg", - "parallel": true - } - } - } - ] -} -``` - -`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,耗时取决于最慢的模型;`false` 按声明顺序串行调用。仅在 `judge_models` 有多个模型时生效。 - -若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。每个裁判模型的 `think` 独立设置。 - -也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: - -```python -from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult - -def my_aggregator(per_model, threshold, weights): - # per_model: list[ScoreResult];weights: list[float] - score = sum(s.score or 0.0 for s in per_model) / len(per_model) - return ScoreResult(score=score, reason="custom aggregation") - 
-LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) -``` +> 同一个 LLM 裁判指标还可以同时使用多个裁判模型并聚合结果,详见[高级功能 - 多裁判模型(跨模型聚合)](#多裁判模型跨模型聚合)。 #### 自定义准则 @@ -1819,10 +1779,155 @@ async def test_pass_at_k(): 完整示例见 [examples/evaluation/pass_at_k/](../../../examples/evaluation/pass_at_k/)。 +#### 评测非本框架创建的 Agent(call_agent) + +若被测 Agent 不是通过本框架创建和管理的(例如部署在 HTTP/RPC 服务后面、通过 CLI 调用、或使用其他框架封装),无法提供 `agent_module` 或 `runner`,可改用 **`call_agent`** 参数:传入一个异步函数,evaluator 会在每轮对话中调用它获取实际回复,其余打分流程不变。 + +**配置方式** + +在 **AgentEvaluator.evaluate()** 或 **get_executer()** 中传入 `call_agent=your_async_fn`,不传 `agent_module` 和 `runner`。`call_agent` 的签名必须是 `async def call_agent(query: str) -> str`。 + +**适用场景** + +评测任何无法实例化为本框架 `BaseAgent` 的可调用对象:HTTP/RPC 远程服务、CLI Agent、其他框架(LangChain / AutoGen / 自研)封装的黑盒 Agent 等。 + +**约束** + +- `call_agent` 必须是异步函数(传入同步函数会报 `ValueError`) +- `call_agent` 与 `agent_module` / `runner` 互斥(同时传入会报 `ValueError`) +- `call_agent` 模式与 Trace 模式互斥(evalset 含 trace case 会报 `ValueError`) +- `call_agent` 模式不支持 `tool_trajectory_avg_score`(会报 `ValueError`);建议使用 `final_response_avg_score`、`llm_final_response` 或 `llm_rubric_response` +- 多轮 case 会按轮次依次调用 `call_agent`;每次调用对应一个 `Invocation` + +**示例**:以 Claude Code CLI 为例,将其封装为 `call_agent` 并接入评测 + +```python +import asyncio +import os +from asyncio.subprocess import PIPE + +from trpc_agent_sdk.evaluation import AgentEvaluator + + +async def call_agent(query: str) -> str: + """调用 Claude Code CLI,返回其文本输出。""" + cli_bin = os.getenv("CLAUDE_CODE_BIN", "claude") + cli_args = [cli_bin, "-p", query] + + model_name = os.getenv("CLAUDE_CODE_MODEL") + if model_name: + cli_args.extend(["--model", model_name]) + + proc = await asyncio.create_subprocess_exec(*cli_args, stdout=PIPE, stderr=PIPE) + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise RuntimeError(stderr.decode("utf-8", errors="ignore").strip()) + + output_text = stdout.decode("utf-8", errors="ignore").strip() + for line in output_text.splitlines(): + if line.strip(): + return line.strip() + return "" + + +# 方式 A:只关心 pass/fail +await AgentEvaluator.evaluate( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) + +# 方式 B:需要结构化结果 +executer = AgentEvaluator.get_executer( + eval_dataset_file_path_or_dir="agent/my_evalset.evalset.json", + call_agent=call_agent, +) +await executer.evaluate() +result = executer.get_result() # EvaluateResult +``` + +> 示例中默认命令是 `claude`。如果你环境里的可执行文件名不同(例如 `trpc-claudecode` 或自定义命令),将 `CLAUDE_CODE_BIN` 环境变量改为对应命令即可。对于 HTTP 服务场景,只需把 `call_agent` 函数体改为 `aiohttp` / `httpx` 调用,签名保持 `async def call_agent(query: str) -> str` 不变。 + +#### 多裁判模型(跨模型聚合) + +同一个 LLM 裁判指标可以同时使用多个裁判模型,并通过 `models_aggregator` 聚合各模型的判定结果,降低单模型裁判的波动。此时改用 `judge_models` 而非 `judge_model`,两字段互斥。每个裁判模型的明细会输出到 `PerInvocationResult.per_model_scores`(`NamedScoreResult` 列表)。 + +**配置方式** + +在 `test_config.json` 的 LLM 裁判类指标 `criterion.llm_judge` 中,将 `judge_model` 替换为 `judge_models`(数组),并设置 `models_aggregator` 选择聚合策略。`parallel` 控制多个裁判模型之间的执行方式:`true`(默认)并发调用,`false` 串行调用。 + +**适用场景** + +对评判结果要求更高置信度(如安全合规、医疗场景),或希望对比不同裁判模型的判定差异。 + +**内置聚合器** + +| 名称 | 通过规则 | 总分 | +| --- | --- | --- | +| `all_pass`(默认) | 所有模型都通过 | 各模型得分的最小值 | +| `any_pass` | 任一模型通过 | 各模型得分的最大值 | +| `majority_pass` | 严格多数通过(`passed*2 > total`) | `passed_count / total` | +| `avg` | 平均分 ≥ threshold | 各模型得分的平均值 | +| `weighted_avg` | 加权平均 ≥ threshold | `sum(w*s) / sum(w)` | +| `weighted_majority` | 通过模型的权重占比 ≥ 0.5 | `sum(w where passed) / sum(w)` | + 
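+举例：两个裁判模型权重 `w = [2.0, 1.0]`、得分 `s = [1.0, 0.5]` 时，`weighted_avg` 的总分为 `(2.0 * 1.0 + 1.0 * 0.5) / 3.0 ≈ 0.83`；而 `all_pass` 取最小值 `0.5`，在 threshold 为 1 时判为不通过。
+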
+若某个裁判模型执行抛异常,则该模型视为一张反对票;若所有模型都抛异常,该轮结果记为 `NOT_EVALUATED`。 + +**示例**:两个裁判模型按加权平均聚合 + +```json +{ + "metrics": [ + { + "metric_name": "llm_final_response", + "threshold": 1, + "criterion": { + "llm_judge": { + "judge_models": [ + { + "model_name": "glm-4.7", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 2.0 + }, + { + "model_name": "gpt-4o", + "api_key": "${TRPC_AGENT_API_KEY}", + "base_url": "${TRPC_AGENT_BASE_URL}", + "weight": 1.0 + } + ], + "models_aggregator": "weighted_avg", + "parallel": true + } + } + } + ] +} +``` + +若裁判模型默认开启思考链,建议在对应 `JudgeModelOptions` 上显式设 `"think": false`:judge 输出本身是结构化 JSON,思考链对最终判分无价值,关闭可显著降低 token 消耗与延时。 + +**自定义聚合器** + +也可以在运行时注册自定义聚合器,其优先级高于 criterion 中写的 `models_aggregator` 名: + +```python +from trpc_agent_sdk.evaluation import LLM_EVALUATOR_REGISTRY, ScoreResult + +def my_aggregator(per_model, threshold, weights): + score = sum(s.score or 0.0 for s in per_model) / len(per_model) + return ScoreResult(score=score, reason="custom aggregation") + +LLM_EVALUATOR_REGISTRY.register_models_aggregator("llm_final_response", my_aggregator) +``` + #### Trace 模式 默认模式下,评估服务会真实调用 Agent 做推理。若你已有录制好的对话轨迹(如线上日志、历史会话),希望只做「打分」、不重复推理,可使用 **Trace 模式**:在用例上设置 **eval_mode: "trace"** 并提供 **actual_conversation**,评估服务会跳过推理,直接使用该轨迹参与打分。 +> 注意:Trace 模式与 `call_agent` 模式互斥;传入 `call_agent` 且评测集中包含 trace case 时,框架会在启动期抛出 `ValueError`。 + **配置方式** - 在 **EvalCase** 上设置 **eval_mode**: `"trace"`。 diff --git a/tests/evaluation/test_agent_evaluator_call_agent.py b/tests/evaluation/test_agent_evaluator_call_agent.py new file mode 100644 index 0000000..ca7e6a6 --- /dev/null +++ b/tests/evaluation/test_agent_evaluator_call_agent.py @@ -0,0 +1,170 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""TDD tests for AgentEvaluator call_agent routing.""" + +from __future__ import annotations + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import AgentEvaluator +from trpc_agent_sdk.evaluation import CallbackResult +from trpc_agent_sdk.evaluation import Callbacks +from trpc_agent_sdk.evaluation import EvalCase +from trpc_agent_sdk.evaluation import EvalConfig +from trpc_agent_sdk.evaluation import EvalSet +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _content(text: str) -> Content: + return Content(parts=[Part(text=text)]) + + +def _invocation(user: str, expected: str | None = None) -> Invocation: + return Invocation( + invocation_id="i", + user_content=_content(user), + final_response=_content(expected) if expected is not None else None, + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_with_call_agent_minimal(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + failed_summary, details, result_lines, eval_results = await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + print_detailed_results=False, + ) + + assert failed_summary is None + assert details == [] + assert result_lines + assert "c1" in eval_results + + +@pytest.mark.asyncio +async def test_call_agent_with_agent_module_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="mutually exclusive"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + agent_module="fake.module", + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_with_runner_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="mutually exclusive"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + runner=object(), # type: ignore[arg-type] + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_with_trace_case_raises(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[ + EvalCase( + eval_id="trace_case", + eval_mode="trace", + actual_conversation=[_invocation("hello", "world")], + ) + ], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + + async def call_agent(query: str) -> str: + return "world" + + with pytest.raises(ValueError, match="trace_case"): + await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + ) + + +@pytest.mark.asyncio +async def test_call_agent_callbacks_e2e(): + eval_set = EvalSet( + eval_set_id="s1", + eval_cases=[EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")])], + ) + eval_config = EvalConfig(criteria={"final_response_avg_score": 1.0}) + points: list[str] = [] + + def _cb(name: str): + async 
def _fn(_ctx: dict, _args: object): + points.append(name) + return CallbackResult(context={"point": name}) + return _fn + + callbacks = Callbacks() + callbacks.register_before_inference_set("t", _cb("before_inference_set")) + callbacks.register_after_inference_set("t", _cb("after_inference_set")) + callbacks.register_before_inference_case("t", _cb("before_inference_case")) + callbacks.register_after_inference_case("t", _cb("after_inference_case")) + callbacks.register_before_evaluate_set("t", _cb("before_evaluate_set")) + callbacks.register_after_evaluate_set("t", _cb("after_evaluate_set")) + callbacks.register_before_evaluate_case("t", _cb("before_evaluate_case")) + callbacks.register_after_evaluate_case("t", _cb("after_evaluate_case")) + + async def call_agent(query: str) -> str: + return "world" + + await AgentEvaluator.evaluate_eval_set( + eval_set, + call_agent=call_agent, + eval_config=eval_config, + callbacks=callbacks, + ) + + assert { + "before_inference_set", + "after_inference_set", + "before_inference_case", + "after_inference_case", + "before_evaluate_set", + "after_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + }.issubset(set(points)) diff --git a/tests/evaluation/test_remote_eval_service.py b/tests/evaluation/test_remote_eval_service.py new file mode 100644 index 0000000..77d0318 --- /dev/null +++ b/tests/evaluation/test_remote_eval_service.py @@ -0,0 +1,231 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. +"""TDD tests for RemoteEvalService.""" + +from __future__ import annotations + +import asyncio + +import pytest + +import trpc_agent_sdk.runners # noqa: F401 + +from trpc_agent_sdk.evaluation import CallbackResult +from trpc_agent_sdk.evaluation import Callbacks +from trpc_agent_sdk.evaluation import EvalCase +from trpc_agent_sdk.evaluation import EvalMetric +from trpc_agent_sdk.evaluation import EvalSet +from trpc_agent_sdk.evaluation import InMemoryEvalSetsManager +from trpc_agent_sdk.evaluation import InferenceConfig +from trpc_agent_sdk.evaluation import InferenceRequest +from trpc_agent_sdk.evaluation import InferenceStatus +from trpc_agent_sdk.evaluation import Invocation +from trpc_agent_sdk.evaluation import EvaluateConfig +from trpc_agent_sdk.evaluation import EvaluateRequest +from trpc_agent_sdk.evaluation import EvalStatus +from trpc_agent_sdk.evaluation._remote_eval_service import RemoteEvalService +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + + +def _content(text: str) -> Content: + return Content(parts=[Part(text=text)]) + + +def _invocation(user: str, expected: str | None = None) -> Invocation: + return Invocation( + invocation_id="i", + user_content=_content(user), + final_response=_content(expected) if expected is not None else None, + ) + + +def _make_manager(eval_set: EvalSet, app_name: str = "app") -> InMemoryEvalSetsManager: + mgr = InMemoryEvalSetsManager() + mgr.create_eval_set(app_name=app_name, eval_set_id=eval_set.eval_set_id) + for case in eval_set.eval_cases: + mgr.add_eval_case(app_name=app_name, eval_set_id=eval_set.eval_set_id, eval_case=case) + return mgr + + +@pytest.mark.asyncio +async def test_perform_inference_async_callable_one_turn(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def 
call_agent(query: str) -> str: + assert query == "hello" + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=2)) + + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 1 + assert results[0].status == InferenceStatus.SUCCESS + assert results[0].inferences is not None + assert results[0].inferences[0].final_response is not None + assert results[0].inferences[0].final_response.parts[0].text == "world" + + +def test_reject_sync_callable_raises_value_error(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + def call_agent(query: str) -> str: + return query + + with pytest.raises(ValueError, match="async function"): + RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + + +@pytest.mark.asyncio +async def test_reject_trace_cases_raises_value_error(): + trace_case = EvalCase( + eval_id="trace_case", + eval_mode="trace", + actual_conversation=[_invocation("u", "a")], + ) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[trace_case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return query + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + + with pytest.raises(ValueError, match="trace_case"): + _ = [r async for r in service.perform_inference(req)] + + +@pytest.mark.asyncio +async def test_reject_tool_trajectory_metric_raises_value_error(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + evaluate_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="tool_trajectory_avg_score", threshold=1.0)], + ), + ) + + with pytest.raises(ValueError, match="tool_trajectory_avg_score"): + _ = [r async for r in service.evaluate(evaluate_req)] + + +@pytest.mark.asyncio +async def test_case_fail_soft_when_call_agent_raises(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + + async def call_agent(query: str) -> str: + raise RuntimeError("boom") + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 1 + assert results[0].status == InferenceStatus.FAILURE + assert results[0].error_message == "boom" + + +@pytest.mark.asyncio +async def test_case_parallel_turn_serial(): + case1 = EvalCase( + eval_id="c1", + conversation=[_invocation("q1", "a1"), _invocation("q2", "a2")], + ) + case2 = EvalCase( + eval_id="c2", + conversation=[_invocation("x1", "y1"), _invocation("x2", "y2")], + ) + 
eval_set = EvalSet(eval_set_id="s1", eval_cases=[case1, case2]) + mgr = _make_manager(eval_set) + call_order: list[str] = [] + lock = asyncio.Lock() + + async def call_agent(query: str) -> str: + async with lock: + call_order.append(query) + await asyncio.sleep(0.01) + return f"resp:{query}" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=2)) + results = [r async for r in service.perform_inference(req)] + + assert len(results) == 2 + # Per-case should remain serial; globally interleaving is allowed. + c1_order = [q for q in call_order if q in {"q1", "q2"}] + c2_order = [q for q in call_order if q in {"x1", "x2"}] + assert c1_order == ["q1", "q2"] + assert c2_order == ["x1", "x2"] + + +@pytest.mark.asyncio +async def test_callbacks_all_nodes_called(): + case = EvalCase(eval_id="c1", conversation=[_invocation("hello", "world")]) + eval_set = EvalSet(eval_set_id="s1", eval_cases=[case]) + mgr = _make_manager(eval_set) + points: list[str] = [] + + def _cb(name: str): + async def _fn(_ctx: dict, _args: object): + points.append(name) + return CallbackResult(context={"point": name}) + return _fn + + callbacks = Callbacks() + callbacks.register_before_inference_set("t", _cb("before_inference_set")) + callbacks.register_after_inference_set("t", _cb("after_inference_set")) + callbacks.register_before_inference_case("t", _cb("before_inference_case")) + callbacks.register_after_inference_case("t", _cb("after_inference_case")) + callbacks.register_before_evaluate_set("t", _cb("before_evaluate_set")) + callbacks.register_after_evaluate_set("t", _cb("after_evaluate_set")) + callbacks.register_before_evaluate_case("t", _cb("before_evaluate_case")) + callbacks.register_after_evaluate_case("t", _cb("after_evaluate_case")) + + async def call_agent(query: str) -> str: + return "world" + + service = RemoteEvalService(call_agent=call_agent, eval_sets_manager=mgr, callbacks=callbacks) + req = InferenceRequest(app_name="app", eval_set_id="s1", inference_config=InferenceConfig(parallelism=1)) + inference_results = [r async for r in service.perform_inference(req)] + eval_req = EvaluateRequest( + inference_results=inference_results, + evaluate_config=EvaluateConfig( + eval_metrics=[EvalMetric(metric_name="final_response_avg_score", threshold=1.0)], + ), + ) + eval_results = [r async for r in service.evaluate(eval_req)] + + assert len(eval_results) == 1 + assert eval_results[0].final_eval_status == EvalStatus.PASSED + assert { + "before_inference_set", + "after_inference_set", + "before_inference_case", + "after_inference_case", + "before_evaluate_set", + "after_evaluate_set", + "before_evaluate_case", + "after_evaluate_case", + }.issubset(set(points)) diff --git a/trpc_agent_sdk/evaluation/__init__.py b/trpc_agent_sdk/evaluation/__init__.py index 1d4eb20..4c87990 100644 --- a/trpc_agent_sdk/evaluation/__init__.py +++ b/trpc_agent_sdk/evaluation/__init__.py @@ -166,6 +166,8 @@ from ._llm_judge import WeightedMajorityModelsAggregator from ._llm_judge import get_builtin_models_aggregator from ._local_eval_service import LocalEvalService +from ._remote_eval_service import CallAgent +from ._remote_eval_service import RemoteEvalService from ._local_eval_set_results_manager import LocalEvalSetResultsManager from ._local_eval_sets_manager import LocalEvalSetsManager from ._local_eval_sets_manager import load_eval_set_from_file @@ -297,6 +299,8 @@ "ResponseScorerFn", "SamplesAggregatorFn", 
"LocalEvalService", + "RemoteEvalService", + "CallAgent", "RougeEvaluator", "StaticUserSimulator", "TrajectoryEvaluator", diff --git a/trpc_agent_sdk/evaluation/_agent_evaluator.py b/trpc_agent_sdk/evaluation/_agent_evaluator.py index 09e1ed3..f4d1da6 100644 --- a/trpc_agent_sdk/evaluation/_agent_evaluator.py +++ b/trpc_agent_sdk/evaluation/_agent_evaluator.py @@ -49,6 +49,8 @@ from trpc_agent_sdk.agents import BaseAgent from ._local_eval_service import LocalEvalService +from ._remote_eval_service import CallAgent +from ._remote_eval_service import RemoteEvalService from . import _utils from ._eval_callbacks import Callbacks from ._eval_case import EvalModeTrace @@ -89,6 +91,7 @@ def __init__( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -100,6 +103,7 @@ def __init__( eval_metrics_file_path_or_dir: Optional[str] = None, ): self._agent_module = agent_module + self._call_agent = call_agent self._eval_dataset_file_path_or_dir = eval_dataset_file_path_or_dir self._num_runs = num_runs self._agent_name = agent_name @@ -115,6 +119,7 @@ def __init__( async def _run(self) -> None: agent_module = self._agent_module + call_agent = self._call_agent eval_dataset_file_path_or_dir = self._eval_dataset_file_path_or_dir num_runs = self._num_runs agent_name = self._agent_name @@ -163,6 +168,7 @@ async def _run(self) -> None: await AgentEvaluator.evaluate_eval_set( eval_set, agent_module=agent_module, + call_agent=call_agent, eval_config=eval_config, num_runs=num_runs_for_set, agent_name=agent_name, @@ -187,7 +193,8 @@ async def _run(self) -> None: _RESULT_HANDLER.print_evaluation_report( all_details=all_details, all_results=all_results, - display_agent_name=agent_name or agent_module or "trace-only", + display_agent_name=agent_name or agent_module + or ("call-agent" if call_agent is not None else "trace-only"), num_runs=num_runs_for_set, ) self._result = EvaluateResult(results_by_eval_set_id=results_by_eval_set_id) @@ -282,6 +289,7 @@ async def evaluate( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -318,6 +326,7 @@ async def evaluate( executer = AgentEvaluator.get_executer( eval_dataset_file_path_or_dir, agent_module=agent_module, + call_agent=call_agent, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -335,6 +344,7 @@ def get_executer( eval_dataset_file_path_or_dir: str, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, @@ -374,6 +384,7 @@ def get_executer( return _EvalExecuter( eval_dataset_file_path_or_dir, agent_module=agent_module, + call_agent=call_agent, num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, @@ -431,6 +442,7 @@ async def evaluate_eval_set( eval_set: EvalSet, *, agent_module: Optional[str] = None, + call_agent: Optional[CallAgent] = None, eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, @@ -476,15 +488,20 @@ async def evaluate_eval_set( if eval_config is None: raise ValueError("`eval_config` is required.") + if call_agent is not None and agent_module is not None: + raise ValueError("call_agent 
is mutually exclusive with agent_module.") + if call_agent is not None and runner is not None: + raise ValueError("call_agent is mutually exclusive with runner.") + trace_only = AgentEvaluator._is_trace_only(eval_set) - if agent_module is None and not trace_only: + if call_agent is None and agent_module is None and not trace_only: non_trace_ids = [case.eval_id for case in eval_set.eval_cases if case.eval_mode != EvalModeTrace] raise ValueError("`agent_module` is required unless every case in eval_set uses " "eval_mode='trace'. Non-trace case ids: " f"{non_trace_ids}") agent_for_eval: Optional[BaseAgent] = None - if agent_module is not None: + if agent_module is not None and call_agent is None: agent_for_eval = await AgentEvaluator._get_agent_for_eval(module_name=agent_module, agent_name=agent_name) eval_metrics = eval_config.get_eval_metrics() @@ -502,11 +519,12 @@ async def evaluate_eval_set( case_parallelism=case_parallelism, case_eval_parallelism=case_eval_parallelism, callbacks=callbacks, + call_agent=call_agent, ) # Step 2: Post-process the results failures: list[str] = [] - display_agent_name = agent_name or agent_module or "trace-only" + display_agent_name = agent_name or agent_module or ("call-agent" if call_agent is not None else "trace-only") details_lines: list[str] = [] result_lines: list[str] = [] @@ -771,6 +789,7 @@ async def _get_eval_results_by_eval_id( eval_metrics: list, num_runs: int, user_simulator_provider, + call_agent: Optional[CallAgent] = None, eval_set_results_manager: Optional[Any] = None, runner: Optional[Runner] = None, case_parallelism: Optional[int] = None, @@ -811,14 +830,23 @@ async def _get_eval_results_by_eval_id( # app_name: evalset.app_name or configured default (case session_input.app_name overrides per case) request_app_name = eval_set.app_name or DEFAULT_EVAL_APP_NAME - eval_service = LocalEvalService( - root_agent=agent_for_eval, - eval_sets_manager=AgentEvaluator._get_eval_sets_manager(app_name=request_app_name, eval_set=eval_set), - user_simulator_provider=user_simulator_provider, - eval_set_results_manager=eval_set_results_manager, - runner=runner, - callbacks=callbacks, - ) + eval_sets_manager = AgentEvaluator._get_eval_sets_manager(app_name=request_app_name, eval_set=eval_set) + if call_agent is not None: + eval_service = RemoteEvalService( + call_agent=call_agent, + eval_sets_manager=eval_sets_manager, + eval_set_results_manager=eval_set_results_manager, + callbacks=callbacks, + ) + else: + eval_service = LocalEvalService( + root_agent=agent_for_eval, + eval_sets_manager=eval_sets_manager, + user_simulator_provider=user_simulator_provider, + eval_set_results_manager=eval_set_results_manager, + runner=runner, + callbacks=callbacks, + ) inference_config = (InferenceConfig( parallelism=case_parallelism) if case_parallelism is not None else InferenceConfig()) diff --git a/trpc_agent_sdk/evaluation/_remote_eval_service.py b/trpc_agent_sdk/evaluation/_remote_eval_service.py new file mode 100644 index 0000000..25a4319 --- /dev/null +++ b/trpc_agent_sdk/evaluation/_remote_eval_service.py @@ -0,0 +1,466 @@ +# Tencent is pleased to support the open source community by making tRPC-Agent-Python available. +# +# Copyright (C) 2026 Tencent. All rights reserved. +# +# tRPC-Agent-Python is licensed under Apache-2.0. 
+"""Remote (black-box) eval service driven by async call_agent(query)->str.""" + +from __future__ import annotations + +import asyncio +import inspect +import time +import uuid +from typing import Any +from typing import AsyncGenerator +from typing import Awaitable +from typing import Callable +from typing import Optional +from typing_extensions import override + +from trpc_agent_sdk.log import error as log_error +from trpc_agent_sdk.types import Content +from trpc_agent_sdk.types import Part + +from ._eval_callbacks import Callbacks +from ._eval_callbacks import CallbacksRunner +from ._eval_callbacks import EvalSetRunResult +from ._eval_case import EvalCase +from ._eval_case import EvalModeTrace +from ._eval_case import Invocation +from ._eval_metrics import EvalMetric +from ._eval_metrics import EvalStatus +from ._eval_result import EvalCaseResult +from ._eval_result import EvalMetricResult +from ._eval_result import EvalMetricResultDetails +from ._eval_result import EvalMetricResultPerInvocation +from ._eval_result import EvaluationResult +from ._eval_result import PerInvocationResult +from ._eval_service_base import BaseEvalService +from ._eval_service_base import EvaluateConfig +from ._eval_service_base import EvaluateRequest +from ._eval_service_base import InferenceRequest +from ._eval_service_base import InferenceResult +from ._eval_service_base import InferenceStatus +from ._eval_set_results_manager_base import EvalSetResultsManager +from ._eval_sets_manager_base import EvalSetsManager +from ._evaluator_registry import EVALUATOR_REGISTRY +from ._evaluator_registry import EvaluatorRegistry + +CallAgent = Callable[[str], Awaitable[str]] +REMOTE_EVAL_INCOMPATIBLE_METRICS: frozenset[str] = frozenset({ + "tool_trajectory_avg_score", +}) +EVAL_SESSION_ID_PREFIX = "___remote_eval___session___" + + +def _get_session_id() -> str: + return f"{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}" + + +class RemoteEvalService(BaseEvalService): + """Eval service for remote/black-box agents via call_agent.""" + + def __init__( + self, + call_agent: CallAgent, + eval_sets_manager: EvalSetsManager, + evaluator_registry: Optional[EvaluatorRegistry] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, + session_id_supplier: Callable[[], str] = _get_session_id, + callbacks: Optional[Callbacks] = None, + ): + self._validate_call_agent_is_async(call_agent) + self._call_agent = call_agent + self._eval_sets_manager = eval_sets_manager + self._evaluator_registry = evaluator_registry or EVALUATOR_REGISTRY + self._eval_set_results_manager = eval_set_results_manager + self._session_id_supplier = session_id_supplier + self._callbacks_runner = CallbacksRunner(callbacks or Callbacks()) + + @staticmethod + def _validate_call_agent_is_async(call_agent: Any) -> None: + if not callable(call_agent): + raise ValueError("call_agent must be callable.") + if not inspect.iscoroutinefunction(call_agent): + raise ValueError("call_agent must be an async function: async def call_agent(query: str) -> str") + + @staticmethod + def _user_content_to_str(content: Content) -> str: + parts = getattr(content, "parts", []) or [] + chunks: list[str] = [] + for part in parts: + text = getattr(part, "text", None) + if isinstance(text, str): + chunks.append(text) + return "".join(chunks) + + @staticmethod + def _reject_trace_cases(eval_cases: list[EvalCase]) -> None: + trace_ids = [case.eval_id for case in eval_cases if case.eval_mode == EvalModeTrace] + if trace_ids: + raise ValueError(f"call_agent mode is incompatible 
with trace cases: {trace_ids}") + + @override + async def perform_inference( + self, + inference_request: InferenceRequest, + ) -> AsyncGenerator[InferenceResult, None]: + eval_set = self._eval_sets_manager.get_eval_set( + app_name=inference_request.app_name, + eval_set_id=inference_request.eval_set_id, + ) + if not eval_set: + raise ValueError(f"Eval set with id {inference_request.eval_set_id} not found for app " + f"{inference_request.app_name}") + + eval_cases = eval_set.eval_cases + if inference_request.eval_case_ids: + eval_cases = [c for c in eval_cases if c.eval_id in inference_request.eval_case_ids] + self._reject_trace_cases(eval_cases) + + run_ctx: dict[str, Any] = {} + start_time = time.monotonic() + inference_results_list: list[InferenceResult] = [] + set_error: Optional[Exception] = None + await self._callbacks_runner.run_before_inference_set(inference_request, run_ctx) + semaphore = asyncio.Semaphore(value=inference_request.inference_config.parallelism) + + async def run_one(eval_case: EvalCase) -> InferenceResult: + case_ctx = run_ctx.copy() + session_id = self._session_id_supplier() + await self._callbacks_runner.run_before_inference_case( + inference_request, + eval_case.eval_id, + session_id, + case_ctx, + ) + case_start = time.monotonic() + async with semaphore: + result = await self._perform_inference_single_eval_item( + app_name=inference_request.app_name, + eval_set_id=inference_request.eval_set_id, + eval_case=eval_case, + session_id=session_id, + ) + await self._callbacks_runner.run_after_inference_case( + inference_request, + result, + None, + case_start, + case_ctx, + ) + return result + + try: + tasks = [run_one(eval_case) for eval_case in eval_cases] + for coro in asyncio.as_completed(tasks): + inference_result = await coro + inference_results_list.append(inference_result) + yield inference_result + except Exception as e: + set_error = e + raise + finally: + await self._callbacks_runner.run_after_inference_set( + inference_request, + inference_results_list, + set_error, + start_time, + run_ctx, + ) + + async def _perform_inference_single_eval_item( + self, + app_name: str, + eval_set_id: str, + eval_case: EvalCase, + session_id: Optional[str] = None, + ) -> InferenceResult: + if session_id is None: + session_id = self._session_id_supplier() + inference_result = InferenceResult( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_id=eval_case.eval_id, + session_id=session_id, + ) + try: + if not eval_case.conversation: + raise ValueError(f"inference eval case (eval_case_id={eval_case.eval_id}, session_id={session_id}): " + "conversation is required in call_agent mode") + inferences: list[Invocation] = [] + for source_invocation in eval_case.conversation: + query = self._user_content_to_str(source_invocation.user_content) + response_text = await self._call_agent(query) + inferences.append( + Invocation( + invocation_id=source_invocation.invocation_id, + user_content=source_invocation.user_content, + final_response=Content(parts=[Part(text=response_text)]), + intermediate_data=None, + creation_timestamp=time.time(), + )) + inference_result.inferences = inferences + inference_result.status = InferenceStatus.SUCCESS + return inference_result + except Exception as ex: # pylint: disable=broad-except + log_error( + "Inference failed for eval case `%s` with error %s.", + eval_case.eval_id, + ex, + exc_info=True, + ) + inference_result.status = InferenceStatus.FAILURE + inference_result.error_message = str(ex) + return inference_result + + def 
_validate_remote_metric_compat(self, evaluate_config: EvaluateConfig) -> None: + incompatible = sorted({ + metric.metric_name + for metric in evaluate_config.eval_metrics if metric.metric_name in REMOTE_EVAL_INCOMPATIBLE_METRICS + }) + if incompatible: + raise ValueError("call_agent mode does not support metrics: " + f"{incompatible}. Please remove them from EvalConfig.") + + @override + async def evaluate( + self, + evaluate_request: EvaluateRequest, + ) -> AsyncGenerator[EvalCaseResult, None]: + self._validate_remote_metric_compat(evaluate_request.evaluate_config) + run_ctx: dict[str, Any] = {} + start_time = time.monotonic() + eval_case_results_list: list[EvalCaseResult] = [] + set_error: Optional[Exception] = None + ir0 = evaluate_request.inference_results[0] if evaluate_request.inference_results else None + app_name = ir0.app_name if ir0 else "" + eval_set_id = ir0.eval_set_id if ir0 else "" + await self._callbacks_runner.run_before_evaluate_set(evaluate_request, run_ctx) + semaphore = asyncio.Semaphore(value=evaluate_request.evaluate_config.parallelism) + + async def run_one_eval(inference_result: InferenceResult) -> tuple[InferenceResult, EvalCaseResult]: + case_ctx = run_ctx.copy() + await self._callbacks_runner.run_before_evaluate_case( + evaluate_request, + inference_result.eval_case_id, + case_ctx, + ) + case_start = time.monotonic() + async with semaphore: + inference_result, eval_case_result = await self._evaluate_single_inference_result( + inference_result=inference_result, + evaluate_config=evaluate_request.evaluate_config, + ) + await self._callbacks_runner.run_after_evaluate_case( + evaluate_request, + inference_result, + eval_case_result, + None, + case_start, + case_ctx, + ) + return (inference_result, eval_case_result) + + try: + tasks = [run_one_eval(ir) for ir in evaluate_request.inference_results] + for coro in asyncio.as_completed(tasks): + _, eval_case_result = await coro + eval_case_results_list.append(eval_case_result) + yield eval_case_result + if self._eval_set_results_manager and eval_case_results_list and app_name: + sorted_results = sorted(eval_case_results_list, key=lambda r: (r.run_id or 0, r.eval_id)) + self._eval_set_results_manager.save_eval_set_result( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_results=sorted_results, + ) + except Exception as e: + set_error = e + raise + finally: + await self._callbacks_runner.run_after_evaluate_set( + evaluate_request, + EvalSetRunResult( + app_name=app_name, + eval_set_id=eval_set_id, + eval_case_results=eval_case_results_list, + ), + set_error, + start_time, + run_ctx, + ) + + async def _evaluate_single_inference_result( + self, + inference_result: InferenceResult, + evaluate_config: EvaluateConfig, + ) -> tuple[InferenceResult, EvalCaseResult]: + eval_case = self._eval_sets_manager.get_eval_case( + app_name=inference_result.app_name, + eval_set_id=inference_result.eval_set_id, + eval_case_id=inference_result.eval_case_id, + ) + if eval_case is None: + raise ValueError(f"Eval case with id {inference_result.eval_case_id} not found for " + f"app {inference_result.app_name} and eval set {inference_result.eval_set_id}.") + + expected_invocations = self._build_expected_invocations_for_eval(eval_case) + eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation] = [] + overall_eval_metric_results: list[EvalMetricResult] = [] + + if inference_result.inferences: + for idx, actual in enumerate(inference_result.inferences): + expected = None + if expected_invocations and idx < 
len(expected_invocations): + expected = expected_invocations[idx] + eval_metric_result_per_invocation.append( + EvalMetricResultPerInvocation( + actual_invocation=actual, + expected_invocation=expected, + eval_metric_results=[], + )) + + case_error_message: Optional[str] = inference_result.error_message + if inference_result.status == InferenceStatus.FAILURE: + for eval_metric in evaluate_config.eval_metrics: + overall_eval_metric_results.append( + EvalMetricResult( + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + )) + for invocation in eval_metric_result_per_invocation: + invocation.eval_metric_results.append( + EvalMetricResult( + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + score=None, + eval_status=EvalStatus.NOT_EVALUATED, + )) + return ( + inference_result, + EvalCaseResult( + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + run_id=getattr(inference_result, "run_id", None), + final_eval_status=EvalStatus.NOT_EVALUATED, + error_message=case_error_message, + overall_eval_metric_results=overall_eval_metric_results, + eval_metric_result_per_invocation=eval_metric_result_per_invocation, + session_id=inference_result.session_id or "", + session_details=None, + user_id=None, + ), + ) + + for eval_metric in evaluate_config.eval_metrics: + try: + evaluation_result = await self._evaluate_metric( + eval_metric=eval_metric, + actual_invocations=inference_result.inferences or [], + expected_invocations=expected_invocations, + ) + except Exception as e: # pylint: disable=broad-except + if case_error_message is None: + case_error_message = str(e) + log_error( + "Metric evaluation failed for metric `%s` for eval case id '%s' with following error `%s`", + eval_metric.metric_name, + inference_result.eval_case_id, + e, + exc_info=True, + ) + evaluation_result = EvaluationResult(overall_eval_status=EvalStatus.NOT_EVALUATED) + + reasons = [pr.reason for pr in evaluation_result.per_invocation_results if pr.reason is not None] + rubric_scores: list[Any] = [] + for pr in evaluation_result.per_invocation_results: + if pr.rubric_scores: + rubric_scores.extend(pr.rubric_scores) + overall_reason = ";".join(reasons) if reasons else None + overall_rubric = rubric_scores if rubric_scores else None + overall_eval_metric_results.append( + EvalMetricResult( + score=evaluation_result.overall_score, + eval_status=evaluation_result.overall_eval_status, + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + details=EvalMetricResultDetails( + reason=overall_reason, + score=evaluation_result.overall_score, + rubric_scores=overall_rubric, + ) if (overall_reason is not None or overall_rubric is not None) else None, + )) + + for idx, invocation in enumerate(eval_metric_result_per_invocation): + if idx < len(evaluation_result.per_invocation_results): + invocation_result = evaluation_result.per_invocation_results[idx] + else: + invocation_result = PerInvocationResult(actual_invocation=invocation.actual_invocation) + invocation.eval_metric_results.append( + EvalMetricResult( + score=invocation_result.score, + eval_status=invocation_result.eval_status, + metric_name=eval_metric.metric_name, + threshold=eval_metric.threshold, + criterion=eval_metric.criterion, + details=EvalMetricResultDetails( + reason=invocation_result.reason, + score=invocation_result.score, + 
rubric_scores=invocation_result.rubric_scores, + ) if + (invocation_result.reason is not None or invocation_result.rubric_scores is not None) else None, + )) + + eval_case_result = EvalCaseResult( + eval_set_id=inference_result.eval_set_id, + eval_id=inference_result.eval_case_id, + run_id=getattr(inference_result, "run_id", None), + final_eval_status=self._generate_final_eval_status(overall_eval_metric_results), + error_message=case_error_message, + overall_eval_metric_results=overall_eval_metric_results, + eval_metric_result_per_invocation=eval_metric_result_per_invocation, + session_id=inference_result.session_id or "", + session_details=None, + user_id=None, + ) + return (inference_result, eval_case_result) + + async def _evaluate_metric( + self, + eval_metric: EvalMetric, + actual_invocations: list[Invocation], + expected_invocations: Optional[list[Invocation]], + ) -> EvaluationResult: + evaluator = self._evaluator_registry.get_evaluator(eval_metric) + if inspect.iscoroutinefunction(evaluator.evaluate_invocations): + return await evaluator.evaluate_invocations( + actual_invocations=actual_invocations, + expected_invocations=expected_invocations, + ) + return evaluator.evaluate_invocations( + actual_invocations=actual_invocations, + expected_invocations=expected_invocations, + ) + + @staticmethod + def _build_expected_invocations_for_eval(eval_case: EvalCase) -> Optional[list[Invocation]]: + if eval_case.conversation: + return list(eval_case.conversation) + return None + + @staticmethod + def _generate_final_eval_status(overall_eval_metric_results: list[EvalMetricResult]) -> EvalStatus: + final_eval_status = EvalStatus.NOT_EVALUATED + for result in overall_eval_metric_results: + if result.eval_status == EvalStatus.PASSED: + final_eval_status = EvalStatus.PASSED + elif result.eval_status == EvalStatus.FAILED: + return EvalStatus.FAILED + return final_eval_status
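Usage note for reviewers (not part of the diff above): a minimal pytest-style sketch of reaching this service through the documented call_agent parameter. The endpoint URL, response shape, dataset path, and the use of httpx and pytest-asyncio below are illustrative assumptions; only AgentEvaluator.evaluate, eval_dataset_file_path_or_dir, and call_agent come from this patch.

    import httpx
    import pytest

    from trpc_agent_sdk.evaluation import AgentEvaluator


    async def call_agent(query: str) -> str:
        # Bridge to any black-box agent: here, a hypothetical HTTP chat endpoint.
        # Any async str -> str adapter works (CLI subprocess, another framework, ...).
        async with httpx.AsyncClient() as client:
            resp = await client.post("http://localhost:8080/chat", json={"query": query})
            resp.raise_for_status()
            return resp.json()["answer"]


    @pytest.mark.asyncio
    async def test_remote_agent_eval():
        # call_agent is mutually exclusive with agent_module; RemoteEvalService
        # rejects trace-mode cases and the tool_trajectory_avg_score metric.
        await AgentEvaluator.evaluate(
            eval_dataset_file_path_or_dir="tests/fixtures/sample.evalset.json",
            call_agent=call_agent,
        )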