From f455862cd050f03c81c3e1c7df5414fb2b6d0e21 Mon Sep 17 00:00:00 2001 From: Akangsha Goel Date: Wed, 24 Jun 2026 11:12:32 -0700 Subject: [PATCH] feat(mcp-readability): add MCP style-guide compliance eval Add an `mcp_readability` orchestrator to EvalBench that checks MCP endpoints against the Data Cloud MCP style guide and reports compliance metrics (P0/P1/P2 counts, tool count, token budget usage, score). - Tool fetching uses the official `mcp` SDK over Streamable HTTP (`streamable_http_client` + `ClientSession.list_tools`) with URL sanitization and ADC auth; a `file` source supports offline runs. - Tools are rendered to man-page markup (resolves $ref/$defs, flattens nesting, renders enum/default) which feeds the LLM scorer. - Scorer uses Vertex Gemini JSON mode, anchors severity to style-guide priority annotations, supports per-endpoint waivers, evaluates from an LLM-agent-consumption perspective, and runs a dedicated consistency second pass against prior feedback when available. - Results write to a compliance CSV and, when `reporting.bigquery` is configured, append to the existing eval BigQuery table. - Endpoint list mirrors monitored_endpoints.textproto (24 endpoints). Co-Authored-By: Claude Opus 4.8 --- datasets/mcp_readability/endpoints.yaml | 124 ++++ datasets/mcp_readability/exceptions.yaml | 24 + .../model_configs/gemini_2.5_pro_model.yaml | 4 + datasets/mcp_readability/run_config.yaml | 43 ++ datasets/mcp_readability/sample_tools.yaml | 37 + datasets/mcp_readability/style_guide.md | 699 ++++++++++++++++++ datasets/mcp_readability/tools_generator.yaml | 7 + evalbench/evalbench.py | 20 +- evalbench/evaluator/__init__.py | 5 + .../evaluator/mcp_readability/__init__.py | 3 + evalbench/evaluator/mcp_readability/enums.py | 89 +++ .../evaluator/mcp_readability/exceptions.py | 71 ++ .../evaluator/mcp_readability/orchestrator.py | 334 +++++++++ .../mcp_readability/token_estimator.py | 33 + evalbench/generators/models/__init__.py | 2 + .../generators/models/mcp_tool_formatter.py | 341 +++++++++ evalbench/generators/models/mcp_tools.py | 229 ++++++ evalbench/reporting/mcp_readability_csv.py | 98 +++ evalbench/scorers/mcp_style_compliance.py | 360 +++++++++ evalbench/test/mcp_readability_test.py | 413 +++++++++++ pyproject.toml | 4 + 21 files changed, 2938 insertions(+), 2 deletions(-) create mode 100644 datasets/mcp_readability/endpoints.yaml create mode 100644 datasets/mcp_readability/exceptions.yaml create mode 100644 datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml create mode 100644 datasets/mcp_readability/run_config.yaml create mode 100644 datasets/mcp_readability/sample_tools.yaml create mode 100644 datasets/mcp_readability/style_guide.md create mode 100644 datasets/mcp_readability/tools_generator.yaml create mode 100644 evalbench/evaluator/mcp_readability/__init__.py create mode 100644 evalbench/evaluator/mcp_readability/enums.py create mode 100644 evalbench/evaluator/mcp_readability/exceptions.py create mode 100644 evalbench/evaluator/mcp_readability/orchestrator.py create mode 100644 evalbench/evaluator/mcp_readability/token_estimator.py create mode 100644 evalbench/generators/models/mcp_tool_formatter.py create mode 100644 evalbench/generators/models/mcp_tools.py create mode 100644 evalbench/reporting/mcp_readability_csv.py create mode 100644 evalbench/scorers/mcp_style_compliance.py create mode 100644 evalbench/test/mcp_readability_test.py diff --git a/datasets/mcp_readability/endpoints.yaml b/datasets/mcp_readability/endpoints.yaml new file mode 100644 index 00000000..8acd8ee2 --- /dev/null +++ b/datasets/mcp_readability/endpoints.yaml @@ -0,0 +1,124 @@ +# MCP endpoints to check against the Data Cloud MCP style guide. +# +# Mirrors cloud/databases/mcp/readability/monitored_endpoints.textproto +# (EndpointList). The proto `type` maps to `environment`; all live endpoints are +# fetched via the `mcp` source over Streamable HTTP using ADC auth. +# +# `defaults` provides values shared by all endpoints; each endpoint may override +# any of them. `tools_source.type` makes the tool listing pluggable: +# - mcp: fetch tools/list from a live MCP server (official mcp SDK). Needs +# network access and (for these googleapis.com endpoints) ADC auth. +# - file: read a local YAML/JSON tools spec (offline / deterministic). + +defaults: + token_budget: 200000 + endpoint_type: REMOTE # EndpointType: REMOTE | LOCAL + tools_source: + type: mcp # mcp | file + auth: google_credentials # ADC bearer + x-goog-user-project + +endpoints: + - product_name: "AlloyDB for PostgreSQL" + endpoint_url: "https://alloydb.us-central1.rep.googleapis.com/mcp" + environment: PROD + + - product_name: "BigQuery" + endpoint_url: "https://bigquery.googleapis.com/mcp" + environment: PROD + + - product_name: "BigQuery Data Transfer Service" + endpoint_url: "https://bigquerydatatransfer.googleapis.com/mcp" + environment: PROD + + - product_name: "BigQuery Migration Service" + endpoint_url: "https://bigquerymigration.googleapis.com/mcp" + environment: PROD + + - product_name: "Bigtable" + endpoint_url: "https://bigtableadmin.googleapis.com/mcp" + environment: PROD + + - product_name: "Cloud SQL" + endpoint_url: "https://sqladmin.googleapis.com/mcp" + environment: PROD + + - product_name: "Firestore" + endpoint_url: "https://firestore.googleapis.com/mcp" + environment: PROD + + - product_name: "Spanner" + endpoint_url: "https://spanner.googleapis.com/mcp" + environment: PROD + + - product_name: "Data Migration Service" + endpoint_url: "https://datamigration.googleapis.com/mcp" + environment: PROD + + - product_name: "Datastream" + endpoint_url: "https://datastream.googleapis.com/mcp" + environment: PROD + + - product_name: "Oracle Database" + endpoint_url: "https://oracledatabase.googleapis.com/mcp" + environment: PROD + + - product_name: "Memory store for Redis/Redis Cluster" + endpoint_url: "https://redis.googleapis.com/mcp" + environment: PROD + + - product_name: "Memory store for Valkey" + endpoint_url: "https://memorystore.googleapis.com/mcp" + environment: PROD + + - product_name: "Pub/Sub" + endpoint_url: "https://pubsub.googleapis.com/mcp" + environment: PROD + + - product_name: "Managed Kafka" + endpoint_url: "https://managedkafka.us-central1.rep.googleapis.com/mcp" + environment: PROD + + - product_name: "Cloud Composer" + endpoint_url: "https://us-central1-staging-composer.sandbox.googleapis.com/mcp" + environment: STAGING + - product_name: "Cloud Composer" + endpoint_url: "https://composer.us-central1.rep.googleapis.com/mcp" + environment: PROD + + - product_name: "Knowledge Catalog" + endpoint_url: "https://dataplex.googleapis.com/mcp" + environment: PROD + + - product_name: "Knowledge Catalog Data Products" + endpoint_url: "https://staging-dataplex.sandbox.googleapis.com/mcp" + environment: STAGING + + - product_name: "Database Insights (EGO for AlloyDB Tools)" + endpoint_url: "https://autopush-databaseinsights.sandbox.googleapis.com/mcp" + environment: AUTOPUSH + + - product_name: "DB: AI Assisted Troubleshooting" + endpoint_url: "https://databaseinsights.googleapis.com/mcp" + environment: PROD + + - product_name: "Cloud Storage MCP Server" + endpoint_url: "https://storage.googleapis.com/storage/mcp" + environment: PROD + + - product_name: "Database Center API for MCP" + endpoint_url: "https://databasecenter.googleapis.com/mcp" + environment: PROD + + - product_name: "Dataproc" + endpoint_url: "https://dataproc.googleapis.com/mcp" + environment: PROD + + # --- Offline / deterministic sample (no network or auth) ------------------ + # Uncomment to smoke-test the pipeline without hitting live endpoints. + # - product_name: "Sample Data Cloud MCP (local file)" + # endpoint_url: "file://datasets/mcp_readability/sample_tools.yaml" + # endpoint_type: LOCAL + # environment: DEV + # tools_source: + # type: file + # path: datasets/mcp_readability/sample_tools.yaml diff --git a/datasets/mcp_readability/exceptions.yaml b/datasets/mcp_readability/exceptions.yaml new file mode 100644 index 00000000..1b5aaebb --- /dev/null +++ b/datasets/mcp_readability/exceptions.yaml @@ -0,0 +1,24 @@ +# Style-rule exceptions (waivers). +# +# Each entry waives one style `rule_id` for the endpoints it matches. A waived +# rule is NOT counted toward p0/p1/p2 issues and is reported separately under +# "waived" in the LLM feedback. +# +# Matchers (all that are present must match; an absent field or "*" = match-all): +# - endpoint_url +# - product_name +# - environment + +# `rule_id` should match the style guide's section heading text (the scorer uses +# headings as rule IDs), e.g. "Tool Names", "Concise Descriptions", +# "Use Enums", "Limit Options". +exceptions: + # Example: waive the tool-naming rule for one specific endpoint. + - product_name: "Sample Data Cloud MCP (local file)" + rule_id: "Tool Names" + reason: "Sample fixture intentionally uses a non-compliant tool name (RunQuery)." + + # Example: waive the enum rule for every autopush endpoint. + - environment: AUTOPUSH + rule_id: "Use Enums" + reason: "Autopush builds defer enum constraints until promotion to prod." diff --git a/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml b/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml new file mode 100644 index 00000000..26eb0b7f --- /dev/null +++ b/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml @@ -0,0 +1,4 @@ +generator: gcp_vertex_gemini +vertex_model: gemini-2.5-pro +base_prompt: "" +execs_per_minute: 5 diff --git a/datasets/mcp_readability/run_config.yaml b/datasets/mcp_readability/run_config.yaml new file mode 100644 index 00000000..2f7c3af1 --- /dev/null +++ b/datasets/mcp_readability/run_config.yaml @@ -0,0 +1,43 @@ +############################################################ +### MCP Style-Guide Compliance (Readability) Run Config +############################################################ +# Drives the `mcp_readability` orchestrator: fetch each endpoint's tools (man-page +# markup), evaluate it against the style guide with an LLM, and write a compliance +# CSV (and optionally append the rows to the eval BigQuery table). + +orchestrator: mcp_readability + +# Inputs +endpoints_config: datasets/mcp_readability/endpoints.yaml +style_guide: datasets/mcp_readability/style_guide.md +exceptions_config: datasets/mcp_readability/exceptions.yaml # optional +tools_generator_config: datasets/mcp_readability/tools_generator.yaml + +# Default token budget for token_budget_used_percent (endpoints may override). +token_budget: 200000 + +# Optional: path to a prior compliance CSV; prior feedback is fed to the LLM to +# keep findings consistent run-to-run. +previous_results_csv: null + +# Optional: only check endpoints whose environment is in this list, e.g. [PROD]. +environments: [] + +# LLM used by the compliance scorer. +scorers: + mcp_style_compliance: + model_config: datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml + +# Endpoint check concurrency. +runners: + endpoint_runners: 4 + +# Output: results//mcp_readability_compliance.csv +# Add a `bigquery` block to also append the compliance rows to the standard eval +# table (.evalbench.results); the schema auto-evolves to add the +# compliance columns. No new dataset/table is created. +reporting: + csv: + output_directory: 'results' + # bigquery: + # gcp_project_id: senseai-team diff --git a/datasets/mcp_readability/sample_tools.yaml b/datasets/mcp_readability/sample_tools.yaml new file mode 100644 index 00000000..6a2ea490 --- /dev/null +++ b/datasets/mcp_readability/sample_tools.yaml @@ -0,0 +1,37 @@ +# Sample MCP tools spec for offline, deterministic testing of the readability +# check (tools_source.type: file). Intentionally mixes good and bad style so the +# LLM has something to flag. +tools: + - name: list_datasets + description: "List all BigQuery datasets in the given project. Use this to discover available datasets before querying." + inputSchema: + type: object + properties: + project_id: + type: string + description: "The Google Cloud project ID to list datasets from." + required: ["project_id"] + + - name: RunQuery + description: "runs sql" + inputSchema: + type: object + properties: + q: + type: string + + - name: delete_table + description: "Deletes a table." + inputSchema: + type: object + properties: + project_id: + type: string + description: "Project ID." + dataset_id: + type: string + description: "Dataset ID." + table_id: + type: string + description: "Table ID." + required: ["project_id", "dataset_id", "table_id"] diff --git a/datasets/mcp_readability/style_guide.md b/datasets/mcp_readability/style_guide.md new file mode 100644 index 00000000..7df1bf36 --- /dev/null +++ b/datasets/mcp_readability/style_guide.md @@ -0,0 +1,699 @@ +# Data Cloud MCP Tool Style Guide + +[TOC] + +* **Go link:** + [go/mcp-datacloud-styleguide](http://go/mcp-datacloud-styleguide) + +## Style Principles + +This document provides style guidelines and best practices for Data Cloud teams +building MCP (Model Context Protocol) tools. + +> [!NOTE] The tool descriptions and examples in this guide are generated using +> the `experimental/users/kvg/mcp_readability` utility. You can use this tool to +> view the API of your own MCP tools in a readable format by running it against +> your MCP server endpoint: +> +> ```bash +> blaze run //experimental/users/kvg/mcp_readability:mcp_list_tools -- \ +> --url="https://your-mcp-server.googleapis.com/mcp" \ +> --auth \ +> --project="your-gcp-project" +> ``` +> +> You can use the following tool to get LLM generated recommendations (AI +> evaluation) based on this style guide for your MCP tools by running it against +> your MCP server endpoint. This tool requires a Gemini API key to run the AI +> evaluation: +> +> ```bash +> blaze run cloud/databases/mcp/readability:run_mcp_readability_check -- \ +> --gemini_api_key check_url \ +> --url "https://your-mcp-server.googleapis.com/mcp" \ +> [--tools ] [--auth] [--project ] +> ``` + +There are a few overarching principles that summarize how to think about +building good MCP tools. The following are attributes of good tools, in order of +importance: + +1. **Clarity**: The tool's purpose, parameters, and error messages are clear to + the reasoning engine. +2. **Simplicity**: Tools and their parameters are simple enough to reduce + cognitive load on the model, minimize hallucination risks, and avoid + excessive token usage, while remaining comprehensive enough to complete a + task. +3. **Security**: Tools are secure by default, do not expose secrets, and use + granular protections. +4. **Maintainability**: Tools are built in a consistent manner, relying on + standardized platforms, making them easier to maintain across teams. + +## Platform & Constraints + +### Platform to build tools + +All teams within data cloud **MUST** be using either OneMCP or MCP Toolbox for +tool building. Bespoke solutions are **NOT** supported. + +### Tool Count Limits + +The current rule of thumb is to keep it to 40 tools per server as an upper +limit, though even this amount may cause performance issues. Performance +degrades as more tools are added, so teams **SHOULD** heavily weigh adding new +tools against the negative impact on tool accuracy until other mechanisms are in +place to deal with this. + +In the future, we aspire for this number to be reduced to 5-8 tools per server +as toolsets/skills capabilities improve in MCP Toolbox and OneMCP, though this +is not a requirement today. Until overall platform tool handling improves, teams +**SHOULD** minimize tool count as much as possible. + +## Tool Design & API + +MCP tools **MUST** be high quality, reliable, and easy for agents to use. + +### Tool Names + +Teams **MUST** use `snake_case` (`_`) for tool names. You +**SHOULD NOT** use product-specific prefixes, as the agent can automatically +disambiguate by prefixing the tool name with the MCP server name. + +```text {.bad} +================================================================================ +TOOL: my_service_create_instance +================================================================================ +================================================================================ +TOOL: my_service_execute_sql +================================================================================ +``` + +```text {.good} +================================================================================ +TOOL: create_instance +================================================================================ +================================================================================ +TOOL: execute_sql +================================================================================ +``` + +All new tools **MUST** be registered in the +[Data Cloud OneMCP: Tools and Commitment Dashboard](https://docs.google.com/spreadsheets/d/1rXWhXONd5xqCpU-oJJ8eTSWtL4QVm9yuavjur-p_NXY){.external}. + +### Keep tools CUJ-focused + +Tools **SHOULD** be focused on a specific CUJ or goal, like creating a resource +or fetching monitoring data. Aim for tools that are granular enough to be easily +understood and used by agents, but comprehensive enough to complete a task in +one go if possible. For example, if a CUJ requires multiple API calls, consider +creating one tool that orchestrates these calls to reduce chances of multi-tool +failure. + +However, avoid bundling unrelated actions into a single tool. + +```text {.bad} +================================================================================ +TOOL: manage_instance +================================================================================ +DESCRIPTION: + Creates, deletes, or retrieves a database instance depending on the action + parameter. + +PARAMETERS: + action (string) [REQUIRED] + Required. The action to perform (create, delete, get). + + name (string) [REQUIRED] + Required. Name of the database instance. This does not include the + project ID. +``` + +```text {.good} +================================================================================ +TOOL: get_instance +================================================================================ +DESCRIPTION: + Get the details of a Cloud SQL instance. + +PARAMETERS: + instance (string) [REQUIRED] + Required. Database instance ID. This does not include the project_id. + + project_id (string) [REQUIRED] + Required. Project ID of the project that contains the instance. +``` + +### Idempotency + +Whenever possible, tools **SHOULD** be idempotent. + +**Why?** Calling the same tool with the same parameters multiple times should +have the same result as calling it once. This is especially important for create +and delete operations. For example, if an agent calls `create_resource` multiple +times with the same parameters, only one resource should be created. If the +resource already exists, the tool should return a success status code rather +than an error code (e.g. `ALREADY_EXISTS`). Returning success on successive +attempts allows agents to safely retry operations without causing tool execution +to fail. It is acceptable for the tool response to indicate that the resource +already existed rather than being created by the current call. + +### Parameters + +Parameters **MUST** have clear and meaningful names. Avoid ambiguity that could +lead to accidental insecure configurations or accidental destructive operations. + +#### Naming Convention + +Teams should use `snake_case` for all parameter names. This aligns with Python +naming conventions (the most-used language for MCP servers) and provides a +predictable, standardized interface for the reasoning engine. + +```text {.bad} +PARAMETERS: + projectId (string) [REQUIRED] + Required. Project ID of the project. +``` +```text {.good} +PARAMETERS: + project_id (string) [REQUIRED] + Required. Project ID of the project. +``` + +#### Understandability + +A good rule of thumb is: if a human cannot easily understand a parameter without +reading the documentation, the name is likely too ambiguous for a model as well. + +Directly mapping to existing enums or field names from underlying OnePlatform +APIs is often an antipattern. These internal names are typically designed for +system-to-system communication and backwards compatibility, not for semantic +understanding by an LLM. + +```text {.bad} +PARAMETERS: + write_disposition (string) [OPTIONAL] + Optional. Specifies the action that occurs if the destination table + already exists. Can be ['WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY']. +``` + +```text {.good} +PARAMETERS: + if_table_exists (string) [OPTIONAL] + Optional. Action to take if the destination table already exists. + Can be ['overwrite', 'append', 'fail']. +``` + +**Explain Domain Terminology** Do not assume the agent understands your +specific product's architecture or internal naming conventions. If a tool relies +on domain-specific concepts (e.g., "Exadata," "Static-IP Connectivity," +"Statement Suggestions"), you MUST provide a brief, plain-English explanation of +what the concept is and *why* the agent would use it. + +To prevent tool descriptions from becoming excessively long, keep these +explanations concise. + +Create evaluations +([evalbench](http://g3doc/cloud/databases/mcp/g3doc/evalbench_overview.md)) +to understand how well an agent handles domain terminology, and to test what +level of explanations you need to include in the tool description. + +#### Query Languages and Custom Filters + +If a tool takes a custom query string as an input parameter (e.g., AIP-160, +PromQL, EBNF syntax): + +1. **Provide Concrete Examples**: You MUST include 2-3 fully-formed examples of + valid query strings in the parameter description. + +2. **Explain Literals**: If the filter requires specific internal string + literals, list the most common valid literals directly in the description. + +3. **Iterative Discovery**: If the domain is too large to document in the + description (e.g., thousands of PromQL metrics), you SHOULD provide a + secondary "discovery" tool(e.g., `list_available_metrics`) so the agent can + look up the correct literals before composing its filter. + +Evaluating agent performance on query generation is crucial for understanding +how much detail and how many examples are necessary to include in tool +descriptions. You SHOULD use evaluations to test how well agents handle query +languages and custom filters with varying levels of detail in the description. + +* *Note*: A fuller explanation of complex query languages might be better + suited as a separate skill. + +**Examples:** + +```text {.good} +PARAMETERS: + filter (string) [OPTIONAL] + Optional. An AIP-160 filter string. Examples: + - 'labels.env = "prod" AND labels.team = "data"' + - 'create_time > "2024-01-01T00:00:00Z"' +``` + +```text {.good} +PARAMETERS: + query (string) [REQUIRED] + Required. A PromQL query. Examples: + - 'up{job="kubernetes-pods"}' + - 'sum(rate(http_requests_total[5m]))' +``` + +#### Avoid complex parameters + +Avoid complex parameter types like large, nested protocol buffers or overly +complex dictionaries. These increase the likelihood of parsing errors, consume +unnecessary tokens, and cause model confusion. Stick to simple primitives +(strings, integers, booleans) wherever possible. Simple maps (e.g., for labels +or simple key-value configurations) are acceptable if they are well-described +and do not introduce large, nested proto structures. + +```text {.good} +// GOOD: Simple maps for flat key-value data +PARAMETERS: + labels (object) [OPTIONAL] + Optional. Labels to apply to the instance. A simple map of string keys + to string values (e.g., {"env": "prod", "team": "data"}). +``` + +**Opinionated Defaults for Resource Creation** When wrapping `Create` endpoints +for complex resources, DO NOT expose the entire OnePlatform proto schema as +tool parameters. Instead, provide sensible and secure defaults for the +resource configuration on the server side, and only expose parameters for +the 5 to 8 most critical settings the agent needs to configure +(e.g., `machine_type`, `region`, `memory_gb`). If providing sensible defaults +is not sufficient to reduce complexity, you **SHOULD** split functionality into +separate, purpose-built MCP tools rather than creating one complex tool. + +```text {.bad} +// BAD: Exposing a massive, complex proto object +PARAMETERS: + cluster_config (object) [REQUIRED] + Required. The complete cluster configuration proto, including nested + fields for master_config, worker_config, software_config, etc. +``` + +```text {.good} +// GOOD: Opinionated defaults and simple parameters +PARAMETERS: + cluster_name (string) [REQUIRED] + Required. The name of the cluster. + + node_count (integer) [OPTIONAL] + Optional. The number of worker nodes. Defaults to 3. + + machine_type (string) [OPTIONAL] + Optional. The machine type to use for nodes. Defaults to 'n1-standard-4'. +``` + +**Restructuring Complex Protos** Instead of passing a +complex `Settings` proto for instance creation, flatten the critical +fields: + +```text {.bad} +PARAMETERS: + settings (object) [OPTIONAL] + Optional. Instance settings proto containing tier, pricingPlan, + ipConfiguration (which itself contains authorizedNetworks, ipv4Enabled, etc). +``` + +```text {.good} +PARAMETERS: + tier (string) [REQUIRED] + Required. The service tier (e.g., 'db-custom-1-3840'). + + enable_public_ip (boolean) [OPTIONAL] + Optional. Whether to enable public IP. Defaults to false. + + authorized_networks (array) [OPTIONAL] + Optional. List of authorized CIDR blocks. +``` + +#### Hide Useless and Deprecated Fields + +You MUST NOT expose parameters to the agent that it cannot meaningfully use. +Leaking backend or OnePlatform requirements into the AI surface increases +cognitive load and token waste. + +- **Idempotency Tokens & UUIDs**: Do not ask the agent to generate UUIDs or + `request_id`s for idempotency. Handle idempotency and request deduplication + inside the MCP server or adapter layer. These should NOT be included in the + input payload schema. +- **Deprecated Fields**: Strip all deprecated fields from your schemas. + +#### Use Human-Readable Time and Durations + +When designing parameters for dates, times, or durations, you **MUST NOT** use +low-level Unix timestamps (e.g., milliseconds or nanoseconds). Instead, use +higher-order time units (seconds, minutes, hours, days) or standard +human-readable date-time formats. + +LLMs reason much better with formatted dates, times, and higher-order durations +than they do with raw Unix timestamps. Forcing the reasoning engine to calculate +nanoseconds increases the likelihood of hallucination and token waste. + +```text {.bad} +PARAMETERS: + start_time_ns (integer) [OPTIONAL] + Optional. The start time of the query in nanoseconds since the Unix epoch. + + timeout_ms (integer) [OPTIONAL] + Optional. The timeout duration for the job in milliseconds. +``` +```text {.good} +PARAMETERS: + start_time (string) [OPTIONAL] + Optional. The start time of the query in RFC 3339 format + (e.g., "2024-04-09T22:52:14Z"). + + timeout_in_minutes (integer) [OPTIONAL] + Optional. The timeout duration for the job in minutes. +``` + +#### Limit Options + +Aim for a limit of 5 to 8 parameters per tool, while keeping in mind that fewer +parameters is usually better. High parameter counts exponentially decreases tool +call accuracy and can increase token consumption. + +#### Use Enums + +Prefer iterating a list of possible values over free-text strings. This +constrains the model's decision path and reduces token perplexity. + +```text {.good} +status: ['open', 'closed'] +``` + +#### Safe Pagination + +If your tool returns lists of values (for example resources): - +**Cap Page Sizes**: Default `page_size` parameters MUST be capped at an LLM-safe +limit (e.g., 10 to 20 maximum). Aim to keep the return output size +below 5,000 to 10,000 tokens to avoid using up too much of an agent's context +window. + +- **Document Page Tokens**: If exposing a `page_token` parameter, explain + exactly how the agent should use it (e.g., *"Pass the `next_page_token` + received from the previous response to retrieve the next page."*). + +#### Concise Descriptions + +While it can be tempting to treat tools like documentation, avoid adding +redundant information to tool descriptions. + +- **Keep Parameter Docs on the Parameters**: Do not repeat information in the + tool description that belongs in parameter descriptions. For example, do not + embed parameter definitions, formatting rules, or extraneous section titles + (such as "Purpose and Capabilities") within the main tool description. Each + parameter has its own dedicated description field in the schema for this + information. Stating "The request must provide parameter X to get result Y" + in the main description is redundant and wastes tokens. + +```text {.good} +================================================================================ +TOOL: execute_sql +================================================================================ +DESCRIPTION: + Execute any valid SQL statement, including data definition language (DDL), + data control language (DCL), data query language (DQL), or + data manipulation language (DML) statements, on a Cloud SQL instance. + +PARAMETERS: + database (string) [OPTIONAL] + Optional. Name of the database on which the statement will be executed. + For Postgres it's required, for MySQL it's optional. For Postgres, if your + query is not scoped to an existings database, like list databases / create + new database / grant roles, you can pass in default value as postgres. + + instance (string) [REQUIRED] + Required. Database instance ID. This does not include the project_id. + + project_id (string) [REQUIRED] + Required. Project ID of the project that contains the instance. + + sql (string) [REQUIRED] + Required. SQL statements to run on the database. It can be a single + statement or a sequence of statements separated by semicolons. +``` + +#### Avoid Conversational Scripting + +- **Use Imperative Phrasing**: You may use imperative phrasing to guide agent + actions, including asking for user confirmation on critical or destructive + operations (e.g., *"Please explicitly check with the user to ensure they + want to delete all data before executing"*). However, do not overly + prescribe conversational flow (e.g., *"Send each step as a separate + message"*). +- **Do not reference capabilities an agent may not have**: For example, do not + tell the agent to *"sleep for 10 seconds"* or *"navigate to this URL in a + browser"*. While some agents can have these capabilities, asking an agent + who does not may confuse or derail them. Use objective constraints instead + (e.g., *"Wait at least 10 seconds before polling again"*). Tool + descriptions should explain what the tool does, and they should avoid + telling the agent to do things it cannot or to hide things that require + user interaction. + +#### Be Consistent + +Use consistent parameter names for the same concept across different tools. +Decide on one parameter name for project identification (e.g., `project_id`) and +use it across all tools. + +```text {.bad} +PARAMETERS: + project (string) [REQUIRED] + Required. ID of the project. + +# ... +PARAMETERS: + project (string) [REQUIRED] + Required. Name of the project. + +``` + +```text {.good} +PARAMETERS: + instance_id (string) [REQUIRED] + Required. Database instance ID. + + project_id (string) [REQUIRED] + Required. Project ID of the project that contains the instance. +``` + +#### Explicit Consequences + +Explicitly state consequences in parameter names. For destructive or high-cost +operations, parameter names should make consequences clear. + +```text {.bad} + force (boolean) [OPTIONAL] + Optional. Force the deletion. +``` + +```text {.good} +PARAMETERS: + confirm_deletion (boolean) [OPTIONAL] + Optional. Acknowledge that this operation will permanently delete all + clusters and data. This action cannot be undone. +``` + +#### Avoid Templated Strings + +**DO NOT** require the agent to pass parameters in as templated strings (e.g., +`projects/{project_id}/locations/{region}/instances/{resource_name}`). + +This creates an additional formatting step for the agent to construct the string +correctly, which increases the likelihood of hallucination or syntax errors and +potentially lowers the tool's success rate. + +#### Parameter Design Examples + +Tool | Bad Design (Ambiguous/Insecure) | Good Design (Explicit/Secure) +:---------------- | :------------------------------ | :---------------------------- +`deploy_dag` | `force: true` | `acknowledge_overwrite_of_existing_dag_file: true` +`cancel_job` | `immediate: true` | `termination_policy: "FORCE_CANCEL_WITHOUT_DRAIN"` +`instance_delete` | `confirm: true` | `confirm_permanent_deletion_of_all_clusters_and_data: true` +`Metadata Lookup` | `show_all: true` | `include_sensitive_business_metadata: true` +`batch_create` | `use_public_ip: true` | `network_access: "PRIVATE_ONLY"` or `"PUBLIC_INTERNET_ACCESS"` +`database_delete` | `delete: true` | `acknowledge_permanent_database_deletion_and_data_loss: true` +`execute_sql` | `allow_large_results: true` | `acknowledge_potential_high_compute_cost_for_large_results: true` +`duration` | `duration: "4h"` | `requested_access_duration_in_hours: 4` +`justification` | `reason: "fixing bug"` | `detailed_justification_for_elevated_privileges: "b/1234 -- Fixing bug"` + +### Long running operations + +MCP today does not have a standard for how to do long running operations. We +expect that our tools will have to evolve as the MCP standard changes. + +In the meantime, the standard pattern for handling LROs is: + +1. **Initiation**: An MCP tool call that initiates an LRO (e.g., + `create_instance`) **MUST** return immediately with an operation ID. +2. **Polling**: The agent will use a dedicated tool, like `get_operation`, to + poll the status of the LRO using the returned operation ID. Teams **MUST** + provide such a polling tool. +3. **Completion**: The agent will continue polling the tool until the operation + reaches a terminal state (e.g., `SUCCEEDED`, `FAILED`). + +To ensure agents can effectively follow this pattern, clear descriptions and +hints **MUST** be provided within the tool descriptions. This guides the agent +on the expected workflow. + +Example tool descriptions: + +```text {.bad} +================================================================================ +TOOL: create_instance +================================================================================ +DESCRIPTION: + Creates a database instance. +``` + +```text {.good} +================================================================================ +TOOL: create_instance +================================================================================ +DESCRIPTION: + Initiates the creation of a Cloud SQL instance. + + * The tool returns a long-running operation. Use the `get_operation` tool to + poll its status until the operation completes. + * The instance creation operation can take several minutes. Wait at least + 30 seconds before rechecking the status. + +PARAMETERS: + name (string) [REQUIRED] + Required. Name of the Cloud SQL instance. This does not include the + project ID. + + project (string) [REQUIRED] + Required. Project ID of the project to which the newly created Cloud SQL + instances should belong. + +================================================================================ +TOOL: get_operation +================================================================================ +DESCRIPTION: + Retrieves the status of a long-running operation. +``` + +## Error Handling + +### Actionable Error Messaging + +Tools **MUST** return clear and actionable error messages that tell agents what +went wrong, why it went wrong, and how to fix it. When an operation fails, **DO +NOT** return generic error messages like "internal error". + +For more details on writing good error messages, see +[Write actionable error messages](https://developers.google.com/workspace/chat/write-error-messages). + +Here are some examples of good and bad error messages for tools: + +| Tool | Bad error message | Good error message | +| :---------------- | :------------------ | :-------------------------------- | +| `create_instance` | `Invalid request` | `Failed to create instance: The | +: : : instance name 'my-instance' is : +: : : already in use. Please choose a : +: : : different name.` : +| `execute_sql` | `Permission denied` | `Failed to execute SQL: The | +: : : service account does not have : +: : : permission to access the database : +: : : 'my-db'. Please grant 'Cloud SQL : +: : : Client' role to the service : +: : : account.` : +| `delete_resource` | `Internal error` | `Failed to delete resource: The | +: : : resource is currently in use by : +: : : another operation. Please try : +: : : again in a few minutes.` : + +## Security Best Practices + +This section covers security best practices for creating MCP tools. Authoring a +new MCP tool is an opportunity to make tools that are secure by default. + +### Secure by Default + +Our APIs **MUST** enforce secure configurations. + +* **Critical Requirement:** All "create" or "update" operations **MUST** apply + the most secure configuration by default. For example, any resource that + *can* be encrypted *must* be encrypted by default with Google-managed or + customer-managed encryption keys. Any resource that *can* be exposed to the + public internet *must* default to private. If defaulting to private creates + significant user friction (e.g., forcing VPC setup) or represents an + irreversible configuration, network configuration may be defined as a + required parameter, forcing the agent to request clarification if the user's + intent is unclear. + +### Tool Descriptions as Embedded Security Guidance + +The description field in a tool's manifest is the primary documentation for an +agent and **MUST** guide the reasoning engine toward correct actions. **DO NOT** +use field descriptions like "enable_policy" with a boolean value. + +| Field | Bad Description | Better Description | +| :-------------------- | :---------------- | :------------------------------- | +| `enable_public_ip` | Enables public IP | Enabling this flag makes the | +: : : resource accessible from the : +: : : public internet, which is a : +: : : security risk if not properly : +: : : managed. It is recommended to : +: : : keep resources private whenever : +: : : possible. : +| `deletion_protection` | Enables deletion | Prevents the resource from being | +: : protection : accidentally deleted. : +: : : Recommended for production : +: : : resources to prevent human : +: : : error. : + +### Handling Passwords and Secrets + +Tools **MUST NOT** surface passwords or credentials in clear-text requests or +responses. Authentication **MUST** be handled by the underlying transport (e.g., +service account credentials). Exposing plaintext password fields in API +interfaces for AI consumption is a security risk. + +* **Policy:** + [Read More](https://docs.google.com/document/d/1D6FGIJTfDlNTwPE00W0jStP6zB4Gbdyy0QSOnewKbPo) + {.external} +* Teams may allow agents to access references to secrets stored in Secret + Manager + +### Granular Protection for Sensitive Operations + +Many GCP APIs follow a pattern of bundling multiple update operations into a +single API method (e.g., `resource.update`), controlled by a single IAM +permission (e.g., `my-service.resources.update`). While efficient, this can +expose users to extra risk when authorizing AI agents. For example, a user might +want to grant an agent permission to update resource labels, but not to enable +public internet access for the resource, which is a high-risk action. If both +actions are covered by `my-service.resources.update`, the user cannot easily +grant permission for one but not the other. This prevents users from +establishing safe boundaries for their agents. + +To mitigate this, teams **SHOULD** consider introducing granular protections for +high-risk actions such as deleting data or exposing a resource to the public +internet. For example: + +* **Field-level IAM permissions:** For API methods that update multiple + fields, consider requiring more specific IAM permissions for changes to + high-risk fields. For example, enabling public internet access via a + `resource.update` method could require + `my-service.resources.enablePublicAccess` in addition to + `my-service.resources.update`. +* **Multi-party Approval:** For critical operations like deleting a production + resource, consider requiring a second independent confirmation from another + user before execution. + +### Horizontals and Compliance + +All MCP tools **MUST** comply with standard Google Cloud horizontal requirements +including the standard ones: + +* **VPC-SC:** Integration with VPC Service Controls. +* **IAM:** Use Cloud IAM for authorization and support IAM Deny policies. +* **Audit Logging:** Working `Gin` logging for all tools. +* **Access Transparency (AXT/AXA):** Integration for administrative access + logs. diff --git a/datasets/mcp_readability/tools_generator.yaml b/datasets/mcp_readability/tools_generator.yaml new file mode 100644 index 00000000..4520556e --- /dev/null +++ b/datasets/mcp_readability/tools_generator.yaml @@ -0,0 +1,7 @@ +# Generator that fetches/normalizes each endpoint's tools.yaml. +# Per-endpoint `tools_source` (in endpoints.yaml) overrides drive actual fetch +# behavior; these are just defaults for the generator itself. +generator: mcp_tools +timeout: 30 +headers: {} +# auth: google_credentials # default auth for endpoints that don't set their own diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py index 604fde52..bfc786f8 100644 --- a/evalbench/evalbench.py +++ b/evalbench/evalbench.py @@ -88,8 +88,13 @@ def eval(experiment_config: str): config, db_configs, model_config, setup_config = load_session_configs(session) logging.info("Loaded Configurations in %s", experiment_config) - # Load the dataset - dataset = load_dataset_from_json(session["dataset_config"], config) + # Load the dataset. Some orchestrators (e.g. mcp_readability) drive their + # work from the run config itself rather than a prompt dataset, so + # dataset_config is optional; flatten_dataset({}) yields []. + if session.get("dataset_config"): + dataset = load_dataset_from_json(session["dataset_config"], config) + else: + dataset = {} # Load the evaluator evaluator = get_orchestrator( config, db_configs, setup_config, report_progress=True @@ -137,6 +142,17 @@ def eval(experiment_config: str): ) summary_scores_df["job_id"] = job_id summary_scores_df["run_time"] = run_time + elif config.get("orchestrator") == "mcp_readability": + # This orchestrator writes its own compliance CSV in process() and + # intentionally returns no NL2SQL results/scores temp files. + logging.info( + f"Finished MCP readability run for config '{display_config}'." + ) + reporters = [] + config_df = None + results_df = None + scores_df = None + summary_scores_df = None else: logging.warning( f"There were no matching evals in run for config '{display_config}'. Returning empty set." diff --git a/evalbench/evaluator/__init__.py b/evalbench/evaluator/__init__.py index 4795183b..0374aaa6 100644 --- a/evalbench/evaluator/__init__.py +++ b/evalbench/evaluator/__init__.py @@ -8,6 +8,7 @@ from evaluator.dataengineeringagentorchestrator import ( DataEngineeringAgentOrchestrator, ) +from evaluator.mcp_readability import McpReadabilityOrchestrator import logging @@ -27,6 +28,10 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False): return DataEngineeringAgentOrchestrator( config, db_configs, setup_config, report_progress ) + elif orchestrator_type == "mcp_readability": + return McpReadabilityOrchestrator( + config, db_configs, setup_config, report_progress + ) else: return Orchestrator(config, db_configs, setup_config, report_progress) diff --git a/evalbench/evaluator/mcp_readability/__init__.py b/evalbench/evaluator/mcp_readability/__init__.py new file mode 100644 index 00000000..bff0cfa9 --- /dev/null +++ b/evalbench/evaluator/mcp_readability/__init__.py @@ -0,0 +1,3 @@ +from evaluator.mcp_readability.orchestrator import McpReadabilityOrchestrator + +__all__ = ["McpReadabilityOrchestrator"] diff --git a/evalbench/evaluator/mcp_readability/enums.py b/evalbench/evaluator/mcp_readability/enums.py new file mode 100644 index 00000000..a91873d7 --- /dev/null +++ b/evalbench/evaluator/mcp_readability/enums.py @@ -0,0 +1,89 @@ +"""Enums for the MCP style-guide readability/compliance check. + +These mirror proto enums used by the Data Cloud MCP readability tooling +(e.g. ``cloud.databases.mcp.readability.EndpointType``). They are defined as +plain Python enums here so the feature is self-contained; values can be aligned +to the generated proto later without touching call sites. All enums are written +to the results CSV using their ``.name``. +""" + +from enum import Enum + + +class EndpointType(Enum): + """How the MCP server is reached. + + Mirrors ``cloud.databases.mcp.readability.EndpointType``. + """ + + ENDPOINT_TYPE_UNSPECIFIED = 0 + REMOTE = 1 + LOCAL = 2 + + +class Environment(Enum): + """Release channel / deployment environment of an endpoint. + + Distinct from :class:`EndpointType`; used purely to slice / filter results + (e.g. only check ``PROD`` endpoints). + """ + + ENVIRONMENT_UNSPECIFIED = 0 + PROD = 1 + AUTOPUSH = 2 + STAGING = 3 + DEV = 4 + + +class CheckStatus(Enum): + """Whether the compliance check *ran* successfully (not its findings). + + Compliance results are captured separately via the p0/p1/p2 issue counts and + compliance_score; this status only reflects whether the eval completed. + + - ``SUCCESS``: the eval ran end-to-end. + - ``FETCH_ERROR``: failed to retrieve tools data from the endpoint. + - ``ANALYSIS_ERROR``: error during LLM analysis or result parsing. + - ``INTERNAL_ERROR``: other script/system error. + """ + + CHECK_STATUS_UNSPECIFIED = 0 + SUCCESS = 1 + FETCH_ERROR = 2 + ANALYSIS_ERROR = 3 + INTERNAL_ERROR = 4 + + +def _coerce(enum_cls, value, default): + """Coerce a config value (str / int / enum / None) into ``enum_cls``. + + Unknown values fall back to ``default`` rather than raising, so a typo in a + config file degrades gracefully instead of aborting the whole run. + """ + if value is None: + return default + if isinstance(value, enum_cls): + return value + if isinstance(value, int): + try: + return enum_cls(value) + except ValueError: + return default + name = str(value).strip().upper() + # Allow both short ("REMOTE") and fully-qualified ("ENDPOINT_TYPE_REMOTE") + # spellings, plus the raw member name. + members = enum_cls.__members__ + if name in members: + return members[name] + for member_name, member in members.items(): + if member_name.endswith("_" + name): + return member + return default + + +def coerce_endpoint_type(value) -> EndpointType: + return _coerce(EndpointType, value, EndpointType.ENDPOINT_TYPE_UNSPECIFIED) + + +def coerce_environment(value) -> Environment: + return _coerce(Environment, value, Environment.ENVIRONMENT_UNSPECIFIED) diff --git a/evalbench/evaluator/mcp_readability/exceptions.py b/evalbench/evaluator/mcp_readability/exceptions.py new file mode 100644 index 00000000..1aab8b55 --- /dev/null +++ b/evalbench/evaluator/mcp_readability/exceptions.py @@ -0,0 +1,71 @@ +"""Loading and matching of per-endpoint style-rule exceptions (waivers). + +An exceptions file lets an operator waive a specific style requirement for a +specific endpoint when it legitimately cannot comply. Waived rules are passed to +the scorer so they are excluded from the P0/P1/P2 counts and surfaced separately +in the feedback. + +File schema (see ``datasets/mcp_readability/exceptions.yaml``):: + + exceptions: + - endpoint_url: "https://.../mcp" # any of url / product_name / environment + rule_id: "P1-TOOL-DESC-LENGTH" + reason: "..." + - environment: AUTOPUSH # "*" or omitted field = match-all + rule_id: "P2-EXAMPLE-USAGE" + reason: "..." +""" + +import logging +from util.config import load_yaml_config + + +def load_exceptions(path: str) -> list[dict]: + """Load the exceptions list from a YAML file. Missing/empty -> [].""" + if not path: + return [] + parsed = load_yaml_config(path) + if not parsed: + return [] + exceptions = parsed.get("exceptions") or [] + if not isinstance(exceptions, list): + logging.warning( + "mcp_readability: 'exceptions' in %s is not a list; ignoring.", path + ) + return [] + return exceptions + + +def _matches(field_value, exception_value) -> bool: + """A matcher field matches when it is absent, '*', or equal (case-insensitive).""" + if exception_value is None or exception_value == "*": + return True + if field_value is None: + return False + return str(field_value).strip().lower() == str(exception_value).strip().lower() + + +def applicable_exceptions(endpoint: dict, all_exceptions: list[dict]) -> list[dict]: + """Return exceptions whose matchers all apply to ``endpoint``. + + Matchers considered: ``endpoint_url``, ``product_name``, ``environment``. + Each exception keeps its ``rule_id`` and ``reason`` for the scorer prompt. + """ + matched = [] + for exc in all_exceptions: + if not isinstance(exc, dict): + continue + if not exc.get("rule_id"): + continue + if ( + _matches(endpoint.get("endpoint_url"), exc.get("endpoint_url")) + and _matches(endpoint.get("product_name"), exc.get("product_name")) + and _matches(endpoint.get("environment"), exc.get("environment")) + ): + matched.append( + { + "rule_id": exc.get("rule_id"), + "reason": exc.get("reason", ""), + } + ) + return matched diff --git a/evalbench/evaluator/mcp_readability/orchestrator.py b/evalbench/evaluator/mcp_readability/orchestrator.py new file mode 100644 index 00000000..bdb066db --- /dev/null +++ b/evalbench/evaluator/mcp_readability/orchestrator.py @@ -0,0 +1,334 @@ +"""Orchestrator for the MCP style-guide compliance (readability) check. + +Flow per endpoint: + 1. The ``mcp_tools`` *generator* fetches/normalizes the endpoint's tools.yaml. + 2. Token usage is estimated against the endpoint's token budget. + 3. Applicable exceptions + any previous feedback are gathered. + 4. The ``mcp_style_compliance`` *scorer* (LLM) evaluates tools.yaml vs the + style guide and returns issue counts / score / feedback. + 5. A result row is assembled with the full metric schema. + +Results are written to a dedicated compliance CSV. This orchestrator does not +use the NL2SQL dataset/scores pipeline, so :meth:`process` returns ``None`` for +the results/scores temp files, which makes the standard EvalBench reporting path +skip cleanly. +""" + +import concurrent.futures +import datetime +import json +import logging +import threading + +import pandas as pd + +from evaluator.orchestrator import Orchestrator +from generators.models import get_generator +from scorers.mcp_style_compliance import McpStyleComplianceScorer +from reporting.mcp_readability_csv import COLUMNS as mcp_csv_columns +from reporting.mcp_readability_csv import write_compliance_csv +from util.config import load_yaml_config + +from evaluator.mcp_readability.enums import ( + CheckStatus, + coerce_endpoint_type, + coerce_environment, +) +from evaluator.mcp_readability import exceptions as exceptions_mod +from evaluator.mcp_readability.token_estimator import ( + token_budget_used_percent, +) + + +class McpReadabilityOrchestrator(Orchestrator): + """Checks a list of MCP endpoints against the Data Cloud MCP style guide.""" + + def __init__(self, config, db_configs, setup_config, report_progress=False): + super().__init__(config, db_configs, setup_config, report_progress) + + runner_config = config.get("runners", {}) or {} + self.endpoint_runners = runner_config.get("endpoint_runners", 4) + + # Load endpoints + shared defaults. + endpoints_config = config.get("endpoints_config") + if not endpoints_config: + raise ValueError("mcp_readability requires 'endpoints_config'") + parsed = load_yaml_config(endpoints_config) or {} + self.defaults = parsed.get("defaults") or {} + self.endpoints = parsed.get("endpoints") or [] + if not self.endpoints: + logging.warning( + "mcp_readability: no endpoints found in %s", endpoints_config + ) + + # Style guide text (fed verbatim into the scorer prompt). + self.style_guide = self._read_text(config.get("style_guide")) + + # Exceptions (waivers). + self.all_exceptions = exceptions_mod.load_exceptions( + config.get("exceptions_config") + ) + + # Default token budget + environment filter. + self.default_token_budget = config.get("token_budget", 0) or 0 + env_filter = config.get("environments") or [] + self.environment_filter = {str(e).strip().upper() for e in env_filter} + + # Previous feedback for run-to-run consistency. + self.previous_feedback = self._load_previous_feedback( + config.get("previous_results_csv") + ) + + # Shared model registry for get_generator (lock + cache), as used by + # the standard orchestrators. + self.global_models = { + "lock": threading.Lock(), + "registered_models": {}, + } + + # Tools generator (the "system under test" fetcher). + tools_generator_config = config.get("tools_generator_config") + if not tools_generator_config: + raise ValueError("mcp_readability requires 'tools_generator_config'") + self.tools_generator = get_generator( + self.global_models, tools_generator_config + ) + + # Compliance scorer (LLM). + scorer_config = (config.get("scorers") or {}).get( + "mcp_style_compliance" + ) or {} + self.scorer = McpStyleComplianceScorer(scorer_config, self.global_models) + + self.rows = [] + + # ------------------------------------------------------------------ + # Public lifecycle + # ------------------------------------------------------------------ + def evaluate(self, dataset=None): + """Run the compliance check for every (filtered) endpoint.""" + endpoints = self._filtered_endpoints() + if not endpoints: + logging.warning("mcp_readability: no endpoints to check after filtering.") + self.rows = [] + return + + workers = max(1, int(self.endpoint_runners)) + with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool: + futures = { + pool.submit(self._check_endpoint, ep): ep for ep in endpoints + } + rows = [] + for future in concurrent.futures.as_completed(futures): + rows.append(future.result()) + + # Keep deterministic ordering (by product_name then url). + rows.sort(key=lambda r: (r.get("product_name", ""), r.get("endpoint_url", ""))) + self.rows = rows + if self.report_progress: + logging.info("mcp_readability: checked %d endpoints.", len(rows)) + + def process(self): + reporting = self.config.get("reporting") or {} + output_directory = (reporting.get("csv") or {}).get( + "output_directory", "results" + ) + try: + write_compliance_csv(self.rows, output_directory, self.job_id) + except Exception: + logging.exception("mcp_readability: failed to write compliance CSV") + + # Optionally append the same rows to the existing eval BigQuery table. + if reporting.get("bigquery"): + self._write_bigquery(reporting["bigquery"]) + + # Return Nones so the standard NL2SQL reporting path is skipped. + return (self.job_id, self.run_time, None, None, None) + + def _write_bigquery(self, bq_config): + """Append compliance rows to the standard eval BigQuery table. + + Reuses the shared :class:`BigQueryReporter` so rows land in the existing + ``.evalbench.results`` table (schema auto-evolves to add the + compliance columns); no dedicated dataset/table is created. + """ + if not self.rows: + return + try: + from reporting.bqstore import BigQueryReporter + from reporting.report import STORETYPE + + df = pd.DataFrame(self.rows, columns=mcp_csv_columns) + df["job_id"] = self.job_id + reporter = BigQueryReporter(bq_config, self.job_id, self.run_time) + reporter.store(df, STORETYPE.EVALS) + except Exception: + logging.exception("mcp_readability: failed to write BigQuery results") + + # ------------------------------------------------------------------ + # Per-endpoint work + # ------------------------------------------------------------------ + def _check_endpoint(self, endpoint: dict) -> dict: + product_name = endpoint.get("product_name", "") + endpoint_url = endpoint.get("endpoint_url", "") + endpoint_type = coerce_endpoint_type(self._ep_value(endpoint, "endpoint_type")) + environment = coerce_environment(self._ep_value(endpoint, "environment")) + token_budget = self._ep_value( + endpoint, "token_budget", self.default_token_budget + ) + + row = self._base_row( + product_name, endpoint_url, endpoint_type, environment + ) + try: + # 1. Fetch tools + render man-page markup. Failure here is a + # FETCH_ERROR. + try: + tools, man_page = self.tools_generator.fetch_tools( + endpoint, self.defaults + ) + except Exception as e: + logging.exception( + "mcp_readability: fetch failed for %s (%s)", + product_name, + endpoint_url, + ) + row["check_status"] = CheckStatus.FETCH_ERROR.name + row["error_message"] = f"{type(e).__name__}: {e}" + return row + + # 2. Token estimate / tool count (recorded even if analysis fails). + # Metrics come from the man-page formatter; the percentage is + # computed against the configured per-endpoint budget. + est_tokens = man_page.estimated_tokens + row["total_tools"] = man_page.total_tools + row["estimated_tokens"] = est_tokens + row["token_budget_used_percent"] = token_budget_used_percent( + est_tokens, token_budget + ) + + # 3. Exceptions + previous feedback for this endpoint. + applicable = exceptions_mod.applicable_exceptions( + endpoint, self.all_exceptions + ) + prev = self.previous_feedback.get(endpoint_url) + + # 4. Compliance evaluation. Failure here is an ANALYSIS_ERROR. + try: + feedback = self.scorer.evaluate( + tools_markup=man_page.man_page, + style_guide=self.style_guide, + product_name=product_name, + exceptions=applicable, + previous_feedback=prev, + ) + except Exception as e: + logging.exception( + "mcp_readability: analysis failed for %s (%s)", + product_name, + endpoint_url, + ) + row["check_status"] = CheckStatus.ANALYSIS_ERROR.name + row["error_message"] = f"{type(e).__name__}: {e}" + return row + + # 5. Success: the eval ran. Compliance findings are recorded as + # metrics independent of this status. + row.update( + { + "check_status": CheckStatus.SUCCESS.name, + "p0_issues": int(feedback.get("p0_issues", 0)), + "p1_issues": int(feedback.get("p1_issues", 0)), + "p2_issues": int(feedback.get("p2_issues", 0)), + "compliance_score": int(feedback.get("compliance_score", 0)), + "llm_feedback_json": json.dumps(feedback), + "llm_feedback_html": self.scorer.to_html(feedback), + "error_message": "", + } + ) + except Exception as e: + logging.exception( + "mcp_readability: internal error for %s (%s)", + product_name, + endpoint_url, + ) + row["check_status"] = CheckStatus.INTERNAL_ERROR.name + row["error_message"] = f"{type(e).__name__}: {e}" + return row + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + def _ep_value(self, endpoint: dict, key: str, fallback=None): + """Endpoint value, falling back to the shared ``defaults`` then a literal.""" + if key in endpoint: + return endpoint.get(key) + if key in self.defaults: + return self.defaults.get(key) + return fallback + + def _filtered_endpoints(self) -> list[dict]: + if not self.environment_filter: + return list(self.endpoints) + kept = [] + for ep in self.endpoints: + env = coerce_environment(self._ep_value(ep, "environment")).name + if env in self.environment_filter: + kept.append(ep) + return kept + + @staticmethod + def _base_row(product_name, endpoint_url, endpoint_type, environment) -> dict: + return { + "product_name": product_name, + "endpoint_url": endpoint_url, + "endpoint_type": endpoint_type.name, + "environment": environment.name, + "check_timestamp": datetime.datetime.now().isoformat(), + "check_status": "", + "p0_issues": 0, + "p1_issues": 0, + "p2_issues": 0, + "total_tools": 0, + "estimated_tokens": 0, + "token_budget_used_percent": 0.0, + "compliance_score": 0, + "llm_feedback_json": "", + "llm_feedback_html": "", + "error_message": "", + } + + @staticmethod + def _read_text(path) -> str: + if not path: + return "" + try: + with open(path, "r", encoding="utf-8") as f: + return f.read() + except OSError as e: + logging.warning("mcp_readability: could not read style guide %s: %s", path, e) + return "" + + @staticmethod + def _load_previous_feedback(csv_path) -> dict: + """Map endpoint_url -> prior llm_feedback_json from a previous run CSV.""" + if not csv_path: + return {} + try: + df = pd.read_csv(csv_path) + except Exception as e: + logging.warning( + "mcp_readability: could not read previous_results_csv %s: %s", + csv_path, + e, + ) + return {} + if "endpoint_url" not in df.columns or "llm_feedback_json" not in df.columns: + return {} + mapping = {} + for _, r in df.iterrows(): + url = r.get("endpoint_url") + fb = r.get("llm_feedback_json") + if isinstance(url, str) and isinstance(fb, str) and fb: + mapping[url] = fb + return mapping diff --git a/evalbench/evaluator/mcp_readability/token_estimator.py b/evalbench/evaluator/mcp_readability/token_estimator.py new file mode 100644 index 00000000..60acaf7b --- /dev/null +++ b/evalbench/evaluator/mcp_readability/token_estimator.py @@ -0,0 +1,33 @@ +"""Token estimation for an MCP tools spec. + +Uses a lightweight heuristic so the feature carries no extra dependency. The +estimate intentionally errs slightly high (tool schemas are JSON-ish and +tokenize denser than prose). If exact counts are needed, swap ``estimate_tokens`` +for ``genai_client.models.count_tokens(...)`` — the call sites only depend on the +function signature. +""" + +import math + + +def estimate_tokens(text: str) -> int: + """Estimate the number of LLM tokens in ``text``. + + Heuristic: roughly 4 characters per token, with a small floor so a non-empty + spec never estimates to zero. + """ + if not text: + return 0 + # ~4 chars/token is the common rule of thumb for English + JSON. + return max(1, math.ceil(len(text) / 4)) + + +def token_budget_used_percent(estimated_tokens: int, token_budget: int) -> float: + """Percent of the token budget consumed, rounded to 2 decimals. + + Returns ``0.0`` when the budget is missing or non-positive to avoid + divide-by-zero blowing up an otherwise-successful check. + """ + if not token_budget or token_budget <= 0: + return 0.0 + return round(100.0 * estimated_tokens / token_budget, 2) diff --git a/evalbench/generators/models/__init__.py b/evalbench/generators/models/__init__.py index 0e7d187a..fa6a1459 100644 --- a/evalbench/generators/models/__init__.py +++ b/evalbench/generators/models/__init__.py @@ -12,6 +12,7 @@ from .codex_cli import CodexCliGenerator from .gcp_data_engineering_agent import DataEngineeringAgentGenerator from .agy_cli import AgyCliGenerator +from .mcp_tools import McpToolsGenerator from util.config import load_yaml_config @@ -36,6 +37,7 @@ def get_generator(global_models, model_config_path: str, db: DB = None): "codex_cli": lambda: CodexCliGenerator(config), "data_engineering_agent": lambda: DataEngineeringAgentGenerator(config), "agy_cli": lambda: AgyCliGenerator(config), + "mcp_tools": lambda: McpToolsGenerator(config), } generator = config["generator"] if generator not in generators: diff --git a/evalbench/generators/models/mcp_tool_formatter.py b/evalbench/generators/models/mcp_tool_formatter.py new file mode 100644 index 00000000..885d2a0e --- /dev/null +++ b/evalbench/generators/models/mcp_tool_formatter.py @@ -0,0 +1,341 @@ +"""Formatting utilities for MCP tools (man-page style markup). + +Renders a sequence of MCP tools into a human-readable "man page" that is fed to +the style-guide compliance scorer. This mirrors the production-tested +``tool_formatter`` used by the Data Cloud MCP readability tooling: it resolves +JSON-Schema ``$ref``/``$defs``, flattens nested parameters into dotted paths, +marks REQUIRED/OPTIONAL, and computes token/budget metrics in the same pass. + +Beyond the original, this version also renders structured ``enum`` and +``default`` values so style rules that key off them (e.g. "Use Enums") are +judgeable from the markup. +""" + +from collections.abc import Mapping, Sequence, Set +import dataclasses +import json +import textwrap +from typing import Any + +from mcp import types as mcp_types + + +# Default token budget for a single tool manifest. Overridable per call so the +# orchestrator can pass the configured per-endpoint budget. +_DEFAULT_TOKEN_BUDGET = 200_000 + + +@dataclasses.dataclass(frozen=True) +class ManPageResult: + """Result of formatting tools to a man-page style string. + + Attributes: + man_page: The formatted man-page style string representing the tools. + total_tools: The total number of tools. + estimated_tokens: The total estimated number of tokens consumed by the + tools. + budget_pct: The percentage of the token budget used by the tools. + """ + + man_page: str + total_tools: int + estimated_tokens: int + budget_pct: float + + +def _estimate_tokens(tool: mcp_types.Tool) -> int: + """Estimates the number of tokens a tool consumes.""" + # A rough estimate is length of JSON representation / 4. + tool_dict = { + "name": tool.name, + "description": tool.description, + "inputSchema": tool.inputSchema, + } + json_str = json.dumps(tool_dict, default=str) + return max(1, len(json_str) // 4) + + +def _wrap_text(text: str, indent: str) -> list[str]: + """Wraps text preserving empty lines and applying indentation.""" + wrapped = [] + for line in (text or "").splitlines(): + line = line.strip() + if not line: + if wrapped: # Don't add leading empty lines. + wrapped.append("") + else: + wrapped.extend( + textwrap.wrap( + line, width=80, initial_indent=indent, subsequent_indent=indent + ) + ) + while wrapped and not wrapped[-1]: + wrapped.pop() + return wrapped + + +def _value_indent(level: int) -> str: + """Indent used for sub-lines (description/enum/default) at a given level.""" + if level == 0: + return " " + if level == 1: + return " " + return " " + " " * level + " " + + +def _format_enum_default(schema: Mapping[str, Any], level: int) -> list[str]: + """Render ``enum`` / ``default`` values for a property, if present.""" + lines: list[str] = [] + indent = _value_indent(level) + if "enum" in schema and isinstance(schema["enum"], (list, tuple)): + values = ", ".join(json.dumps(v, default=str) for v in schema["enum"]) + lines.extend(_wrap_text(f"enum: [{values}]", indent)) + if "default" in schema: + lines.extend( + _wrap_text(f"default: {json.dumps(schema['default'], default=str)}", indent) + ) + return lines + + +def _resolve_ref( + schema: Mapping[str, Any], defs: Mapping[str, Any] +) -> tuple[Mapping[str, Any], str | None]: + """Resolves JSON schema references. + + Args: + schema: The current JSON schema dictionary. + defs: A dictionary of schema definitions where '$ref' can be resolved. + + Returns: + A tuple (resolved_schema, ref_name), where `resolved_schema` is the + resolved schema dictionary (or the original schema if no '$ref' is present + or resolvable), and `ref_name` is the name of the resolved reference (e.g., + the part after '#/$defs/'), or None if no reference was resolved. + """ + if "$ref" in schema: + ref_path = schema["$ref"] + if ref_path.startswith("#/$defs/"): + ref_name = ref_path.split("/")[-1] + if ref_name in defs: + return defs[ref_name], ref_name + return schema, None + + +def _format_type(schema: Mapping[str, Any]) -> str: + """Formats a JSON schema type representation.""" + schema_type = schema.get("type", "any") + if isinstance(schema_type, list): + return " | ".join(str(t) for t in schema_type) + return str(schema_type) + + +def _format_header( + prop_name: str, + prop_type: str, + req_str: str, + desc: str, + full_path: str, + level: int, +) -> list[str]: + """Formats a property's header line and description based on nesting level.""" + lines = [] + if level == 0: + lines.append(f" {prop_name} ({prop_type}) [{req_str}]") + if desc: + lines.extend(_wrap_text(desc, " ")) + elif level == 1: + lines.append(f" -> {full_path} ({prop_type}) [{req_str}]") + if desc: + lines.extend(_wrap_text(desc, " ")) + else: + indent = " " + " " * level + lines.append(f"{indent}* {prop_name} ({prop_type})") + if desc: + lines.extend(_wrap_text(desc, indent + " ")) + return lines + + +def _format_recursive(level: int) -> list[str]: + """Formats a recursive reference notice for array items at the given level.""" + desc = "[Recursive Reference]" + if level == 0: + return _wrap_text(desc, " ") + if level == 1: + return _wrap_text(desc, " ") + indent = " " + " " * (level + 1) + return _wrap_text(desc, indent + " ") + + +def _format_obj( + schema: Mapping[str, Any], + defs: Mapping[str, Any], + full_path: str, + level: int, + visited: Set[str], +) -> list[str]: + """Formats an object property by recursively traversing its children.""" + lines = _format_props( + schema, + defs=defs, + path_prefix=full_path, + level=level + 1, + visited=visited, + ) + if lines and lines[-1]: + lines.append("") + return lines + + +def _format_array( + schema: Mapping[str, Any], + defs: Mapping[str, Any], + full_path: str, + level: int, + visited: Set[str], +) -> list[str]: + """Formats an array property by traversing its items schema.""" + # Resolve the items schema to check if it points to a reference definition. + schema, ref = _resolve_ref(schema["items"], defs) + is_recursive = False + if ref: + # Check if this reference name has already been encountered higher up in + # the current traversal path to prevent infinite recursion. + if ref in visited: + is_recursive = True + else: + visited = visited | {ref} + + if not is_recursive and schema.get("type") == "object": + return _format_props( + schema, + defs=defs, + path_prefix=full_path + "[]", + level=level + 1, + visited=visited, + ) + if is_recursive: + return _format_recursive(level) + return [] + + +def _format_props( + schema: Mapping[str, Any], + *, + defs: Mapping[str, Any] | None = None, + path_prefix: str = "", + level: int = 0, + visited: Set[str] | None = None, +) -> list[str]: + """Recursively formats JSON schema properties into man-page parameter lines.""" + lines = [] + if defs is None: + defs = schema.get("$defs", {}) or schema.get("definitions", {}) + visited = visited if visited is not None else set() + + schema, _ = _resolve_ref(schema, defs) + properties = schema.get("properties", {}) + required_fields = set(schema.get("required", [])) + + for prop_name, prop_schema in properties.items(): + prop_schema, ref_name = _resolve_ref(prop_schema, defs) + is_recursive = ref_name in visited if ref_name else False + prop_visited = ( + visited | {ref_name} if ref_name and not is_recursive else visited + ) + + is_required = prop_name in required_fields + req_str = "REQUIRED" if is_required else "OPTIONAL" + prop_type = _format_type(prop_schema) + + desc = prop_schema.get("description", "").strip() + if is_recursive: + desc = f"[Recursive Reference] {desc}".strip() + + full_path = f"{path_prefix}.{prop_name}" if path_prefix else prop_name + + lines.extend( + _format_header(prop_name, prop_type, req_str, desc, full_path, level) + ) + lines.extend(_format_enum_default(prop_schema, level)) + + if not is_recursive: + if prop_type == "object" and "properties" in prop_schema: + lines.extend( + _format_obj(prop_schema, defs, full_path, level, prop_visited) + ) + elif prop_type == "array" and "items" in prop_schema: + lines.extend( + _format_array(prop_schema, defs, full_path, level, prop_visited) + ) + + if level == 0: + if lines and lines[-1]: + lines.append("") + + return lines + + +def format_tools_to_man_page( + tools: Sequence[mcp_types.Tool], token_budget: int = _DEFAULT_TOKEN_BUDGET +) -> ManPageResult: + """Formats a sequence of MCP tools into a man-page style string. + + Args: + tools: Sequence of mcp.types.Tool objects. + token_budget: Token budget used to compute the budget percentage. + + Returns: + A ManPageResult with the formatted markup, tool count, estimated tokens, + and percentage of the token budget used. + """ + budget = token_budget if token_budget and token_budget > 0 else _DEFAULT_TOKEN_BUDGET + + if not tools: + return ManPageResult( + man_page="No tools available.", + total_tools=0, + estimated_tokens=0, + budget_pct=0.0, + ) + + lines = [] + + tool_tokens = {tool.name: _estimate_tokens(tool) for tool in tools} + estimated_tokens = sum(tool_tokens.values()) + total_tools = len(tools) + budget_pct = (estimated_tokens / budget) * 100 + + lines.append("=" * 80) + lines.append("SUMMARY") + lines.append("=" * 80) + lines.append(f"Total Tools: {total_tools}") + lines.append(f"Total Estimated Tokens: {estimated_tokens}") + lines.append(f"Budget Used: {budget_pct:.2f}% of {budget} tokens") + lines.append("\nTools:") + for name, tokens in tool_tokens.items(): + pct_of_window = (tokens / budget) * 100 + lines.append(f" - {name} ({tokens} tokens, {pct_of_window:.2f}% of window)") + lines.append("") + + for tool in tools: + lines.append("=" * 80) + lines.append(f"TOOL: {tool.name}") + lines.append("=" * 80) + lines.append("DESCRIPTION:") + + desc = tool.description or "No description provided." + lines.extend(_wrap_text(desc, " ")) + + lines.append("\nPARAMETERS:") + if tool.inputSchema and tool.inputSchema.get("properties"): + schema_lines = _format_props(tool.inputSchema) + lines.extend(schema_lines) + else: + lines.append(" None\n") + + return ManPageResult( + man_page="\n".join(lines).strip(), + total_tools=total_tools, + estimated_tokens=estimated_tokens, + budget_pct=budget_pct, + ) diff --git a/evalbench/generators/models/mcp_tools.py b/evalbench/generators/models/mcp_tools.py new file mode 100644 index 00000000..ae419fb1 --- /dev/null +++ b/evalbench/generators/models/mcp_tools.py @@ -0,0 +1,229 @@ +"""Generator that fetches an MCP endpoint's tools and renders man-page markup. + +This plays the EvalBench *generator* role for the MCP readability check: instead +of asking a model to produce an artifact, it fetches the tool listing from the +"system under test" (an MCP endpoint) and renders the production-tested man-page +markup that the compliance *scorer* then evaluates against the style guide. + +The source is pluggable per endpoint via ``tools_source.type``: + - ``mcp``: live MCP ``tools/list`` over Streamable HTTP using the official + ``mcp`` Python SDK. Requires network access (and usually auth). + - ``file``: read a local YAML/JSON tools spec (dependency-free; great for tests + and for offline / deterministic runs). + +Fetching, auth, and URL normalization mirror the tested Data Cloud MCP +readability tooling (``mcp_utils``): ADC bearer + ``x-goog-user-project`` headers +and a ``/mcp`` suffix convention. +""" + +import functools +import logging +import os + +import anyio +import httpx + +from mcp import types as mcp_types +from mcp.client import session as mcp_session +from mcp.client.streamable_http import streamable_http_client + +from .generator import QueryGenerator +from .mcp_tool_formatter import ManPageResult, format_tools_to_man_page +from util.config import load_yaml_config + + +class McpToolsError(Exception): + """Raised when a tools spec cannot be fetched or parsed.""" + + +class McpToolsGenerator(QueryGenerator): + """Fetches an MCP endpoint's tools and renders them as man-page markup.""" + + def __init__(self, querygenerator_config): + super().__init__(querygenerator_config) + self.name = "mcp_tools" + cfg = querygenerator_config or {} + self.timeout = cfg.get("timeout", 30) + self.default_headers = cfg.get("headers") or {} + # Optional default auth mode applied when an endpoint doesn't specify one. + self.default_auth = cfg.get("auth") + + # The abstract base requires generate_internal; the orchestrator calls + # fetch_tools directly, but we keep this so the class is a valid generator. + def generate_internal(self, prompt): + if isinstance(prompt, dict): + _, result = self.fetch_tools(prompt, {}) + return result.man_page + return "" + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + def fetch_tools(self, endpoint: dict, defaults: dict): + """Fetch tools for ``endpoint`` and render man-page markup. + + Returns ``(tools, man_page_result)`` where ``tools`` is a list of + ``mcp.types.Tool`` and ``man_page_result`` is a + :class:`ManPageResult` (markup + token/tool metrics). + """ + source = self._resolve_source(endpoint, defaults) + source_type = (source.get("type") or "mcp").lower() + + if source_type == "file": + tools = self._from_file(source) + elif source_type == "mcp": + tools = self._from_mcp(source, endpoint) + else: + raise McpToolsError(f"Unknown tools_source.type '{source_type}'") + + result = format_tools_to_man_page(tools) + return tools, result + + # ------------------------------------------------------------------ + # Source resolution / auth + # ------------------------------------------------------------------ + def _resolve_source(self, endpoint: dict, defaults: dict) -> dict: + """Merge defaults.tools_source with the endpoint's tools_source.""" + merged = dict((defaults or {}).get("tools_source") or {}) + merged.update(endpoint.get("tools_source") or {}) + return merged + + def _auth_headers(self, source: dict) -> dict: + """Build request headers: defaults + source + ADC bearer + project.""" + headers = dict(self.default_headers) + headers.update(source.get("headers") or {}) + project = source.get("project") + auth = source.get("auth", self.default_auth) + if auth == "google_credentials": + token, adc_project = self._google_credentials() + if token: + headers["Authorization"] = f"Bearer {token}" + if adc_project and "x-goog-user-project" not in headers: + headers["x-goog-user-project"] = adc_project + if project: + headers["x-goog-user-project"] = project + return headers + + @staticmethod + def _google_credentials(): + """Best-effort ADC ``(token, project)``; returns ``(None, None)`` if N/A.""" + try: + import google.auth + from google.auth.transport.requests import Request + + creds, project = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) + creds.refresh(Request()) + return creds.token, project + except Exception as e: # pragma: no cover - depends on environment + logging.warning("mcp_tools: could not obtain Google credentials: %s", e) + return None, None + + @staticmethod + def sanitize_url(url: str) -> str: + """Ensure the URL has a scheme and ends with ``/mcp``. + + ``/mcp`` is the conventional path where ESF exposes the MCP protocol. + """ + stripped_url = (url or "").strip() + if stripped_url.startswith(("http://", "https://")): + url_with_scheme = stripped_url + elif "localhost" in stripped_url: + url_with_scheme = f"http://{stripped_url}" + else: + url_with_scheme = f"https://{stripped_url}" + + rstripped_url = url_with_scheme.rstrip("/") + if not rstripped_url.endswith("/mcp"): + return f"{rstripped_url}/mcp" + return rstripped_url + + # ------------------------------------------------------------------ + # mcp source (official SDK over Streamable HTTP) + # ------------------------------------------------------------------ + def _from_mcp(self, source: dict, endpoint: dict) -> list[mcp_types.Tool]: + raw_url = source.get("url") or endpoint.get("endpoint_url") + if not raw_url: + raise McpToolsError("tools_source.type 'mcp' requires an endpoint URL") + url = self.sanitize_url(raw_url) + headers = self._auth_headers(source) + try: + return anyio.run( + functools.partial(self._async_fetch_tools, url, headers) + ) + except McpToolsError: + raise + except Exception as e: + raise McpToolsError(f"Failed to fetch tools from {url}: {e}") from e + + async def _async_fetch_tools(self, url: str, headers: dict) -> list[mcp_types.Tool]: + """Connect to the MCP server over Streamable HTTP and list its tools.""" + timeout = httpx.Timeout(self.timeout, read=300.0) + async with ( + httpx.AsyncClient(headers=headers, timeout=timeout) as http_client, + streamable_http_client(url, http_client=http_client) as streams, + ): + reader, writer, _ = streams + async with mcp_session.ClientSession(reader, writer) as session: + await session.initialize() + tools_response = await session.list_tools() + return list(tools_response.tools) + + # ------------------------------------------------------------------ + # file source + # ------------------------------------------------------------------ + def _from_file(self, source: dict) -> list[mcp_types.Tool]: + path = source.get("path") + if not path: + raise McpToolsError("tools_source.type 'file' requires a 'path'") + if not os.path.exists(path): + raise McpToolsError(f"Tools file not found: {path}") + try: + parsed = load_yaml_config(path) # yaml.safe_load handles JSON too + except Exception as e: + raise McpToolsError(f"Could not parse tools file {path}: {e}") from e + if not parsed: + raise McpToolsError(f"Empty or unreadable tools file: {path}") + return self._to_tools(parsed) + + @staticmethod + def _to_tools(raw) -> list[mcp_types.Tool]: + """Build ``mcp.types.Tool`` objects from a parsed tools spec.""" + if isinstance(raw, dict): + tools = raw.get("tools") + if tools is None and "result" in raw: + tools = (raw.get("result") or {}).get("tools") + elif isinstance(raw, list): + tools = raw + else: + tools = None + + if not tools: + raise McpToolsError("No tools found in spec") + + built: list[mcp_types.Tool] = [] + for t in tools: + if not isinstance(t, dict): + continue + schema = ( + t.get("inputSchema") + or t.get("input_schema") + or t.get("parameters") + or {} + ) + if not isinstance(schema, dict): + schema = {} + try: + built.append( + mcp_types.Tool( + name=t.get("name", ""), + description=t.get("description") or t.get("title") or "", + inputSchema=schema, + ) + ) + except Exception as e: + raise McpToolsError(f"Invalid tool entry {t!r}: {e}") from e + if not built: + raise McpToolsError("Tools list contained no valid tool entries") + return built diff --git a/evalbench/reporting/mcp_readability_csv.py b/evalbench/reporting/mcp_readability_csv.py new file mode 100644 index 00000000..952ae8d3 --- /dev/null +++ b/evalbench/reporting/mcp_readability_csv.py @@ -0,0 +1,98 @@ +"""CSV writer for MCP style-guide compliance results. + +Writes one row per checked endpoint with the fixed metric schema requested for +the Data Cloud MCP readability check, plus an ``environment`` column (after +``endpoint_type``) so results can be sliced by release channel. + +Column order and types are stable: + product_name (str), endpoint_url (str), endpoint_type (str/enum name), + environment (str/enum name), check_timestamp (ISO-8601 str), + check_status (str/enum name), p0_issues (int), p1_issues (int), + p2_issues (int), total_tools (int), estimated_tokens (int), + token_budget_used_percent (float), compliance_score (int), + llm_feedback_json (str), llm_feedback_html (str), error_message (str) +""" + +import logging +import os +import sys + +import pandas as pd + + +COLUMNS = [ + "product_name", + "endpoint_url", + "endpoint_type", + "environment", + "check_timestamp", + "check_status", + "p0_issues", + "p1_issues", + "p2_issues", + "total_tools", + "estimated_tokens", + "token_budget_used_percent", + "compliance_score", + "llm_feedback_json", + "llm_feedback_html", + "error_message", +] + +_INT_COLUMNS = [ + "p0_issues", + "p1_issues", + "p2_issues", + "total_tools", + "estimated_tokens", + "compliance_score", +] +_FLOAT_COLUMNS = ["token_budget_used_percent"] +_STRING_COLUMNS = [ + "product_name", + "endpoint_url", + "endpoint_type", + "environment", + "check_timestamp", + "check_status", + "llm_feedback_json", + "llm_feedback_html", + "error_message", +] + + +def _output_dir(output_directory: str) -> str: + # Match reporting/csv.py: when run as the gRPC service, write to the shared + # volume rather than the configured local directory. + if sys.argv and sys.argv[0].endswith("eval_server.py"): + return "/tmp_session_files/results" + return output_directory or "results" + + +def write_compliance_csv(rows: list[dict], output_directory: str, job_id: str) -> str: + """Write compliance ``rows`` to ``//mcp_readability_compliance.csv``. + + Returns the path written. + """ + out_dir = _output_dir(output_directory) + directory = os.path.join(out_dir, job_id) + os.makedirs(directory, exist_ok=True) + file_path = os.path.join(directory, "mcp_readability_compliance.csv") + + df = pd.DataFrame(rows, columns=COLUMNS) + + # Enforce types so the CSV schema is stable regardless of row contents. + for col in _INT_COLUMNS: + df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype("int64") + for col in _FLOAT_COLUMNS: + df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0).astype("float64") + for col in _STRING_COLUMNS: + df[col] = df[col].fillna("").astype("string") + + df.to_csv(file_path, index=False) + logging.info( + "Wrote MCP readability compliance CSV (%d rows) to %s", + len(df), + file_path, + ) + return file_path diff --git a/evalbench/scorers/mcp_style_compliance.py b/evalbench/scorers/mcp_style_compliance.py new file mode 100644 index 00000000..e5c93cfc --- /dev/null +++ b/evalbench/scorers/mcp_style_compliance.py @@ -0,0 +1,360 @@ +"""LLM-backed scorer that evaluates an MCP tool manifest against a style guide. + +Follows the same shape as :class:`scorers.llmrater.LLMRater`: constructed with +``(config, global_models)`` and holds an LLM obtained via +``generators.models.get_generator``. The orchestrator calls :meth:`evaluate` +per endpoint with the tool man-page markup. + +The model reviews the tools from an LLM-agent-consumption perspective and returns +a strict JSON object describing P0/P1/P2 findings, an overall compliance score, +and any rules waived via the exceptions file. We parse and normalize that JSON +defensively so a slightly malformed response degrades to an ERROR row rather than +crashing the run. + +When a previous run's feedback is available, a dedicated second LLM pass +reconciles the fresh review with it (preserving historical severity, dropping +fixed issues, adding new ones) so results stay consistent run-to-run. +""" + +import html +import json +import logging +import re + +from generators.models import get_generator + + +# Shared JSON output contract appended to both prompts (escaped for str.format). +_OUTPUT_SCHEMA = """### OUTPUT +Return ONLY a JSON object (no markdown, no prose) with exactly this shape: +{{ + "compliance_score": , + "p0_issues": , + "p1_issues": , + "p2_issues": , + "findings": [ + {{"severity": "P0|P1|P2", "rule_id": "", "tool": "", + "message": "", "suggestion": ""}} + ], + "waived": [ + {{"rule_id": "", "reason": "", "would_have_violated": }} + ], + "summary": "" +}} +The counts p0_issues/p1_issues/p2_issues MUST equal the number of findings of +each severity.""" + + +PROMPT_TEMPLATE = ( + """You are an expert on the Data Cloud MCP / OneMCP standard and a pragmatic +API Developer Experience reviewer. Evaluate the MCP server's tool definitions +(shown below as a man page) against the STYLE GUIDE and report every violation. + +Evaluate from the perspective of an LLM agent that must call these tools: +- Will the model understand the terminology and the tool / parameter names? +- Are the parameters too complex, too numerous, or under-described? +- Is the agent forced to act like a computer -- e.g. formatting complex strings, + generating UUIDs, or calculating timestamps -- instead of expressing intent? +Adopt a consultative, pragmatic tone (e.g. "Consider...", "Evaluate whether..."). +Do not be overly pedantic about minor wording when larger architectural blockers +exist. + +Severity levels: +- P0: critical violation (must fix; blocks compliance). +- P1: major violation (should fix). +- P2: minor / stylistic violation (nice to fix). + +How to assign severity and rule_id: +- The STYLE GUIDE annotates each rule with its priority in an HTML comment next + to the section heading, e.g. `### Tool Names ` or + `#### Safe Pagination `. Use that annotated priority as + the severity of any violation of that rule (p0 -> P0, p1 -> P1, p2 -> P2). +- A heading may specify different priorities for different aspects, e.g. + ``. Honor that + split when classifying the specific violation. +- Use the section heading text as the `rule_id` (e.g. "Tool Names", + "Use Human-Readable Time and Durations", "Concise Descriptions"). +- Only report violations of rules that actually apply to the given tools. Rules + about platform/registration/dashboards that cannot be judged from the tool + schema alone should not be flagged as violations. + +Avoid duplication (global vs per-tool): +- If an issue is global or repeats across many tools (e.g. a convention violated + everywhere), report it ONCE with "tool" set to "" (empty string). Do NOT emit + the same global issue once per tool. +- Report tool-specific issues with "tool" set to that tool's name. + +### STYLE GUIDE +{style_guide} + +### PRODUCT +{product_name} + +### TOOLS (man page) +{tools_markup} + +### EXCEPTIONS (waived rules — DO NOT count these as issues) +The following rules have been explicitly waived for this endpoint. Do not include +them in p0_issues/p1_issues/p2_issues or in "findings". Instead list them under +"waived" with their reason. If a waived rule would otherwise have been violated, +note that in the waived entry. +{exceptions} + +""" + + _OUTPUT_SCHEMA +) + + +# Second pass: reconcile the just-produced review with the previous run's review +# so findings stay consistent over time (preserve historical severity, drop +# fixed issues, add genuinely new ones). Both reviews share the schema above. +CONSISTENCY_PROMPT_TEMPLATE = ( + """You are maintaining run-to-run consistency for the Data Cloud MCP +style-guide review of a single endpoint. You are given the STYLE GUIDE, the +current tool manifest (man page), the review just produced for it ("LATEST +REVIEW"), and the review from the previous run ("PREVIOUS REVIEW"). Both reviews +use the JSON schema described under OUTPUT. + +Produce a single reconciled review: +1. For each finding in PREVIOUS REVIEW, check it against the current manifest. If + the violation STILL EXISTS, keep it and PRESERVE its original severity and + wording. Drop findings that are now fixed. +2. Add genuinely new violations from LATEST REVIEW that were missed previously. +3. If a finding appears in both, prefer the PREVIOUS REVIEW's wording and + severity, unless LATEST REVIEW is significantly more precise about the same + issue. +4. Do not downgrade or upgrade a still-valid finding's severity unless the STYLE + GUIDE's annotated priority for that rule has actually changed. +5. Carry forward the "waived" entries that still apply. + +### STYLE GUIDE +{style_guide} + +### TOOLS (man page) +{tools_markup} + +### LATEST REVIEW (JSON) +{latest_review} + +### PREVIOUS REVIEW (JSON) +{previous_review} + +""" + + _OUTPUT_SCHEMA +) + + +class McpStyleComplianceScorer: + """Scores a tools spec against the MCP style guide using an LLM.""" + + def __init__(self, config: dict, global_models): + self.name = "mcp_style_compliance" + config = config or {} + self.model_config = config.get("model_config") or "" + if not self.model_config: + raise ValueError( + "model_config is required for the mcp_style_compliance scorer" + ) + self.model = get_generator(global_models, self.model_config) + + def evaluate( + self, + tools_markup: str, + style_guide: str, + product_name: str, + exceptions: list[dict] | None = None, + previous_feedback: str | None = None, + ) -> dict: + """Run the LLM compliance check and return a normalized feedback dict. + + When ``previous_feedback`` (a prior run's ``llm_feedback_json``) is + supplied, a dedicated second pass reconciles the fresh review with it so + findings stay consistent run-to-run. + """ + prompt = PROMPT_TEMPLATE.format( + style_guide=style_guide or "(no style guide provided)", + product_name=product_name or "(unknown)", + tools_markup=tools_markup or "(no tools)", + exceptions=json.dumps(exceptions or [], indent=2), + ) + raw = self._generate(prompt) + feedback = self._parse(raw) + + prev = (previous_feedback or "").strip() + if prev: + feedback = self._consistency_check( + tools_markup=tools_markup, + style_guide=style_guide, + latest=feedback, + previous=prev, + ) + return feedback + + def _consistency_check( + self, tools_markup: str, style_guide: str, latest: dict, previous: str + ) -> dict: + """Reconcile ``latest`` with the ``previous`` run's review (JSON string). + + Returns the merged feedback, or ``latest`` unchanged if the pass fails. + """ + try: + prompt = CONSISTENCY_PROMPT_TEMPLATE.format( + style_guide=style_guide or "(no style guide provided)", + tools_markup=tools_markup or "(no tools)", + latest_review=json.dumps(latest), + previous_review=previous, + ) + return self._parse(self._generate(prompt)) + except Exception as e: + logging.warning( + "mcp_style_compliance: consistency pass failed (%s); " + "using latest review.", + e, + ) + return latest + + def _generate(self, prompt: str) -> str: + """Generate the model response as raw JSON text. + + Prefer Gemini's native JSON mode via the underlying genai client, which + guarantees syntactically valid JSON and -- crucially -- bypasses + ``GeminiGenerator.generate``'s SQL sanitizer (it strips backslashes and + collapses whitespace, corrupting JSON escapes). Falls back to the + generic ``generate`` for non-Gemini models. + """ + client = getattr(self.model, "client", None) + caller = getattr(self.model, "_call_generate_content", None) + if client is not None and callable(caller): + try: + from google.genai import types + + config = types.GenerateContentConfig( + response_mime_type="application/json", + temperature=0, + ) + resp = caller(contents=prompt, config=config) + text = getattr(resp, "text", None) + if text: + return text + except Exception as e: + logging.warning( + "mcp_style_compliance: JSON-mode generation failed (%s); " + "falling back to plain generate().", + e, + ) + return self.model.generate(prompt) + + # ------------------------------------------------------------------ + # parsing / rendering + # ------------------------------------------------------------------ + @staticmethod + def _extract_json(text: str) -> dict: + """Pull a JSON object out of a model response (handles code fences).""" + if not text: + raise ValueError("empty model response") + text = text.strip() + # Strip ```json ... ``` or ``` ... ``` fences. + fence = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL) + if fence: + text = fence.group(1).strip() + try: + return json.loads(text) + except json.JSONDecodeError: + # Fallback: grab the outermost {...} span. + start = text.find("{") + end = text.rfind("}") + if start != -1 and end != -1 and end > start: + return json.loads(text[start : end + 1]) + raise + + def _parse(self, raw: str) -> dict: + """Normalize the model output into a stable feedback dict.""" + data = self._extract_json(raw) + findings = data.get("findings") or [] + if not isinstance(findings, list): + findings = [] + + def _count(sev): + return sum( + 1 + for f in findings + if isinstance(f, dict) + and str(f.get("severity", "")).upper() == sev + ) + + # Prefer counts derived from findings (authoritative); fall back to the + # model-reported integers if findings are absent. + p0 = _count("P0") or _safe_int(data.get("p0_issues")) + p1 = _count("P1") or _safe_int(data.get("p1_issues")) + p2 = _count("P2") or _safe_int(data.get("p2_issues")) + + return { + "compliance_score": _safe_int(data.get("compliance_score")), + "p0_issues": p0, + "p1_issues": p1, + "p2_issues": p2, + "findings": findings, + "waived": data.get("waived") or [], + "summary": data.get("summary", ""), + } + + @staticmethod + def to_html(feedback: dict) -> str: + """Render feedback as a severity-grouped HTML fragment.""" + if not feedback: + return "" + score = feedback.get("compliance_score", 0) + summary = html.escape(str(feedback.get("summary", ""))) + parts = [ + "
", + f"

Compliance score: {score}

", + f"

{summary}

", + "

P0: {p0}   P1: {p1}   P2: {p2}

".format( + p0=feedback.get("p0_issues", 0), + p1=feedback.get("p1_issues", 0), + p2=feedback.get("p2_issues", 0), + ), + ] + findings = feedback.get("findings") or [] + if findings: + parts.append( + "" + "" + "" + ) + for f in findings: + if not isinstance(f, dict): + continue + parts.append( + "" + "".format( + sev=html.escape(str(f.get("severity", ""))), + rule=html.escape(str(f.get("rule_id", ""))), + tool=html.escape(str(f.get("tool", ""))), + msg=html.escape(str(f.get("message", ""))), + sug=html.escape(str(f.get("suggestion", ""))), + ) + ) + parts.append("
SeverityRuleToolIssueSuggestion
{sev}{rule}{tool}{msg}{sug}
") + waived = feedback.get("waived") or [] + if waived: + parts.append("

Waived rules

    ") + for w in waived: + if not isinstance(w, dict): + continue + parts.append( + "
  • {rule}: {reason}
  • ".format( + rule=html.escape(str(w.get("rule_id", ""))), + reason=html.escape(str(w.get("reason", ""))), + ) + ) + parts.append("
") + parts.append("
") + return "".join(parts) + + +def _safe_int(value) -> int: + try: + return int(value) + except (TypeError, ValueError): + return 0 diff --git a/evalbench/test/mcp_readability_test.py b/evalbench/test/mcp_readability_test.py new file mode 100644 index 00000000..a4f07e34 --- /dev/null +++ b/evalbench/test/mcp_readability_test.py @@ -0,0 +1,413 @@ +import json +import os +import sys +import tempfile +from unittest.mock import patch + +import pandas as pd +import pytest + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Repo root (two levels up from evalbench/evalbench/test/) so dataset paths +# resolve regardless of the pytest working directory. +_REPO_ROOT = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) + + +def _ds(rel): + return os.path.join(_REPO_ROOT, rel) + +from evaluator.mcp_readability.enums import ( + CheckStatus, + EndpointType, + Environment, + coerce_endpoint_type, + coerce_environment, +) +from evaluator.mcp_readability import exceptions as exc_mod +from evaluator.mcp_readability.token_estimator import ( + estimate_tokens, + token_budget_used_percent, +) +from mcp import types as mcp_types +from generators.models.mcp_tools import McpToolsGenerator, McpToolsError +from generators.models.mcp_tool_formatter import format_tools_to_man_page +from scorers.mcp_style_compliance import McpStyleComplianceScorer +from reporting.mcp_readability_csv import COLUMNS, write_compliance_csv + + +# -------------------------------------------------------------------------- +# Enums +# -------------------------------------------------------------------------- +def test_enum_coercion(): + assert coerce_endpoint_type("REMOTE") is EndpointType.REMOTE + assert coerce_endpoint_type("local") is EndpointType.LOCAL + assert coerce_endpoint_type("bogus") is EndpointType.ENDPOINT_TYPE_UNSPECIFIED + assert coerce_endpoint_type(None) is EndpointType.ENDPOINT_TYPE_UNSPECIFIED + assert coerce_environment("autopush") is Environment.AUTOPUSH + assert coerce_environment("PROD") is Environment.PROD + + +def test_check_status_members(): + # check_status reflects whether the eval ran, not compliance findings. + assert {s.name for s in CheckStatus} == { + "CHECK_STATUS_UNSPECIFIED", + "SUCCESS", + "FETCH_ERROR", + "ANALYSIS_ERROR", + "INTERNAL_ERROR", + } + + +# -------------------------------------------------------------------------- +# Token estimation +# -------------------------------------------------------------------------- +def test_token_math(): + assert estimate_tokens("") == 0 + assert estimate_tokens("abcd") == 1 + assert token_budget_used_percent(0, 0) == 0.0 + assert token_budget_used_percent(250, 25000) == 1.0 + assert token_budget_used_percent(210, 25000) == 0.84 + + +# -------------------------------------------------------------------------- +# Exceptions matching +# -------------------------------------------------------------------------- +def test_exceptions_matching(): + all_exc = [ + {"endpoint_url": "https://a/mcp", "rule_id": "R1", "reason": "x"}, + {"environment": "AUTOPUSH", "rule_id": "R2", "reason": "y"}, + {"rule_id": "R3", "reason": "global"}, # match-all + {"reason": "no rule id, ignored"}, + ] + ep = {"endpoint_url": "https://a/mcp", "environment": "PROD"} + matched = {e["rule_id"] for e in exc_mod.applicable_exceptions(ep, all_exc)} + assert matched == {"R1", "R3"} + + ep2 = {"endpoint_url": "https://b/mcp", "environment": "AUTOPUSH"} + matched2 = {e["rule_id"] for e in exc_mod.applicable_exceptions(ep2, all_exc)} + assert matched2 == {"R2", "R3"} + + +# -------------------------------------------------------------------------- +# URL sanitization +# -------------------------------------------------------------------------- +def test_sanitize_url(): + s = McpToolsGenerator.sanitize_url + assert s("example.com") == "https://example.com/mcp" + assert s("https://x.com/mcp") == "https://x.com/mcp" + assert s("https://x.com/mcp/") == "https://x.com/mcp" + assert s("localhost:8000") == "http://localhost:8000/mcp" + assert s("https://x.com") == "https://x.com/mcp" + + +# -------------------------------------------------------------------------- +# Formatter (man-page markup) +# -------------------------------------------------------------------------- +def test_formatter_renders_enum_default(): + tool = mcp_types.Tool( + name="t", + description="d", + inputSchema={ + "type": "object", + "properties": { + "mode": { + "type": "string", + "enum": ["a", "b"], + "default": "a", + "description": "the mode", + } + }, + "required": ["mode"], + }, + ) + mpr = format_tools_to_man_page([tool]) + assert mpr.total_tools == 1 + assert "TOOL: t" in mpr.man_page + assert "enum:" in mpr.man_page and '"a"' in mpr.man_page + assert "default:" in mpr.man_page + + +# -------------------------------------------------------------------------- +# Generator (file source) +# -------------------------------------------------------------------------- +def test_generator_file_source(): + gen = McpToolsGenerator({}) + endpoint = { + "tools_source": { + "type": "file", + "path": _ds("datasets/mcp_readability/sample_tools.yaml"), + } + } + tools, mpr = gen.fetch_tools(endpoint, {}) + assert mpr.total_tools == 3 + assert all(isinstance(t, mcp_types.Tool) for t in tools) + assert {t.name for t in tools} == {"list_datasets", "RunQuery", "delete_table"} + assert "TOOL: RunQuery" in mpr.man_page + + +def test_generator_missing_file_raises(): + gen = McpToolsGenerator({}) + with pytest.raises(McpToolsError): + gen.fetch_tools({"tools_source": {"type": "file", "path": "nope.yaml"}}, {}) + + +def test_generator_mcp_source_mocked(): + """The live `mcp` path runs the SDK fetch; mock it so no network is needed.""" + gen = McpToolsGenerator({}) + + async def _fake_fetch(self, url, headers): + assert url == "https://example.com/mcp" + return [ + mcp_types.Tool( + name="t1", + description="d", + inputSchema={"type": "object", "properties": {}}, + ) + ] + + with patch.object(McpToolsGenerator, "_async_fetch_tools", _fake_fetch): + endpoint = {"endpoint_url": "example.com", "tools_source": {"type": "mcp"}} + tools, mpr = gen.fetch_tools(endpoint, {}) + assert [t.name for t in tools] == ["t1"] + assert mpr.total_tools == 1 + + +# -------------------------------------------------------------------------- +# Scorer parsing / html (no LLM call) +# -------------------------------------------------------------------------- +def test_scorer_parse_and_html(): + raw = """```json + {"compliance_score": 75, "findings": [ + {"severity": "P0", "rule_id": "P0-X", "tool": "t", "message": "m", "suggestion": "s"}, + {"severity": "P2", "rule_id": "P2-Y", "tool": "t", "message": "m", "suggestion": "s"} + ], "waived": [], "summary": "ok"} + ```""" + # Build a scorer without invoking the LLM constructor path. + scorer = McpStyleComplianceScorer.__new__(McpStyleComplianceScorer) + fb = scorer._parse(raw) + assert fb["p0_issues"] == 1 + assert fb["p2_issues"] == 1 + assert fb["compliance_score"] == 75 + html = McpStyleComplianceScorer.to_html(fb) + assert "", + "error_message": "", + } + ] + with tempfile.TemporaryDirectory() as d: + path = write_compliance_csv(rows, d, "job1") + df = pd.read_csv(path) + assert list(df.columns) == COLUMNS + assert df.iloc[0]["p1_issues"] == 1 + assert abs(df.iloc[0]["token_budget_used_percent"] - 0.84) < 1e-9 + + +# -------------------------------------------------------------------------- +# End-to-end orchestrator with stubbed LLM +# -------------------------------------------------------------------------- +class _FakeLLM: + def generate(self, prompt): + return json.dumps( + { + "compliance_score": 80, + "findings": [ + {"severity": "P1", "rule_id": "P1-A", "tool": "RunQuery", + "message": "m", "suggestion": "s"}, + ], + "waived": [{"rule_id": "P2-TOOL-DESC-LENGTH", "reason": "r"}], + "summary": "fine", + } + ) + + +def test_orchestrator_end_to_end(): + import yaml + + with patch( + "scorers.mcp_style_compliance.get_generator", return_value=_FakeLLM() + ): + from evaluator import get_orchestrator + + # Use a self-contained offline endpoints file (file source) so the test + # stays deterministic and independent of the production endpoint list. + with tempfile.TemporaryDirectory() as d: + ep_path = os.path.join(d, "endpoints.yaml") + with open(ep_path, "w") as f: + yaml.safe_dump( + { + "defaults": { + "endpoint_type": "REMOTE", + "tools_source": {"type": "file"}, + }, + "endpoints": [ + { + "product_name": "Sample", + "endpoint_url": "file://sample", + "environment": "DEV", + "tools_source": { + "type": "file", + "path": _ds( + "datasets/mcp_readability/sample_tools.yaml" + ), + }, + } + ], + }, + f, + ) + config = { + "orchestrator": "mcp_readability", + "endpoints_config": ep_path, + "style_guide": _ds("datasets/mcp_readability/style_guide.md"), + "exceptions_config": _ds("datasets/mcp_readability/exceptions.yaml"), + "tools_generator_config": _ds( + "datasets/mcp_readability/tools_generator.yaml" + ), + "token_budget": 25000, + "scorers": {"mcp_style_compliance": {"model_config": "unused"}}, + "runners": {"endpoint_runners": 2}, + "reporting": {"csv": {"output_directory": d}}, + } + orch = get_orchestrator(config, [], {}) + orch.evaluate([]) + job_id, _, a, b, c = orch.process() + assert (a, b, c) == (None, None, None) + df = pd.read_csv( + os.path.join(d, job_id, "mcp_readability_compliance.csv") + ) + assert list(df.columns) == COLUMNS + row = df.iloc[0] + assert row["check_status"] == "SUCCESS" + assert row["endpoint_type"] == "REMOTE" # inherited from defaults + assert int(row["total_tools"]) == 3 + assert int(row["p1_issues"]) == 1 + assert int(row["compliance_score"]) == 80 + + +def test_orchestrator_fetch_error(): + import yaml + + with patch( + "scorers.mcp_style_compliance.get_generator", return_value=_FakeLLM() + ): + from evaluator import get_orchestrator + + with tempfile.TemporaryDirectory() as d: + ep_path = os.path.join(d, "endpoints.yaml") + with open(ep_path, "w") as f: + yaml.safe_dump( + { + "defaults": {"tools_source": {"type": "file"}}, + "endpoints": [ + { + "product_name": "Bad", + "endpoint_url": "x", + "endpoint_type": "REMOTE", + "environment": "PROD", + "tools_source": { + "type": "file", + "path": os.path.join(d, "missing.yaml"), + }, + } + ], + }, + f, + ) + config = { + "orchestrator": "mcp_readability", + "endpoints_config": ep_path, + "style_guide": _ds("datasets/mcp_readability/style_guide.md"), + "tools_generator_config": _ds( + "datasets/mcp_readability/tools_generator.yaml" + ), + "token_budget": 25000, + "scorers": {"mcp_style_compliance": {"model_config": "unused"}}, + "reporting": {"csv": {"output_directory": d}}, + } + orch = get_orchestrator(config, [], {}) + orch.evaluate([]) + job_id, _, _, _, _ = orch.process() + df = pd.read_csv( + os.path.join(d, job_id, "mcp_readability_compliance.csv") + ) + row = df.iloc[0] + assert row["check_status"] == "FETCH_ERROR" + assert "not found" in str(row["error_message"]).lower() diff --git a/pyproject.toml b/pyproject.toml index 6777fb57..bb0ba1fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,10 @@ dependencies = [ # (urllib3.contrib.pyopenssl.inject_into_urllib3). Lift when google-auth # stops using the deprecated inject pattern. "pyOpenSSL<26.2", + # MCP readability check: official MCP SDK + async HTTP transport. + "mcp", + "httpx", + "anyio", ] [tool.uv.sources]