From f455862cd050f03c81c3e1c7df5414fb2b6d0e21 Mon Sep 17 00:00:00 2001
From: Akangsha Goel <akangshagoel@google.com>
Date: Wed, 24 Jun 2026 11:12:32 -0700
Subject: [PATCH] feat(mcp-readability): add MCP style-guide compliance eval

Add an `mcp_readability` orchestrator to EvalBench that checks MCP
endpoints against the Data Cloud MCP style guide and reports compliance
metrics (P0/P1/P2 counts, tool count, token budget usage, score).

- Tool fetching uses the official `mcp` SDK over Streamable HTTP
  (`streamable_http_client` + `ClientSession.list_tools`) with URL
  sanitization and ADC auth; a `file` source supports offline runs.
- Tools are rendered to man-page markup (resolves $ref/$defs, flattens
  nesting, renders enum/default) which feeds the LLM scorer.
- Scorer uses Vertex Gemini JSON mode, anchors severity to style-guide
  priority annotations, supports per-endpoint waivers, evaluates from an
  LLM-agent-consumption perspective, and runs a dedicated consistency
  second pass against prior feedback when available.
- Results write to a compliance CSV and, when `reporting.bigquery` is
  configured, append to the existing eval BigQuery table.
- Endpoint list mirrors monitored_endpoints.textproto (24 endpoints).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 datasets/mcp_readability/endpoints.yaml       | 124 ++++
 datasets/mcp_readability/exceptions.yaml      |  24 +
 .../model_configs/gemini_2.5_pro_model.yaml   |   4 +
 datasets/mcp_readability/run_config.yaml      |  43 ++
 datasets/mcp_readability/sample_tools.yaml    |  37 +
 datasets/mcp_readability/style_guide.md       | 699 ++++++++++++++++++
 datasets/mcp_readability/tools_generator.yaml |   7 +
 evalbench/evalbench.py                        |  20 +-
 evalbench/evaluator/__init__.py               |   5 +
 .../evaluator/mcp_readability/__init__.py     |   3 +
 evalbench/evaluator/mcp_readability/enums.py  |  89 +++
 .../evaluator/mcp_readability/exceptions.py   |  71 ++
 .../evaluator/mcp_readability/orchestrator.py | 334 +++++++++
 .../mcp_readability/token_estimator.py        |  33 +
 evalbench/generators/models/__init__.py       |   2 +
 .../generators/models/mcp_tool_formatter.py   | 341 +++++++++
 evalbench/generators/models/mcp_tools.py      | 229 ++++++
 evalbench/reporting/mcp_readability_csv.py    |  98 +++
 evalbench/scorers/mcp_style_compliance.py     | 360 +++++++++
 evalbench/test/mcp_readability_test.py        | 413 +++++++++++
 pyproject.toml                                |   4 +
 21 files changed, 2938 insertions(+), 2 deletions(-)
 create mode 100644 datasets/mcp_readability/endpoints.yaml
 create mode 100644 datasets/mcp_readability/exceptions.yaml
 create mode 100644 datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml
 create mode 100644 datasets/mcp_readability/run_config.yaml
 create mode 100644 datasets/mcp_readability/sample_tools.yaml
 create mode 100644 datasets/mcp_readability/style_guide.md
 create mode 100644 datasets/mcp_readability/tools_generator.yaml
 create mode 100644 evalbench/evaluator/mcp_readability/__init__.py
 create mode 100644 evalbench/evaluator/mcp_readability/enums.py
 create mode 100644 evalbench/evaluator/mcp_readability/exceptions.py
 create mode 100644 evalbench/evaluator/mcp_readability/orchestrator.py
 create mode 100644 evalbench/evaluator/mcp_readability/token_estimator.py
 create mode 100644 evalbench/generators/models/mcp_tool_formatter.py
 create mode 100644 evalbench/generators/models/mcp_tools.py
 create mode 100644 evalbench/reporting/mcp_readability_csv.py
 create mode 100644 evalbench/scorers/mcp_style_compliance.py
 create mode 100644 evalbench/test/mcp_readability_test.py

diff --git a/datasets/mcp_readability/endpoints.yaml b/datasets/mcp_readability/endpoints.yaml
new file mode 100644
index 00000000..8acd8ee2
--- /dev/null
+++ b/datasets/mcp_readability/endpoints.yaml
@@ -0,0 +1,124 @@
+# MCP endpoints to check against the Data Cloud MCP style guide.
+#
+# Mirrors cloud/databases/mcp/readability/monitored_endpoints.textproto
+# (EndpointList). The proto `type` maps to `environment`; all live endpoints are
+# fetched via the `mcp` source over Streamable HTTP using ADC auth.
+#
+# `defaults` provides values shared by all endpoints; each endpoint may override
+# any of them. `tools_source.type` makes the tool listing pluggable:
+#   - mcp:  fetch tools/list from a live MCP server (official mcp SDK). Needs
+#           network access and (for these googleapis.com endpoints) ADC auth.
+#   - file: read a local YAML/JSON tools spec (offline / deterministic).
+
+defaults:
+  token_budget: 200000
+  endpoint_type: REMOTE       # EndpointType: REMOTE | LOCAL
+  tools_source:
+    type: mcp                 # mcp | file
+    auth: google_credentials  # ADC bearer + x-goog-user-project
+
+endpoints:
+  - product_name: "AlloyDB for PostgreSQL"
+    endpoint_url: "https://alloydb.us-central1.rep.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "BigQuery"
+    endpoint_url: "https://bigquery.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "BigQuery Data Transfer Service"
+    endpoint_url: "https://bigquerydatatransfer.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "BigQuery Migration Service"
+    endpoint_url: "https://bigquerymigration.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Bigtable"
+    endpoint_url: "https://bigtableadmin.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Cloud SQL"
+    endpoint_url: "https://sqladmin.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Firestore"
+    endpoint_url: "https://firestore.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Spanner"
+    endpoint_url: "https://spanner.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Data Migration Service"
+    endpoint_url: "https://datamigration.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Datastream"
+    endpoint_url: "https://datastream.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Oracle Database"
+    endpoint_url: "https://oracledatabase.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Memory store for Redis/Redis Cluster"
+    endpoint_url: "https://redis.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Memory store for Valkey"
+    endpoint_url: "https://memorystore.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Pub/Sub"
+    endpoint_url: "https://pubsub.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Managed Kafka"
+    endpoint_url: "https://managedkafka.us-central1.rep.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Cloud Composer"
+    endpoint_url: "https://us-central1-staging-composer.sandbox.googleapis.com/mcp"
+    environment: STAGING
+  - product_name: "Cloud Composer"
+    endpoint_url: "https://composer.us-central1.rep.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Knowledge Catalog"
+    endpoint_url: "https://dataplex.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Knowledge Catalog Data Products"
+    endpoint_url: "https://staging-dataplex.sandbox.googleapis.com/mcp"
+    environment: STAGING
+
+  - product_name: "Database Insights (EGO for AlloyDB Tools)"
+    endpoint_url: "https://autopush-databaseinsights.sandbox.googleapis.com/mcp"
+    environment: AUTOPUSH
+
+  - product_name: "DB: AI Assisted Troubleshooting"
+    endpoint_url: "https://databaseinsights.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Cloud Storage MCP Server"
+    endpoint_url: "https://storage.googleapis.com/storage/mcp"
+    environment: PROD
+
+  - product_name: "Database Center API for MCP"
+    endpoint_url: "https://databasecenter.googleapis.com/mcp"
+    environment: PROD
+
+  - product_name: "Dataproc"
+    endpoint_url: "https://dataproc.googleapis.com/mcp"
+    environment: PROD
+
+  # --- Offline / deterministic sample (no network or auth) ------------------
+  # Uncomment to smoke-test the pipeline without hitting live endpoints.
+  # - product_name: "Sample Data Cloud MCP (local file)"
+  #   endpoint_url: "file://datasets/mcp_readability/sample_tools.yaml"
+  #   endpoint_type: LOCAL
+  #   environment: DEV
+  #   tools_source:
+  #     type: file
+  #     path: datasets/mcp_readability/sample_tools.yaml
diff --git a/datasets/mcp_readability/exceptions.yaml b/datasets/mcp_readability/exceptions.yaml
new file mode 100644
index 00000000..1b5aaebb
--- /dev/null
+++ b/datasets/mcp_readability/exceptions.yaml
@@ -0,0 +1,24 @@
+# Style-rule exceptions (waivers).
+#
+# Each entry waives one style `rule_id` for the endpoints it matches. A waived
+# rule is NOT counted toward p0/p1/p2 issues and is reported separately under
+# "waived" in the LLM feedback.
+#
+# Matchers (all that are present must match; an absent field or "*" = match-all):
+#   - endpoint_url
+#   - product_name
+#   - environment
+
+# `rule_id` should match the style guide's section heading text (the scorer uses
+# headings as rule IDs), e.g. "Tool Names", "Concise Descriptions",
+# "Use Enums", "Limit Options".
+exceptions:
+  # Example: waive the tool-naming rule for one specific endpoint.
+  - product_name: "Sample Data Cloud MCP (local file)"
+    rule_id: "Tool Names"
+    reason: "Sample fixture intentionally uses a non-compliant tool name (RunQuery)."
+
+  # Example: waive the enum rule for every autopush endpoint.
+  - environment: AUTOPUSH
+    rule_id: "Use Enums"
+    reason: "Autopush builds defer enum constraints until promotion to prod."
diff --git a/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml b/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml
new file mode 100644
index 00000000..26eb0b7f
--- /dev/null
+++ b/datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml
@@ -0,0 +1,4 @@
+generator: gcp_vertex_gemini
+vertex_model: gemini-2.5-pro
+base_prompt: ""
+execs_per_minute: 5
diff --git a/datasets/mcp_readability/run_config.yaml b/datasets/mcp_readability/run_config.yaml
new file mode 100644
index 00000000..2f7c3af1
--- /dev/null
+++ b/datasets/mcp_readability/run_config.yaml
@@ -0,0 +1,43 @@
+############################################################
+### MCP Style-Guide Compliance (Readability) Run Config
+############################################################
+# Drives the `mcp_readability` orchestrator: fetch each endpoint's tools (man-page
+# markup), evaluate it against the style guide with an LLM, and write a compliance
+# CSV (and optionally append the rows to the eval BigQuery table).
+
+orchestrator: mcp_readability
+
+# Inputs
+endpoints_config: datasets/mcp_readability/endpoints.yaml
+style_guide: datasets/mcp_readability/style_guide.md
+exceptions_config: datasets/mcp_readability/exceptions.yaml          # optional
+tools_generator_config: datasets/mcp_readability/tools_generator.yaml
+
+# Default token budget for token_budget_used_percent (endpoints may override).
+token_budget: 200000
+
+# Optional: path to a prior compliance CSV; prior feedback is fed to the LLM to
+# keep findings consistent run-to-run.
+previous_results_csv: null
+
+# Optional: only check endpoints whose environment is in this list, e.g. [PROD].
+environments: []
+
+# LLM used by the compliance scorer.
+scorers:
+  mcp_style_compliance:
+    model_config: datasets/mcp_readability/model_configs/gemini_2.5_pro_model.yaml
+
+# Endpoint check concurrency.
+runners:
+  endpoint_runners: 4
+
+# Output: results/<job_id>/mcp_readability_compliance.csv
+# Add a `bigquery` block to also append the compliance rows to the standard eval
+# table (<gcp_project_id>.evalbench.results); the schema auto-evolves to add the
+# compliance columns. No new dataset/table is created.
+reporting:
+  csv:
+    output_directory: 'results'
+  # bigquery:
+  #   gcp_project_id: senseai-team
diff --git a/datasets/mcp_readability/sample_tools.yaml b/datasets/mcp_readability/sample_tools.yaml
new file mode 100644
index 00000000..6a2ea490
--- /dev/null
+++ b/datasets/mcp_readability/sample_tools.yaml
@@ -0,0 +1,37 @@
+# Sample MCP tools spec for offline, deterministic testing of the readability
+# check (tools_source.type: file). Intentionally mixes good and bad style so the
+# LLM has something to flag.
+tools:
+  - name: list_datasets
+    description: "List all BigQuery datasets in the given project. Use this to discover available datasets before querying."
+    inputSchema:
+      type: object
+      properties:
+        project_id:
+          type: string
+          description: "The Google Cloud project ID to list datasets from."
+      required: ["project_id"]
+
+  - name: RunQuery
+    description: "runs sql"
+    inputSchema:
+      type: object
+      properties:
+        q:
+          type: string
+
+  - name: delete_table
+    description: "Deletes a table."
+    inputSchema:
+      type: object
+      properties:
+        project_id:
+          type: string
+          description: "Project ID."
+        dataset_id:
+          type: string
+          description: "Dataset ID."
+        table_id:
+          type: string
+          description: "Table ID."
+      required: ["project_id", "dataset_id", "table_id"]
diff --git a/datasets/mcp_readability/style_guide.md b/datasets/mcp_readability/style_guide.md
new file mode 100644
index 00000000..7df1bf36
--- /dev/null
+++ b/datasets/mcp_readability/style_guide.md
@@ -0,0 +1,699 @@
+# Data Cloud MCP Tool Style Guide
+
+[TOC]
+
+*   **Go link:**
+    [go/mcp-datacloud-styleguide](http://go/mcp-datacloud-styleguide)
+
+## Style Principles
+
+This document provides style guidelines and best practices for Data Cloud teams
+building MCP (Model Context Protocol) tools.
+
+> [!NOTE] The tool descriptions and examples in this guide are generated using
+> the `experimental/users/kvg/mcp_readability` utility. You can use this tool to
+> view the API of your own MCP tools in a readable format by running it against
+> your MCP server endpoint:
+>
+> ```bash
+> blaze run //experimental/users/kvg/mcp_readability:mcp_list_tools -- \
+>     --url="https://your-mcp-server.googleapis.com/mcp" \
+>     --auth \
+>     --project="your-gcp-project"
+> ```
+>
+> You can use the following tool to get LLM generated recommendations (AI
+> evaluation) based on this style guide for your MCP tools by running it against
+> your MCP server endpoint. This tool requires a Gemini API key to run the AI
+> evaluation:
+>
+> ```bash
+> blaze run cloud/databases/mcp/readability:run_mcp_readability_check -- \
+>     --gemini_api_key <gemini_api_key> check_url \
+>     --url "https://your-mcp-server.googleapis.com/mcp" \
+>     [--tools <tool_names>] [--auth] [--project <project_id>]
+> ```
+
+There are a few overarching principles that summarize how to think about
+building good MCP tools. The following are attributes of good tools, in order of
+importance:
+
+1.  **Clarity**: The tool's purpose, parameters, and error messages are clear to
+    the reasoning engine.
+2.  **Simplicity**: Tools and their parameters are simple enough to reduce
+    cognitive load on the model, minimize hallucination risks, and avoid
+    excessive token usage, while remaining comprehensive enough to complete a
+    task.
+3.  **Security**: Tools are secure by default, do not expose secrets, and use
+    granular protections.
+4.  **Maintainability**: Tools are built in a consistent manner, relying on
+    standardized platforms, making them easier to maintain across teams.
+
+## Platform & Constraints
+
+### Platform to build tools <!-- priority: p0 -->
+
+All teams within data cloud **MUST** be using either OneMCP or MCP Toolbox for
+tool building. Bespoke solutions are **NOT** supported.
+
+### Tool Count Limits <!-- priority: p0 -->
+
+The current rule of thumb is to keep it to 40 tools per server as an upper
+limit, though even this amount may cause performance issues. Performance
+degrades as more tools are added, so teams **SHOULD** heavily weigh adding new
+tools against the negative impact on tool accuracy until other mechanisms are in
+place to deal with this.
+
+In the future, we aspire for this number to be reduced to 5-8 tools per server
+as toolsets/skills capabilities improve in MCP Toolbox and OneMCP, though this
+is not a requirement today. Until overall platform tool handling improves, teams
+**SHOULD** minimize tool count as much as possible.
+
+## Tool Design & API
+
+MCP tools **MUST** be high quality, reliable, and easy for agents to use.
+
+### Tool Names <!-- priority: p1 for <action>_<resource>, p2 for snake_case -->
+
+Teams **MUST** use `snake_case` (`<action>_<resource>`) for tool names. You
+**SHOULD NOT** use product-specific prefixes, as the agent can automatically
+disambiguate by prefixing the tool name with the MCP server name.
+
+```text {.bad}
+================================================================================
+TOOL: my_service_create_instance
+================================================================================
+================================================================================
+TOOL: my_service_execute_sql
+================================================================================
+```
+
+```text {.good}
+================================================================================
+TOOL: create_instance
+================================================================================
+================================================================================
+TOOL: execute_sql
+================================================================================
+```
+
+All new tools **MUST** be registered in the
+[Data Cloud OneMCP: Tools and Commitment Dashboard](https://docs.google.com/spreadsheets/d/1rXWhXONd5xqCpU-oJJ8eTSWtL4QVm9yuavjur-p_NXY){.external}.
+
+### Keep tools CUJ-focused <!-- priority: p0 -->
+
+Tools **SHOULD** be focused on a specific CUJ or goal, like creating a resource
+or fetching monitoring data. Aim for tools that are granular enough to be easily
+understood and used by agents, but comprehensive enough to complete a task in
+one go if possible. For example, if a CUJ requires multiple API calls, consider
+creating one tool that orchestrates these calls to reduce chances of multi-tool
+failure.
+
+However, avoid bundling unrelated actions into a single tool.
+
+```text {.bad}
+================================================================================
+TOOL: manage_instance
+================================================================================
+DESCRIPTION:
+  Creates, deletes, or retrieves a database instance depending on the action
+  parameter.
+
+PARAMETERS:
+  action (string) [REQUIRED]
+      Required. The action to perform (create, delete, get).
+
+  name (string) [REQUIRED]
+      Required. Name of the database instance. This does not include the
+      project ID.
+```
+
+```text {.good}
+================================================================================
+TOOL: get_instance
+================================================================================
+DESCRIPTION:
+  Get the details of a Cloud SQL instance.
+
+PARAMETERS:
+  instance (string) [REQUIRED]
+      Required. Database instance ID. This does not include the project_id.
+
+  project_id (string) [REQUIRED]
+      Required. Project ID of the project that contains the instance.
+```
+
+### Idempotency <!-- priority: p1 -->
+
+Whenever possible, tools **SHOULD** be idempotent.
+
+**Why?** Calling the same tool with the same parameters multiple times should
+have the same result as calling it once. This is especially important for create
+and delete operations. For example, if an agent calls `create_resource` multiple
+times with the same parameters, only one resource should be created. If the
+resource already exists, the tool should return a success status code rather
+than an error code (e.g. `ALREADY_EXISTS`). Returning success on successive
+attempts allows agents to safely retry operations without causing tool execution
+to fail. It is acceptable for the tool response to indicate that the resource
+already existed rather than being created by the current call.
+
+### Parameters
+
+Parameters **MUST** have clear and meaningful names. Avoid ambiguity that could
+lead to accidental insecure configurations or accidental destructive operations.
+
+#### Naming Convention <!-- priority: p2 -->
+
+Teams should use `snake_case` for all parameter names. This aligns with Python
+naming conventions (the most-used language for MCP servers) and provides a
+predictable, standardized interface for the reasoning engine.
+
+```text {.bad}
+PARAMETERS:
+  projectId (string) [REQUIRED]
+      Required. Project ID of the project.
+```
+```text {.good}
+PARAMETERS:
+  project_id (string) [REQUIRED]
+      Required. Project ID of the project.
+```
+
+#### Understandability <!-- priority: p0 -->
+
+A good rule of thumb is: if a human cannot easily understand a parameter without
+reading the documentation, the name is likely too ambiguous for a model as well.
+
+Directly mapping to existing enums or field names from underlying OnePlatform
+APIs is often an antipattern. These internal names are typically designed for
+system-to-system communication and backwards compatibility, not for semantic
+understanding by an LLM.
+
+```text {.bad}
+PARAMETERS:
+  write_disposition (string) [OPTIONAL]
+      Optional. Specifies the action that occurs if the destination table
+      already exists. Can be ['WRITE_TRUNCATE', 'WRITE_APPEND', 'WRITE_EMPTY'].
+```
+
+```text {.good}
+PARAMETERS:
+  if_table_exists (string) [OPTIONAL]
+      Optional. Action to take if the destination table already exists.
+      Can be ['overwrite', 'append', 'fail'].
+```
+
+**Explain Domain Terminology** Do not assume the agent understands your
+specific product's architecture or internal naming conventions. If a tool relies
+on domain-specific concepts (e.g., "Exadata," "Static-IP Connectivity,"
+"Statement Suggestions"), you MUST provide a brief, plain-English explanation of
+what the concept is and *why* the agent would use it.
+
+To prevent tool descriptions from becoming excessively long, keep these
+explanations concise.
+
+Create evaluations
+([evalbench](http://g3doc/cloud/databases/mcp/g3doc/evalbench_overview.md))
+to understand how well an agent handles domain terminology, and to test what
+level of explanations you need to include in the tool description.
+
+#### Query Languages and Custom Filters <!-- priority: p0 -->
+
+If a tool takes a custom query string as an input parameter (e.g., AIP-160,
+PromQL, EBNF syntax):
+
+1.  **Provide Concrete Examples**: You MUST include 2-3 fully-formed examples of
+    valid query strings in the parameter description.
+
+2.  **Explain Literals**: If the filter requires specific internal string
+    literals, list the most common valid literals directly in the description.
+
+3.  **Iterative Discovery**: If the domain is too large to document in the
+    description (e.g., thousands of PromQL metrics), you SHOULD provide a
+    secondary "discovery" tool(e.g., `list_available_metrics`) so the agent can
+    look up the correct literals before composing its filter.
+
+Evaluating agent performance on query generation is crucial for understanding
+how much detail and how many examples are necessary to include in tool
+descriptions. You SHOULD use evaluations to test how well agents handle query
+languages and custom filters with varying levels of detail in the description.
+
+*   *Note*: A fuller explanation of complex query languages might be better
+    suited as a separate skill.
+
+**Examples:**
+
+```text {.good}
+PARAMETERS:
+  filter (string) [OPTIONAL]
+      Optional. An AIP-160 filter string. Examples:
+      - 'labels.env = "prod" AND labels.team = "data"'
+      - 'create_time > "2024-01-01T00:00:00Z"'
+```
+
+```text {.good}
+PARAMETERS:
+  query (string) [REQUIRED]
+      Required. A PromQL query. Examples:
+      - 'up{job="kubernetes-pods"}'
+      - 'sum(rate(http_requests_total[5m]))'
+```
+
+#### Avoid complex parameters <!-- priority: p0 -->
+
+Avoid complex parameter types like large, nested protocol buffers or overly
+complex dictionaries. These increase the likelihood of parsing errors, consume
+unnecessary tokens, and cause model confusion. Stick to simple primitives
+(strings, integers, booleans) wherever possible. Simple maps (e.g., for labels
+or simple key-value configurations) are acceptable if they are well-described
+and do not introduce large, nested proto structures.
+
+```text {.good}
+// GOOD: Simple maps for flat key-value data
+PARAMETERS:
+  labels (object) [OPTIONAL]
+      Optional. Labels to apply to the instance. A simple map of string keys
+      to string values (e.g., {"env": "prod", "team": "data"}).
+```
+
+**Opinionated Defaults for Resource Creation** When wrapping `Create` endpoints
+for complex resources, DO NOT expose the entire OnePlatform proto schema as
+tool parameters. Instead, provide sensible and secure defaults for the
+resource configuration on the server side, and only expose parameters for
+the 5 to 8 most critical settings the agent needs to configure
+(e.g., `machine_type`, `region`, `memory_gb`). If providing sensible defaults
+is not sufficient to reduce complexity, you **SHOULD** split functionality into
+separate, purpose-built MCP tools rather than creating one complex tool.
+
+```text {.bad}
+// BAD: Exposing a massive, complex proto object
+PARAMETERS:
+  cluster_config (object) [REQUIRED]
+      Required. The complete cluster configuration proto, including nested
+      fields for master_config, worker_config, software_config, etc.
+```
+
+```text {.good}
+// GOOD: Opinionated defaults and simple parameters
+PARAMETERS:
+  cluster_name (string) [REQUIRED]
+      Required. The name of the cluster.
+
+  node_count (integer) [OPTIONAL]
+      Optional. The number of worker nodes. Defaults to 3.
+
+  machine_type (string) [OPTIONAL]
+      Optional. The machine type to use for nodes. Defaults to 'n1-standard-4'.
+```
+
+**Restructuring Complex Protos** Instead of passing a
+complex `Settings` proto for instance creation, flatten the critical
+fields:
+
+```text {.bad}
+PARAMETERS:
+  settings (object) [OPTIONAL]
+      Optional. Instance settings proto containing tier, pricingPlan,
+      ipConfiguration (which itself contains authorizedNetworks, ipv4Enabled, etc).
+```
+
+```text {.good}
+PARAMETERS:
+  tier (string) [REQUIRED]
+      Required. The service tier (e.g., 'db-custom-1-3840').
+
+  enable_public_ip (boolean) [OPTIONAL]
+      Optional. Whether to enable public IP. Defaults to false.
+
+  authorized_networks (array) [OPTIONAL]
+      Optional. List of authorized CIDR blocks.
+```
+
+#### Hide Useless and Deprecated Fields <!-- priority: p1 -->
+
+You MUST NOT expose parameters to the agent that it cannot meaningfully use.
+Leaking backend or OnePlatform requirements into the AI surface increases
+cognitive load and token waste.
+
+-   **Idempotency Tokens & UUIDs**: Do not ask the agent to generate UUIDs or
+    `request_id`s for idempotency. Handle idempotency and request deduplication
+    inside the MCP server or adapter layer. These should NOT be included in the
+    input payload schema.
+-   **Deprecated Fields**: Strip all deprecated fields from your schemas.
+
+#### Use Human-Readable Time and Durations <!-- priority: p0 -->
+
+When designing parameters for dates, times, or durations, you **MUST NOT** use
+low-level Unix timestamps (e.g., milliseconds or nanoseconds). Instead, use
+higher-order time units (seconds, minutes, hours, days) or standard
+human-readable date-time formats.
+
+LLMs reason much better with formatted dates, times, and higher-order durations
+than they do with raw Unix timestamps. Forcing the reasoning engine to calculate
+nanoseconds increases the likelihood of hallucination and token waste.
+
+```text {.bad}
+PARAMETERS:
+  start_time_ns (integer) [OPTIONAL]
+      Optional. The start time of the query in nanoseconds since the Unix epoch.
+
+  timeout_ms (integer) [OPTIONAL]
+      Optional. The timeout duration for the job in milliseconds.
+```
+```text {.good}
+PARAMETERS:
+  start_time (string) [OPTIONAL]
+      Optional. The start time of the query in RFC 3339 format
+      (e.g., "2024-04-09T22:52:14Z").
+
+  timeout_in_minutes (integer) [OPTIONAL]
+      Optional. The timeout duration for the job in minutes.
+```
+
+#### Limit Options <!-- priority: p1 -->
+
+Aim for a limit of 5 to 8 parameters per tool, while keeping in mind that fewer
+parameters is usually better. High parameter counts exponentially decreases tool
+call accuracy and can increase token consumption.
+
+#### Use Enums <!-- priority: p1 -->
+
+Prefer iterating a list of possible values over free-text strings. This
+constrains the model's decision path and reduces token perplexity.
+
+```text {.good}
+status: ['open', 'closed']
+```
+
+#### Safe Pagination <!-- priority: p0 -->
+
+If your tool returns lists of values (for example resources): -
+**Cap Page Sizes**: Default `page_size` parameters MUST be capped at an LLM-safe
+limit (e.g., 10 to 20 maximum). Aim to keep the return output size
+below 5,000 to 10,000 tokens to avoid using up too much of an agent's context
+window.
+
+-   **Document Page Tokens**: If exposing a `page_token` parameter, explain
+    exactly how the agent should use it (e.g., *"Pass the `next_page_token`
+    received from the previous response to retrieve the next page."*).
+
+#### Concise Descriptions <!-- priority: p0 -->
+
+While it can be tempting to treat tools like documentation, avoid adding
+redundant information to tool descriptions.
+
+-   **Keep Parameter Docs on the Parameters**: Do not repeat information in the
+    tool description that belongs in parameter descriptions. For example, do not
+    embed parameter definitions, formatting rules, or extraneous section titles
+    (such as "Purpose and Capabilities") within the main tool description. Each
+    parameter has its own dedicated description field in the schema for this
+    information. Stating "The request must provide parameter X to get result Y"
+    in the main description is redundant and wastes tokens.
+
+```text {.good}
+================================================================================
+TOOL: execute_sql
+================================================================================
+DESCRIPTION:
+  Execute any valid SQL statement, including data definition language (DDL),
+  data control language (DCL), data query language (DQL), or
+  data manipulation language (DML) statements, on a Cloud SQL instance.
+
+PARAMETERS:
+  database (string) [OPTIONAL]
+      Optional. Name of the database on which the statement will be executed.
+      For Postgres it's required, for MySQL it's optional. For Postgres, if your
+      query is not scoped to an existings database, like list databases / create
+      new database / grant roles, you can pass in default value as postgres.
+
+  instance (string) [REQUIRED]
+      Required. Database instance ID. This does not include the project_id.
+
+  project_id (string) [REQUIRED]
+      Required. Project ID of the project that contains the instance.
+
+  sql (string) [REQUIRED]
+      Required. SQL statements to run on the database. It can be a single
+      statement or a sequence of statements separated by semicolons.
+```
+
+#### Avoid Conversational Scripting <!-- priority: p0 -->
+
+-   **Use Imperative Phrasing**: You may use imperative phrasing to guide agent
+    actions, including asking for user confirmation on critical or destructive
+    operations (e.g., *"Please explicitly check with the user to ensure they
+    want to delete all data before executing"*). However, do not overly
+    prescribe conversational flow (e.g., *"Send each step as a separate
+    message"*).
+-   **Do not reference capabilities an agent may not have**: For example, do not
+    tell the agent to *"sleep for 10 seconds"* or *"navigate to this URL in a
+    browser"*. While some agents can have these capabilities, asking an agent
+    who does not may confuse or derail them. Use objective constraints instead
+    (e.g., *"Wait at least 10 seconds before polling again"*). Tool
+    descriptions should explain what the tool does, and they should avoid
+    telling the agent to do things it cannot or to hide things that require
+    user interaction.
+
+#### Be Consistent <!-- priority: p0 -->
+
+Use consistent parameter names for the same concept across different tools.
+Decide on one parameter name for project identification (e.g., `project_id`) and
+use it across all tools.
+
+```text {.bad}
+PARAMETERS:
+  project (string) [REQUIRED]
+      Required. ID of the project.
+
+# ...
+PARAMETERS:
+  project (string) [REQUIRED]
+      Required. Name of the project.
+
+```
+
+```text {.good}
+PARAMETERS:
+  instance_id (string) [REQUIRED]
+      Required. Database instance ID.
+
+  project_id (string) [REQUIRED]
+      Required. Project ID of the project that contains the instance.
+```
+
+#### Explicit Consequences <!-- priority: p1 -->
+
+Explicitly state consequences in parameter names. For destructive or high-cost
+operations, parameter names should make consequences clear.
+
+```text {.bad}
+  force (boolean) [OPTIONAL]
+      Optional. Force the deletion.
+```
+
+```text {.good}
+PARAMETERS:
+  confirm_deletion (boolean) [OPTIONAL]
+      Optional. Acknowledge that this operation will permanently delete all
+      clusters and data. This action cannot be undone.
+```
+
+#### Avoid Templated Strings <!-- priority: p0 -->
+
+**DO NOT** require the agent to pass parameters in as templated strings (e.g.,
+`projects/{project_id}/locations/{region}/instances/{resource_name}`).
+
+This creates an additional formatting step for the agent to construct the string
+correctly, which increases the likelihood of hallucination or syntax errors and
+potentially lowers the tool's success rate.
+
+#### Parameter Design Examples
+
+Tool              | Bad Design (Ambiguous/Insecure) | Good Design (Explicit/Secure)
+:---------------- | :------------------------------ | :----------------------------
+`deploy_dag`      | `force: true`                   | `acknowledge_overwrite_of_existing_dag_file: true`
+`cancel_job`      | `immediate: true`               | `termination_policy: "FORCE_CANCEL_WITHOUT_DRAIN"`
+`instance_delete` | `confirm: true`                 | `confirm_permanent_deletion_of_all_clusters_and_data: true`
+`Metadata Lookup` | `show_all: true`                | `include_sensitive_business_metadata: true`
+`batch_create`    | `use_public_ip: true`           | `network_access: "PRIVATE_ONLY"` or `"PUBLIC_INTERNET_ACCESS"`
+`database_delete` | `delete: true`                  | `acknowledge_permanent_database_deletion_and_data_loss: true`
+`execute_sql`     | `allow_large_results: true`     | `acknowledge_potential_high_compute_cost_for_large_results: true`
+`duration`        | `duration: "4h"`                | `requested_access_duration_in_hours: 4`
+`justification`   | `reason: "fixing bug"`          | `detailed_justification_for_elevated_privileges: "b/1234 -- Fixing bug"`
+
+### Long running operations <!-- priority: p1 -->
+
+MCP today does not have a standard for how to do long running operations. We
+expect that our tools will have to evolve as the MCP standard changes.
+
+In the meantime, the standard pattern for handling LROs is:
+
+1.  **Initiation**: An MCP tool call that initiates an LRO (e.g.,
+    `create_instance`) **MUST** return immediately with an operation ID.
+2.  **Polling**: The agent will use a dedicated tool, like `get_operation`, to
+    poll the status of the LRO using the returned operation ID. Teams **MUST**
+    provide such a polling tool.
+3.  **Completion**: The agent will continue polling the tool until the operation
+    reaches a terminal state (e.g., `SUCCEEDED`, `FAILED`).
+
+To ensure agents can effectively follow this pattern, clear descriptions and
+hints **MUST** be provided within the tool descriptions. This guides the agent
+on the expected workflow.
+
+Example tool descriptions:
+
+```text {.bad}
+================================================================================
+TOOL: create_instance
+================================================================================
+DESCRIPTION:
+  Creates a database instance.
+```
+
+```text {.good}
+================================================================================
+TOOL: create_instance
+================================================================================
+DESCRIPTION:
+  Initiates the creation of a Cloud SQL instance.
+
+  * The tool returns a long-running operation. Use the `get_operation` tool to
+  poll its status until the operation completes.
+  * The instance creation operation can take several minutes. Wait at least
+  30 seconds before rechecking the status.
+
+PARAMETERS:
+  name (string) [REQUIRED]
+      Required. Name of the Cloud SQL instance. This does not include the
+      project ID.
+
+  project (string) [REQUIRED]
+      Required. Project ID of the project to which the newly created Cloud SQL
+      instances should belong.
+
+================================================================================
+TOOL: get_operation
+================================================================================
+DESCRIPTION:
+  Retrieves the status of a long-running operation.
+```
+
+## Error Handling
+
+### Actionable Error Messaging <!-- priority: p0 -->
+
+Tools **MUST** return clear and actionable error messages that tell agents what
+went wrong, why it went wrong, and how to fix it. When an operation fails, **DO
+NOT** return generic error messages like "internal error".
+
+For more details on writing good error messages, see
+[Write actionable error messages](https://developers.google.com/workspace/chat/write-error-messages).
+
+Here are some examples of good and bad error messages for tools:
+
+| Tool              | Bad error message   | Good error message                |
+| :---------------- | :------------------ | :-------------------------------- |
+| `create_instance` | `Invalid request`   | `Failed to create instance: The   |
+:                   :                     : instance name 'my-instance' is    :
+:                   :                     : already in use. Please choose a   :
+:                   :                     : different name.`                  :
+| `execute_sql`     | `Permission denied` | `Failed to execute SQL: The       |
+:                   :                     : service account does not have     :
+:                   :                     : permission to access the database :
+:                   :                     : 'my-db'. Please grant 'Cloud SQL  :
+:                   :                     : Client' role to the service       :
+:                   :                     : account.`                         :
+| `delete_resource` | `Internal error`    | `Failed to delete resource: The   |
+:                   :                     : resource is currently in use by   :
+:                   :                     : another operation. Please try     :
+:                   :                     : again in a few minutes.`          :
+
+## Security Best Practices
+
+This section covers security best practices for creating MCP tools. Authoring a
+new MCP tool is an opportunity to make tools that are secure by default.
+
+### Secure by Default <!-- priority: p0 -->
+
+Our APIs **MUST** enforce secure configurations.
+
+*   **Critical Requirement:** All "create" or "update" operations **MUST** apply
+    the most secure configuration by default. For example, any resource that
+    *can* be encrypted *must* be encrypted by default with Google-managed or
+    customer-managed encryption keys. Any resource that *can* be exposed to the
+    public internet *must* default to private. If defaulting to private creates
+    significant user friction (e.g., forcing VPC setup) or represents an
+    irreversible configuration, network configuration may be defined as a
+    required parameter, forcing the agent to request clarification if the user's
+    intent is unclear.
+
+### Tool Descriptions as Embedded Security Guidance <!-- priority: p0 -->
+
+The description field in a tool's manifest is the primary documentation for an
+agent and **MUST** guide the reasoning engine toward correct actions. **DO NOT**
+use field descriptions like "enable_policy" with a boolean value.
+
+| Field                 | Bad Description   | Better Description               |
+| :-------------------- | :---------------- | :------------------------------- |
+| `enable_public_ip`    | Enables public IP | Enabling this flag makes the     |
+:                       :                   : resource accessible from the     :
+:                       :                   : public internet, which is a      :
+:                       :                   : security risk if not properly    :
+:                       :                   : managed. It is recommended to    :
+:                       :                   : keep resources private whenever  :
+:                       :                   : possible.                        :
+| `deletion_protection` | Enables deletion  | Prevents the resource from being |
+:                       : protection        : accidentally deleted.            :
+:                       :                   : Recommended for production       :
+:                       :                   : resources to prevent human       :
+:                       :                   : error.                           :
+
+### Handling Passwords and Secrets <!-- priority: p0 -->
+
+Tools **MUST NOT** surface passwords or credentials in clear-text requests or
+responses. Authentication **MUST** be handled by the underlying transport (e.g.,
+service account credentials). Exposing plaintext password fields in API
+interfaces for AI consumption is a security risk.
+
+*   **Policy:**
+    [Read More](https://docs.google.com/document/d/1D6FGIJTfDlNTwPE00W0jStP6zB4Gbdyy0QSOnewKbPo)
+    {.external}
+*   Teams may allow agents to access references to secrets stored in Secret
+    Manager
+
+### Granular Protection for Sensitive Operations <!-- priority: p0 -->
+
+Many GCP APIs follow a pattern of bundling multiple update operations into a
+single API method (e.g., `resource.update`), controlled by a single IAM
+permission (e.g., `my-service.resources.update`). While efficient, this can
+expose users to extra risk when authorizing AI agents. For example, a user might
+want to grant an agent permission to update resource labels, but not to enable
+public internet access for the resource, which is a high-risk action. If both
+actions are covered by `my-service.resources.update`, the user cannot easily
+grant permission for one but not the other. This prevents users from
+establishing safe boundaries for their agents.
+
+To mitigate this, teams **SHOULD** consider introducing granular protections for
+high-risk actions such as deleting data or exposing a resource to the public
+internet. For example:
+
+*   **Field-level IAM permissions:** For API methods that update multiple
+    fields, consider requiring more specific IAM permissions for changes to
+    high-risk fields. For example, enabling public internet access via a
+    `resource.update` method could require
+    `my-service.resources.enablePublicAccess` in addition to
+    `my-service.resources.update`.
+*   **Multi-party Approval:** For critical operations like deleting a production
+    resource, consider requiring a second independent confirmation from another
+    user before execution.
+
+### Horizontals and Compliance <!-- priority: p0 -->
+
+All MCP tools **MUST** comply with standard Google Cloud horizontal requirements
+including the standard ones:
+
+*   **VPC-SC:** Integration with VPC Service Controls.
+*   **IAM:** Use Cloud IAM for authorization and support IAM Deny policies.
+*   **Audit Logging:** Working `Gin` logging for all tools.
+*   **Access Transparency (AXT/AXA):** Integration for administrative access
+    logs.
diff --git a/datasets/mcp_readability/tools_generator.yaml b/datasets/mcp_readability/tools_generator.yaml
new file mode 100644
index 00000000..4520556e
--- /dev/null
+++ b/datasets/mcp_readability/tools_generator.yaml
@@ -0,0 +1,7 @@
+# Generator that fetches/normalizes each endpoint's tools.yaml.
+# Per-endpoint `tools_source` (in endpoints.yaml) overrides drive actual fetch
+# behavior; these are just defaults for the generator itself.
+generator: mcp_tools
+timeout: 30
+headers: {}
+# auth: google_credentials   # default auth for endpoints that don't set their own
diff --git a/evalbench/evalbench.py b/evalbench/evalbench.py
index 604fde52..bfc786f8 100644
--- a/evalbench/evalbench.py
+++ b/evalbench/evalbench.py
@@ -88,8 +88,13 @@ def eval(experiment_config: str):
         config, db_configs, model_config, setup_config = load_session_configs(session)
         logging.info("Loaded Configurations in %s", experiment_config)
 
-        # Load the dataset
-        dataset = load_dataset_from_json(session["dataset_config"], config)
+        # Load the dataset. Some orchestrators (e.g. mcp_readability) drive their
+        # work from the run config itself rather than a prompt dataset, so
+        # dataset_config is optional; flatten_dataset({}) yields [].
+        if session.get("dataset_config"):
+            dataset = load_dataset_from_json(session["dataset_config"], config)
+        else:
+            dataset = {}
         # Load the evaluator
         evaluator = get_orchestrator(
             config, db_configs, setup_config, report_progress=True
@@ -137,6 +142,17 @@ def eval(experiment_config: str):
             )
             summary_scores_df["job_id"] = job_id
             summary_scores_df["run_time"] = run_time
+        elif config.get("orchestrator") == "mcp_readability":
+            # This orchestrator writes its own compliance CSV in process() and
+            # intentionally returns no NL2SQL results/scores temp files.
+            logging.info(
+                f"Finished MCP readability run for config '{display_config}'."
+            )
+            reporters = []
+            config_df = None
+            results_df = None
+            scores_df = None
+            summary_scores_df = None
         else:
             logging.warning(
                 f"There were no matching evals in run for config '{display_config}'. Returning empty set."
diff --git a/evalbench/evaluator/__init__.py b/evalbench/evaluator/__init__.py
index 4795183b..0374aaa6 100644
--- a/evalbench/evaluator/__init__.py
+++ b/evalbench/evaluator/__init__.py
@@ -8,6 +8,7 @@
 from evaluator.dataengineeringagentorchestrator import (
     DataEngineeringAgentOrchestrator,
 )
+from evaluator.mcp_readability import McpReadabilityOrchestrator
 import logging
 
 
@@ -27,6 +28,10 @@ def get_orchestrator(config, db_configs, setup_config, report_progress=False):
         return DataEngineeringAgentOrchestrator(
             config, db_configs, setup_config, report_progress
         )
+    elif orchestrator_type == "mcp_readability":
+        return McpReadabilityOrchestrator(
+            config, db_configs, setup_config, report_progress
+        )
     else:
         return Orchestrator(config, db_configs, setup_config, report_progress)
 
diff --git a/evalbench/evaluator/mcp_readability/__init__.py b/evalbench/evaluator/mcp_readability/__init__.py
new file mode 100644
index 00000000..bff0cfa9
--- /dev/null
+++ b/evalbench/evaluator/mcp_readability/__init__.py
@@ -0,0 +1,3 @@
+from evaluator.mcp_readability.orchestrator import McpReadabilityOrchestrator
+
+__all__ = ["McpReadabilityOrchestrator"]
diff --git a/evalbench/evaluator/mcp_readability/enums.py b/evalbench/evaluator/mcp_readability/enums.py
new file mode 100644
index 00000000..a91873d7
--- /dev/null
+++ b/evalbench/evaluator/mcp_readability/enums.py
@@ -0,0 +1,89 @@
+"""Enums for the MCP style-guide readability/compliance check.
+
+These mirror proto enums used by the Data Cloud MCP readability tooling
+(e.g. ``cloud.databases.mcp.readability.EndpointType``). They are defined as
+plain Python enums here so the feature is self-contained; values can be aligned
+to the generated proto later without touching call sites. All enums are written
+to the results CSV using their ``.name``.
+"""
+
+from enum import Enum
+
+
+class EndpointType(Enum):
+    """How the MCP server is reached.
+
+    Mirrors ``cloud.databases.mcp.readability.EndpointType``.
+    """
+
+    ENDPOINT_TYPE_UNSPECIFIED = 0
+    REMOTE = 1
+    LOCAL = 2
+
+
+class Environment(Enum):
+    """Release channel / deployment environment of an endpoint.
+
+    Distinct from :class:`EndpointType`; used purely to slice / filter results
+    (e.g. only check ``PROD`` endpoints).
+    """
+
+    ENVIRONMENT_UNSPECIFIED = 0
+    PROD = 1
+    AUTOPUSH = 2
+    STAGING = 3
+    DEV = 4
+
+
+class CheckStatus(Enum):
+    """Whether the compliance check *ran* successfully (not its findings).
+
+    Compliance results are captured separately via the p0/p1/p2 issue counts and
+    compliance_score; this status only reflects whether the eval completed.
+
+    - ``SUCCESS``: the eval ran end-to-end.
+    - ``FETCH_ERROR``: failed to retrieve tools data from the endpoint.
+    - ``ANALYSIS_ERROR``: error during LLM analysis or result parsing.
+    - ``INTERNAL_ERROR``: other script/system error.
+    """
+
+    CHECK_STATUS_UNSPECIFIED = 0
+    SUCCESS = 1
+    FETCH_ERROR = 2
+    ANALYSIS_ERROR = 3
+    INTERNAL_ERROR = 4
+
+
+def _coerce(enum_cls, value, default):
+    """Coerce a config value (str / int / enum / None) into ``enum_cls``.
+
+    Unknown values fall back to ``default`` rather than raising, so a typo in a
+    config file degrades gracefully instead of aborting the whole run.
+    """
+    if value is None:
+        return default
+    if isinstance(value, enum_cls):
+        return value
+    if isinstance(value, int):
+        try:
+            return enum_cls(value)
+        except ValueError:
+            return default
+    name = str(value).strip().upper()
+    # Allow both short ("REMOTE") and fully-qualified ("ENDPOINT_TYPE_REMOTE")
+    # spellings, plus the raw member name.
+    members = enum_cls.__members__
+    if name in members:
+        return members[name]
+    for member_name, member in members.items():
+        if member_name.endswith("_" + name):
+            return member
+    return default
+
+
+def coerce_endpoint_type(value) -> EndpointType:
+    return _coerce(EndpointType, value, EndpointType.ENDPOINT_TYPE_UNSPECIFIED)
+
+
+def coerce_environment(value) -> Environment:
+    return _coerce(Environment, value, Environment.ENVIRONMENT_UNSPECIFIED)
diff --git a/evalbench/evaluator/mcp_readability/exceptions.py b/evalbench/evaluator/mcp_readability/exceptions.py
new file mode 100644
index 00000000..1aab8b55
--- /dev/null
+++ b/evalbench/evaluator/mcp_readability/exceptions.py
@@ -0,0 +1,71 @@
+"""Loading and matching of per-endpoint style-rule exceptions (waivers).
+
+An exceptions file lets an operator waive a specific style requirement for a
+specific endpoint when it legitimately cannot comply. Waived rules are passed to
+the scorer so they are excluded from the P0/P1/P2 counts and surfaced separately
+in the feedback.
+
+File schema (see ``datasets/mcp_readability/exceptions.yaml``)::
+
+    exceptions:
+      - endpoint_url: "https://.../mcp"   # any of url / product_name / environment
+        rule_id: "P1-TOOL-DESC-LENGTH"
+        reason: "..."
+      - environment: AUTOPUSH             # "*" or omitted field = match-all
+        rule_id: "P2-EXAMPLE-USAGE"
+        reason: "..."
+"""
+
+import logging
+from util.config import load_yaml_config
+
+
+def load_exceptions(path: str) -> list[dict]:
+    """Load the exceptions list from a YAML file. Missing/empty -> []."""
+    if not path:
+        return []
+    parsed = load_yaml_config(path)
+    if not parsed:
+        return []
+    exceptions = parsed.get("exceptions") or []
+    if not isinstance(exceptions, list):
+        logging.warning(
+            "mcp_readability: 'exceptions' in %s is not a list; ignoring.", path
+        )
+        return []
+    return exceptions
+
+
+def _matches(field_value, exception_value) -> bool:
+    """A matcher field matches when it is absent, '*', or equal (case-insensitive)."""
+    if exception_value is None or exception_value == "*":
+        return True
+    if field_value is None:
+        return False
+    return str(field_value).strip().lower() == str(exception_value).strip().lower()
+
+
+def applicable_exceptions(endpoint: dict, all_exceptions: list[dict]) -> list[dict]:
+    """Return exceptions whose matchers all apply to ``endpoint``.
+
+    Matchers considered: ``endpoint_url``, ``product_name``, ``environment``.
+    Each exception keeps its ``rule_id`` and ``reason`` for the scorer prompt.
+    """
+    matched = []
+    for exc in all_exceptions:
+        if not isinstance(exc, dict):
+            continue
+        if not exc.get("rule_id"):
+            continue
+        if (
+            _matches(endpoint.get("endpoint_url"), exc.get("endpoint_url"))
+            and _matches(endpoint.get("product_name"), exc.get("product_name"))
+            and _matches(endpoint.get("environment"), exc.get("environment"))
+        ):
+            matched.append(
+                {
+                    "rule_id": exc.get("rule_id"),
+                    "reason": exc.get("reason", ""),
+                }
+            )
+    return matched
diff --git a/evalbench/evaluator/mcp_readability/orchestrator.py b/evalbench/evaluator/mcp_readability/orchestrator.py
new file mode 100644
index 00000000..bdb066db
--- /dev/null
+++ b/evalbench/evaluator/mcp_readability/orchestrator.py
@@ -0,0 +1,334 @@
+"""Orchestrator for the MCP style-guide compliance (readability) check.
+
+Flow per endpoint:
+  1. The ``mcp_tools`` *generator* fetches/normalizes the endpoint's tools.yaml.
+  2. Token usage is estimated against the endpoint's token budget.
+  3. Applicable exceptions + any previous feedback are gathered.
+  4. The ``mcp_style_compliance`` *scorer* (LLM) evaluates tools.yaml vs the
+     style guide and returns issue counts / score / feedback.
+  5. A result row is assembled with the full metric schema.
+
+Results are written to a dedicated compliance CSV. This orchestrator does not
+use the NL2SQL dataset/scores pipeline, so :meth:`process` returns ``None`` for
+the results/scores temp files, which makes the standard EvalBench reporting path
+skip cleanly.
+"""
+
+import concurrent.futures
+import datetime
+import json
+import logging
+import threading
+
+import pandas as pd
+
+from evaluator.orchestrator import Orchestrator
+from generators.models import get_generator
+from scorers.mcp_style_compliance import McpStyleComplianceScorer
+from reporting.mcp_readability_csv import COLUMNS as mcp_csv_columns
+from reporting.mcp_readability_csv import write_compliance_csv
+from util.config import load_yaml_config
+
+from evaluator.mcp_readability.enums import (
+    CheckStatus,
+    coerce_endpoint_type,
+    coerce_environment,
+)
+from evaluator.mcp_readability import exceptions as exceptions_mod
+from evaluator.mcp_readability.token_estimator import (
+    token_budget_used_percent,
+)
+
+
+class McpReadabilityOrchestrator(Orchestrator):
+    """Checks a list of MCP endpoints against the Data Cloud MCP style guide."""
+
+    def __init__(self, config, db_configs, setup_config, report_progress=False):
+        super().__init__(config, db_configs, setup_config, report_progress)
+
+        runner_config = config.get("runners", {}) or {}
+        self.endpoint_runners = runner_config.get("endpoint_runners", 4)
+
+        # Load endpoints + shared defaults.
+        endpoints_config = config.get("endpoints_config")
+        if not endpoints_config:
+            raise ValueError("mcp_readability requires 'endpoints_config'")
+        parsed = load_yaml_config(endpoints_config) or {}
+        self.defaults = parsed.get("defaults") or {}
+        self.endpoints = parsed.get("endpoints") or []
+        if not self.endpoints:
+            logging.warning(
+                "mcp_readability: no endpoints found in %s", endpoints_config
+            )
+
+        # Style guide text (fed verbatim into the scorer prompt).
+        self.style_guide = self._read_text(config.get("style_guide"))
+
+        # Exceptions (waivers).
+        self.all_exceptions = exceptions_mod.load_exceptions(
+            config.get("exceptions_config")
+        )
+
+        # Default token budget + environment filter.
+        self.default_token_budget = config.get("token_budget", 0) or 0
+        env_filter = config.get("environments") or []
+        self.environment_filter = {str(e).strip().upper() for e in env_filter}
+
+        # Previous feedback for run-to-run consistency.
+        self.previous_feedback = self._load_previous_feedback(
+            config.get("previous_results_csv")
+        )
+
+        # Shared model registry for get_generator (lock + cache), as used by
+        # the standard orchestrators.
+        self.global_models = {
+            "lock": threading.Lock(),
+            "registered_models": {},
+        }
+
+        # Tools generator (the "system under test" fetcher).
+        tools_generator_config = config.get("tools_generator_config")
+        if not tools_generator_config:
+            raise ValueError("mcp_readability requires 'tools_generator_config'")
+        self.tools_generator = get_generator(
+            self.global_models, tools_generator_config
+        )
+
+        # Compliance scorer (LLM).
+        scorer_config = (config.get("scorers") or {}).get(
+            "mcp_style_compliance"
+        ) or {}
+        self.scorer = McpStyleComplianceScorer(scorer_config, self.global_models)
+
+        self.rows = []
+
+    # ------------------------------------------------------------------
+    # Public lifecycle
+    # ------------------------------------------------------------------
+    def evaluate(self, dataset=None):
+        """Run the compliance check for every (filtered) endpoint."""
+        endpoints = self._filtered_endpoints()
+        if not endpoints:
+            logging.warning("mcp_readability: no endpoints to check after filtering.")
+            self.rows = []
+            return
+
+        workers = max(1, int(self.endpoint_runners))
+        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = {
+                pool.submit(self._check_endpoint, ep): ep for ep in endpoints
+            }
+            rows = []
+            for future in concurrent.futures.as_completed(futures):
+                rows.append(future.result())
+
+        # Keep deterministic ordering (by product_name then url).
+        rows.sort(key=lambda r: (r.get("product_name", ""), r.get("endpoint_url", "")))
+        self.rows = rows
+        if self.report_progress:
+            logging.info("mcp_readability: checked %d endpoints.", len(rows))
+
+    def process(self):
+        reporting = self.config.get("reporting") or {}
+        output_directory = (reporting.get("csv") or {}).get(
+            "output_directory", "results"
+        )
+        try:
+            write_compliance_csv(self.rows, output_directory, self.job_id)
+        except Exception:
+            logging.exception("mcp_readability: failed to write compliance CSV")
+
+        # Optionally append the same rows to the existing eval BigQuery table.
+        if reporting.get("bigquery"):
+            self._write_bigquery(reporting["bigquery"])
+
+        # Return Nones so the standard NL2SQL reporting path is skipped.
+        return (self.job_id, self.run_time, None, None, None)
+
+    def _write_bigquery(self, bq_config):
+        """Append compliance rows to the standard eval BigQuery table.
+
+        Reuses the shared :class:`BigQueryReporter` so rows land in the existing
+        ``<project>.evalbench.results`` table (schema auto-evolves to add the
+        compliance columns); no dedicated dataset/table is created.
+        """
+        if not self.rows:
+            return
+        try:
+            from reporting.bqstore import BigQueryReporter
+            from reporting.report import STORETYPE
+
+            df = pd.DataFrame(self.rows, columns=mcp_csv_columns)
+            df["job_id"] = self.job_id
+            reporter = BigQueryReporter(bq_config, self.job_id, self.run_time)
+            reporter.store(df, STORETYPE.EVALS)
+        except Exception:
+            logging.exception("mcp_readability: failed to write BigQuery results")
+
+    # ------------------------------------------------------------------
+    # Per-endpoint work
+    # ------------------------------------------------------------------
+    def _check_endpoint(self, endpoint: dict) -> dict:
+        product_name = endpoint.get("product_name", "")
+        endpoint_url = endpoint.get("endpoint_url", "")
+        endpoint_type = coerce_endpoint_type(self._ep_value(endpoint, "endpoint_type"))
+        environment = coerce_environment(self._ep_value(endpoint, "environment"))
+        token_budget = self._ep_value(
+            endpoint, "token_budget", self.default_token_budget
+        )
+
+        row = self._base_row(
+            product_name, endpoint_url, endpoint_type, environment
+        )
+        try:
+            # 1. Fetch tools + render man-page markup. Failure here is a
+            #    FETCH_ERROR.
+            try:
+                tools, man_page = self.tools_generator.fetch_tools(
+                    endpoint, self.defaults
+                )
+            except Exception as e:
+                logging.exception(
+                    "mcp_readability: fetch failed for %s (%s)",
+                    product_name,
+                    endpoint_url,
+                )
+                row["check_status"] = CheckStatus.FETCH_ERROR.name
+                row["error_message"] = f"{type(e).__name__}: {e}"
+                return row
+
+            # 2. Token estimate / tool count (recorded even if analysis fails).
+            #    Metrics come from the man-page formatter; the percentage is
+            #    computed against the configured per-endpoint budget.
+            est_tokens = man_page.estimated_tokens
+            row["total_tools"] = man_page.total_tools
+            row["estimated_tokens"] = est_tokens
+            row["token_budget_used_percent"] = token_budget_used_percent(
+                est_tokens, token_budget
+            )
+
+            # 3. Exceptions + previous feedback for this endpoint.
+            applicable = exceptions_mod.applicable_exceptions(
+                endpoint, self.all_exceptions
+            )
+            prev = self.previous_feedback.get(endpoint_url)
+
+            # 4. Compliance evaluation. Failure here is an ANALYSIS_ERROR.
+            try:
+                feedback = self.scorer.evaluate(
+                    tools_markup=man_page.man_page,
+                    style_guide=self.style_guide,
+                    product_name=product_name,
+                    exceptions=applicable,
+                    previous_feedback=prev,
+                )
+            except Exception as e:
+                logging.exception(
+                    "mcp_readability: analysis failed for %s (%s)",
+                    product_name,
+                    endpoint_url,
+                )
+                row["check_status"] = CheckStatus.ANALYSIS_ERROR.name
+                row["error_message"] = f"{type(e).__name__}: {e}"
+                return row
+
+            # 5. Success: the eval ran. Compliance findings are recorded as
+            #    metrics independent of this status.
+            row.update(
+                {
+                    "check_status": CheckStatus.SUCCESS.name,
+                    "p0_issues": int(feedback.get("p0_issues", 0)),
+                    "p1_issues": int(feedback.get("p1_issues", 0)),
+                    "p2_issues": int(feedback.get("p2_issues", 0)),
+                    "compliance_score": int(feedback.get("compliance_score", 0)),
+                    "llm_feedback_json": json.dumps(feedback),
+                    "llm_feedback_html": self.scorer.to_html(feedback),
+                    "error_message": "",
+                }
+            )
+        except Exception as e:
+            logging.exception(
+                "mcp_readability: internal error for %s (%s)",
+                product_name,
+                endpoint_url,
+            )
+            row["check_status"] = CheckStatus.INTERNAL_ERROR.name
+            row["error_message"] = f"{type(e).__name__}: {e}"
+        return row
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+    def _ep_value(self, endpoint: dict, key: str, fallback=None):
+        """Endpoint value, falling back to the shared ``defaults`` then a literal."""
+        if key in endpoint:
+            return endpoint.get(key)
+        if key in self.defaults:
+            return self.defaults.get(key)
+        return fallback
+
+    def _filtered_endpoints(self) -> list[dict]:
+        if not self.environment_filter:
+            return list(self.endpoints)
+        kept = []
+        for ep in self.endpoints:
+            env = coerce_environment(self._ep_value(ep, "environment")).name
+            if env in self.environment_filter:
+                kept.append(ep)
+        return kept
+
+    @staticmethod
+    def _base_row(product_name, endpoint_url, endpoint_type, environment) -> dict:
+        return {
+            "product_name": product_name,
+            "endpoint_url": endpoint_url,
+            "endpoint_type": endpoint_type.name,
+            "environment": environment.name,
+            "check_timestamp": datetime.datetime.now().isoformat(),
+            "check_status": "",
+            "p0_issues": 0,
+            "p1_issues": 0,
+            "p2_issues": 0,
+            "total_tools": 0,
+            "estimated_tokens": 0,
+            "token_budget_used_percent": 0.0,
+            "compliance_score": 0,
+            "llm_feedback_json": "",
+            "llm_feedback_html": "",
+            "error_message": "",
+        }
+
+    @staticmethod
+    def _read_text(path) -> str:
+        if not path:
+            return ""
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                return f.read()
+        except OSError as e:
+            logging.warning("mcp_readability: could not read style guide %s: %s", path, e)
+            return ""
+
+    @staticmethod
+    def _load_previous_feedback(csv_path) -> dict:
+        """Map endpoint_url -> prior llm_feedback_json from a previous run CSV."""
+        if not csv_path:
+            return {}
+        try:
+            df = pd.read_csv(csv_path)
+        except Exception as e:
+            logging.warning(
+                "mcp_readability: could not read previous_results_csv %s: %s",
+                csv_path,
+                e,
+            )
+            return {}
+        if "endpoint_url" not in df.columns or "llm_feedback_json" not in df.columns:
+            return {}
+        mapping = {}
+        for _, r in df.iterrows():
+            url = r.get("endpoint_url")
+            fb = r.get("llm_feedback_json")
+            if isinstance(url, str) and isinstance(fb, str) and fb:
+                mapping[url] = fb
+        return mapping
diff --git a/evalbench/evaluator/mcp_readability/token_estimator.py b/evalbench/evaluator/mcp_readability/token_estimator.py
new file mode 100644
index 00000000..60acaf7b
--- /dev/null
+++ b/evalbench/evaluator/mcp_readability/token_estimator.py
@@ -0,0 +1,33 @@
+"""Token estimation for an MCP tools spec.
+
+Uses a lightweight heuristic so the feature carries no extra dependency. The
+estimate intentionally errs slightly high (tool schemas are JSON-ish and
+tokenize denser than prose). If exact counts are needed, swap ``estimate_tokens``
+for ``genai_client.models.count_tokens(...)`` — the call sites only depend on the
+function signature.
+"""
+
+import math
+
+
+def estimate_tokens(text: str) -> int:
+    """Estimate the number of LLM tokens in ``text``.
+
+    Heuristic: roughly 4 characters per token, with a small floor so a non-empty
+    spec never estimates to zero.
+    """
+    if not text:
+        return 0
+    # ~4 chars/token is the common rule of thumb for English + JSON.
+    return max(1, math.ceil(len(text) / 4))
+
+
+def token_budget_used_percent(estimated_tokens: int, token_budget: int) -> float:
+    """Percent of the token budget consumed, rounded to 2 decimals.
+
+    Returns ``0.0`` when the budget is missing or non-positive to avoid
+    divide-by-zero blowing up an otherwise-successful check.
+    """
+    if not token_budget or token_budget <= 0:
+        return 0.0
+    return round(100.0 * estimated_tokens / token_budget, 2)
diff --git a/evalbench/generators/models/__init__.py b/evalbench/generators/models/__init__.py
index 0e7d187a..fa6a1459 100644
--- a/evalbench/generators/models/__init__.py
+++ b/evalbench/generators/models/__init__.py
@@ -12,6 +12,7 @@
 from .codex_cli import CodexCliGenerator
 from .gcp_data_engineering_agent import DataEngineeringAgentGenerator
 from .agy_cli import AgyCliGenerator
+from .mcp_tools import McpToolsGenerator
 from util.config import load_yaml_config
 
 
@@ -36,6 +37,7 @@ def get_generator(global_models, model_config_path: str, db: DB = None):
             "codex_cli": lambda: CodexCliGenerator(config),
             "data_engineering_agent": lambda: DataEngineeringAgentGenerator(config),
             "agy_cli": lambda: AgyCliGenerator(config),
+            "mcp_tools": lambda: McpToolsGenerator(config),
         }
         generator = config["generator"]
         if generator not in generators:
diff --git a/evalbench/generators/models/mcp_tool_formatter.py b/evalbench/generators/models/mcp_tool_formatter.py
new file mode 100644
index 00000000..885d2a0e
--- /dev/null
+++ b/evalbench/generators/models/mcp_tool_formatter.py
@@ -0,0 +1,341 @@
+"""Formatting utilities for MCP tools (man-page style markup).
+
+Renders a sequence of MCP tools into a human-readable "man page" that is fed to
+the style-guide compliance scorer. This mirrors the production-tested
+``tool_formatter`` used by the Data Cloud MCP readability tooling: it resolves
+JSON-Schema ``$ref``/``$defs``, flattens nested parameters into dotted paths,
+marks REQUIRED/OPTIONAL, and computes token/budget metrics in the same pass.
+
+Beyond the original, this version also renders structured ``enum`` and
+``default`` values so style rules that key off them (e.g. "Use Enums") are
+judgeable from the markup.
+"""
+
+from collections.abc import Mapping, Sequence, Set
+import dataclasses
+import json
+import textwrap
+from typing import Any
+
+from mcp import types as mcp_types
+
+
+# Default token budget for a single tool manifest. Overridable per call so the
+# orchestrator can pass the configured per-endpoint budget.
+_DEFAULT_TOKEN_BUDGET = 200_000
+
+
+@dataclasses.dataclass(frozen=True)
+class ManPageResult:
+    """Result of formatting tools to a man-page style string.
+
+    Attributes:
+      man_page: The formatted man-page style string representing the tools.
+      total_tools: The total number of tools.
+      estimated_tokens: The total estimated number of tokens consumed by the
+        tools.
+      budget_pct: The percentage of the token budget used by the tools.
+    """
+
+    man_page: str
+    total_tools: int
+    estimated_tokens: int
+    budget_pct: float
+
+
+def _estimate_tokens(tool: mcp_types.Tool) -> int:
+    """Estimates the number of tokens a tool consumes."""
+    # A rough estimate is length of JSON representation / 4.
+    tool_dict = {
+        "name": tool.name,
+        "description": tool.description,
+        "inputSchema": tool.inputSchema,
+    }
+    json_str = json.dumps(tool_dict, default=str)
+    return max(1, len(json_str) // 4)
+
+
+def _wrap_text(text: str, indent: str) -> list[str]:
+    """Wraps text preserving empty lines and applying indentation."""
+    wrapped = []
+    for line in (text or "").splitlines():
+        line = line.strip()
+        if not line:
+            if wrapped:  # Don't add leading empty lines.
+                wrapped.append("")
+        else:
+            wrapped.extend(
+                textwrap.wrap(
+                    line, width=80, initial_indent=indent, subsequent_indent=indent
+                )
+            )
+    while wrapped and not wrapped[-1]:
+        wrapped.pop()
+    return wrapped
+
+
+def _value_indent(level: int) -> str:
+    """Indent used for sub-lines (description/enum/default) at a given level."""
+    if level == 0:
+        return "      "
+    if level == 1:
+        return "         "
+    return "    " + "   " * level + "  "
+
+
+def _format_enum_default(schema: Mapping[str, Any], level: int) -> list[str]:
+    """Render ``enum`` / ``default`` values for a property, if present."""
+    lines: list[str] = []
+    indent = _value_indent(level)
+    if "enum" in schema and isinstance(schema["enum"], (list, tuple)):
+        values = ", ".join(json.dumps(v, default=str) for v in schema["enum"])
+        lines.extend(_wrap_text(f"enum: [{values}]", indent))
+    if "default" in schema:
+        lines.extend(
+            _wrap_text(f"default: {json.dumps(schema['default'], default=str)}", indent)
+        )
+    return lines
+
+
+def _resolve_ref(
+    schema: Mapping[str, Any], defs: Mapping[str, Any]
+) -> tuple[Mapping[str, Any], str | None]:
+    """Resolves JSON schema references.
+
+    Args:
+      schema: The current JSON schema dictionary.
+      defs: A dictionary of schema definitions where '$ref' can be resolved.
+
+    Returns:
+      A tuple (resolved_schema, ref_name), where `resolved_schema` is the
+      resolved schema dictionary (or the original schema if no '$ref' is present
+      or resolvable), and `ref_name` is the name of the resolved reference (e.g.,
+      the part after '#/$defs/'), or None if no reference was resolved.
+    """
+    if "$ref" in schema:
+        ref_path = schema["$ref"]
+        if ref_path.startswith("#/$defs/"):
+            ref_name = ref_path.split("/")[-1]
+            if ref_name in defs:
+                return defs[ref_name], ref_name
+    return schema, None
+
+
+def _format_type(schema: Mapping[str, Any]) -> str:
+    """Formats a JSON schema type representation."""
+    schema_type = schema.get("type", "any")
+    if isinstance(schema_type, list):
+        return " | ".join(str(t) for t in schema_type)
+    return str(schema_type)
+
+
+def _format_header(
+    prop_name: str,
+    prop_type: str,
+    req_str: str,
+    desc: str,
+    full_path: str,
+    level: int,
+) -> list[str]:
+    """Formats a property's header line and description based on nesting level."""
+    lines = []
+    if level == 0:
+        lines.append(f"  {prop_name} ({prop_type}) [{req_str}]")
+        if desc:
+            lines.extend(_wrap_text(desc, "      "))
+    elif level == 1:
+        lines.append(f"      -> {full_path} ({prop_type}) [{req_str}]")
+        if desc:
+            lines.extend(_wrap_text(desc, "         "))
+    else:
+        indent = "    " + "   " * level
+        lines.append(f"{indent}* {prop_name} ({prop_type})")
+        if desc:
+            lines.extend(_wrap_text(desc, indent + "  "))
+    return lines
+
+
+def _format_recursive(level: int) -> list[str]:
+    """Formats a recursive reference notice for array items at the given level."""
+    desc = "[Recursive Reference]"
+    if level == 0:
+        return _wrap_text(desc, "      ")
+    if level == 1:
+        return _wrap_text(desc, "         ")
+    indent = "    " + "   " * (level + 1)
+    return _wrap_text(desc, indent + "  ")
+
+
+def _format_obj(
+    schema: Mapping[str, Any],
+    defs: Mapping[str, Any],
+    full_path: str,
+    level: int,
+    visited: Set[str],
+) -> list[str]:
+    """Formats an object property by recursively traversing its children."""
+    lines = _format_props(
+        schema,
+        defs=defs,
+        path_prefix=full_path,
+        level=level + 1,
+        visited=visited,
+    )
+    if lines and lines[-1]:
+        lines.append("")
+    return lines
+
+
+def _format_array(
+    schema: Mapping[str, Any],
+    defs: Mapping[str, Any],
+    full_path: str,
+    level: int,
+    visited: Set[str],
+) -> list[str]:
+    """Formats an array property by traversing its items schema."""
+    # Resolve the items schema to check if it points to a reference definition.
+    schema, ref = _resolve_ref(schema["items"], defs)
+    is_recursive = False
+    if ref:
+        # Check if this reference name has already been encountered higher up in
+        # the current traversal path to prevent infinite recursion.
+        if ref in visited:
+            is_recursive = True
+        else:
+            visited = visited | {ref}
+
+    if not is_recursive and schema.get("type") == "object":
+        return _format_props(
+            schema,
+            defs=defs,
+            path_prefix=full_path + "[]",
+            level=level + 1,
+            visited=visited,
+        )
+    if is_recursive:
+        return _format_recursive(level)
+    return []
+
+
+def _format_props(
+    schema: Mapping[str, Any],
+    *,
+    defs: Mapping[str, Any] | None = None,
+    path_prefix: str = "",
+    level: int = 0,
+    visited: Set[str] | None = None,
+) -> list[str]:
+    """Recursively formats JSON schema properties into man-page parameter lines."""
+    lines = []
+    if defs is None:
+        defs = schema.get("$defs", {}) or schema.get("definitions", {})
+    visited = visited if visited is not None else set()
+
+    schema, _ = _resolve_ref(schema, defs)
+    properties = schema.get("properties", {})
+    required_fields = set(schema.get("required", []))
+
+    for prop_name, prop_schema in properties.items():
+        prop_schema, ref_name = _resolve_ref(prop_schema, defs)
+        is_recursive = ref_name in visited if ref_name else False
+        prop_visited = (
+            visited | {ref_name} if ref_name and not is_recursive else visited
+        )
+
+        is_required = prop_name in required_fields
+        req_str = "REQUIRED" if is_required else "OPTIONAL"
+        prop_type = _format_type(prop_schema)
+
+        desc = prop_schema.get("description", "").strip()
+        if is_recursive:
+            desc = f"[Recursive Reference] {desc}".strip()
+
+        full_path = f"{path_prefix}.{prop_name}" if path_prefix else prop_name
+
+        lines.extend(
+            _format_header(prop_name, prop_type, req_str, desc, full_path, level)
+        )
+        lines.extend(_format_enum_default(prop_schema, level))
+
+        if not is_recursive:
+            if prop_type == "object" and "properties" in prop_schema:
+                lines.extend(
+                    _format_obj(prop_schema, defs, full_path, level, prop_visited)
+                )
+            elif prop_type == "array" and "items" in prop_schema:
+                lines.extend(
+                    _format_array(prop_schema, defs, full_path, level, prop_visited)
+                )
+
+        if level == 0:
+            if lines and lines[-1]:
+                lines.append("")
+
+    return lines
+
+
+def format_tools_to_man_page(
+    tools: Sequence[mcp_types.Tool], token_budget: int = _DEFAULT_TOKEN_BUDGET
+) -> ManPageResult:
+    """Formats a sequence of MCP tools into a man-page style string.
+
+    Args:
+      tools: Sequence of mcp.types.Tool objects.
+      token_budget: Token budget used to compute the budget percentage.
+
+    Returns:
+      A ManPageResult with the formatted markup, tool count, estimated tokens,
+      and percentage of the token budget used.
+    """
+    budget = token_budget if token_budget and token_budget > 0 else _DEFAULT_TOKEN_BUDGET
+
+    if not tools:
+        return ManPageResult(
+            man_page="No tools available.",
+            total_tools=0,
+            estimated_tokens=0,
+            budget_pct=0.0,
+        )
+
+    lines = []
+
+    tool_tokens = {tool.name: _estimate_tokens(tool) for tool in tools}
+    estimated_tokens = sum(tool_tokens.values())
+    total_tools = len(tools)
+    budget_pct = (estimated_tokens / budget) * 100
+
+    lines.append("=" * 80)
+    lines.append("SUMMARY")
+    lines.append("=" * 80)
+    lines.append(f"Total Tools: {total_tools}")
+    lines.append(f"Total Estimated Tokens: {estimated_tokens}")
+    lines.append(f"Budget Used: {budget_pct:.2f}% of {budget} tokens")
+    lines.append("\nTools:")
+    for name, tokens in tool_tokens.items():
+        pct_of_window = (tokens / budget) * 100
+        lines.append(f"  - {name} ({tokens} tokens, {pct_of_window:.2f}% of window)")
+    lines.append("")
+
+    for tool in tools:
+        lines.append("=" * 80)
+        lines.append(f"TOOL: {tool.name}")
+        lines.append("=" * 80)
+        lines.append("DESCRIPTION:")
+
+        desc = tool.description or "No description provided."
+        lines.extend(_wrap_text(desc, "  "))
+
+        lines.append("\nPARAMETERS:")
+        if tool.inputSchema and tool.inputSchema.get("properties"):
+            schema_lines = _format_props(tool.inputSchema)
+            lines.extend(schema_lines)
+        else:
+            lines.append("  None\n")
+
+    return ManPageResult(
+        man_page="\n".join(lines).strip(),
+        total_tools=total_tools,
+        estimated_tokens=estimated_tokens,
+        budget_pct=budget_pct,
+    )
diff --git a/evalbench/generators/models/mcp_tools.py b/evalbench/generators/models/mcp_tools.py
new file mode 100644
index 00000000..ae419fb1
--- /dev/null
+++ b/evalbench/generators/models/mcp_tools.py
@@ -0,0 +1,229 @@
+"""Generator that fetches an MCP endpoint's tools and renders man-page markup.
+
+This plays the EvalBench *generator* role for the MCP readability check: instead
+of asking a model to produce an artifact, it fetches the tool listing from the
+"system under test" (an MCP endpoint) and renders the production-tested man-page
+markup that the compliance *scorer* then evaluates against the style guide.
+
+The source is pluggable per endpoint via ``tools_source.type``:
+  - ``mcp``:  live MCP ``tools/list`` over Streamable HTTP using the official
+    ``mcp`` Python SDK. Requires network access (and usually auth).
+  - ``file``: read a local YAML/JSON tools spec (dependency-free; great for tests
+    and for offline / deterministic runs).
+
+Fetching, auth, and URL normalization mirror the tested Data Cloud MCP
+readability tooling (``mcp_utils``): ADC bearer + ``x-goog-user-project`` headers
+and a ``/mcp`` suffix convention.
+"""
+
+import functools
+import logging
+import os
+
+import anyio
+import httpx
+
+from mcp import types as mcp_types
+from mcp.client import session as mcp_session
+from mcp.client.streamable_http import streamable_http_client
+
+from .generator import QueryGenerator
+from .mcp_tool_formatter import ManPageResult, format_tools_to_man_page
+from util.config import load_yaml_config
+
+
+class McpToolsError(Exception):
+    """Raised when a tools spec cannot be fetched or parsed."""
+
+
+class McpToolsGenerator(QueryGenerator):
+    """Fetches an MCP endpoint's tools and renders them as man-page markup."""
+
+    def __init__(self, querygenerator_config):
+        super().__init__(querygenerator_config)
+        self.name = "mcp_tools"
+        cfg = querygenerator_config or {}
+        self.timeout = cfg.get("timeout", 30)
+        self.default_headers = cfg.get("headers") or {}
+        # Optional default auth mode applied when an endpoint doesn't specify one.
+        self.default_auth = cfg.get("auth")
+
+    # The abstract base requires generate_internal; the orchestrator calls
+    # fetch_tools directly, but we keep this so the class is a valid generator.
+    def generate_internal(self, prompt):
+        if isinstance(prompt, dict):
+            _, result = self.fetch_tools(prompt, {})
+            return result.man_page
+        return ""
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    def fetch_tools(self, endpoint: dict, defaults: dict):
+        """Fetch tools for ``endpoint`` and render man-page markup.
+
+        Returns ``(tools, man_page_result)`` where ``tools`` is a list of
+        ``mcp.types.Tool`` and ``man_page_result`` is a
+        :class:`ManPageResult` (markup + token/tool metrics).
+        """
+        source = self._resolve_source(endpoint, defaults)
+        source_type = (source.get("type") or "mcp").lower()
+
+        if source_type == "file":
+            tools = self._from_file(source)
+        elif source_type == "mcp":
+            tools = self._from_mcp(source, endpoint)
+        else:
+            raise McpToolsError(f"Unknown tools_source.type '{source_type}'")
+
+        result = format_tools_to_man_page(tools)
+        return tools, result
+
+    # ------------------------------------------------------------------
+    # Source resolution / auth
+    # ------------------------------------------------------------------
+    def _resolve_source(self, endpoint: dict, defaults: dict) -> dict:
+        """Merge defaults.tools_source with the endpoint's tools_source."""
+        merged = dict((defaults or {}).get("tools_source") or {})
+        merged.update(endpoint.get("tools_source") or {})
+        return merged
+
+    def _auth_headers(self, source: dict) -> dict:
+        """Build request headers: defaults + source + ADC bearer + project."""
+        headers = dict(self.default_headers)
+        headers.update(source.get("headers") or {})
+        project = source.get("project")
+        auth = source.get("auth", self.default_auth)
+        if auth == "google_credentials":
+            token, adc_project = self._google_credentials()
+            if token:
+                headers["Authorization"] = f"Bearer {token}"
+            if adc_project and "x-goog-user-project" not in headers:
+                headers["x-goog-user-project"] = adc_project
+        if project:
+            headers["x-goog-user-project"] = project
+        return headers
+
+    @staticmethod
+    def _google_credentials():
+        """Best-effort ADC ``(token, project)``; returns ``(None, None)`` if N/A."""
+        try:
+            import google.auth
+            from google.auth.transport.requests import Request
+
+            creds, project = google.auth.default(
+                scopes=["https://www.googleapis.com/auth/cloud-platform"]
+            )
+            creds.refresh(Request())
+            return creds.token, project
+        except Exception as e:  # pragma: no cover - depends on environment
+            logging.warning("mcp_tools: could not obtain Google credentials: %s", e)
+            return None, None
+
+    @staticmethod
+    def sanitize_url(url: str) -> str:
+        """Ensure the URL has a scheme and ends with ``/mcp``.
+
+        ``/mcp`` is the conventional path where ESF exposes the MCP protocol.
+        """
+        stripped_url = (url or "").strip()
+        if stripped_url.startswith(("http://", "https://")):
+            url_with_scheme = stripped_url
+        elif "localhost" in stripped_url:
+            url_with_scheme = f"http://{stripped_url}"
+        else:
+            url_with_scheme = f"https://{stripped_url}"
+
+        rstripped_url = url_with_scheme.rstrip("/")
+        if not rstripped_url.endswith("/mcp"):
+            return f"{rstripped_url}/mcp"
+        return rstripped_url
+
+    # ------------------------------------------------------------------
+    # mcp source (official SDK over Streamable HTTP)
+    # ------------------------------------------------------------------
+    def _from_mcp(self, source: dict, endpoint: dict) -> list[mcp_types.Tool]:
+        raw_url = source.get("url") or endpoint.get("endpoint_url")
+        if not raw_url:
+            raise McpToolsError("tools_source.type 'mcp' requires an endpoint URL")
+        url = self.sanitize_url(raw_url)
+        headers = self._auth_headers(source)
+        try:
+            return anyio.run(
+                functools.partial(self._async_fetch_tools, url, headers)
+            )
+        except McpToolsError:
+            raise
+        except Exception as e:
+            raise McpToolsError(f"Failed to fetch tools from {url}: {e}") from e
+
+    async def _async_fetch_tools(self, url: str, headers: dict) -> list[mcp_types.Tool]:
+        """Connect to the MCP server over Streamable HTTP and list its tools."""
+        timeout = httpx.Timeout(self.timeout, read=300.0)
+        async with (
+            httpx.AsyncClient(headers=headers, timeout=timeout) as http_client,
+            streamable_http_client(url, http_client=http_client) as streams,
+        ):
+            reader, writer, _ = streams
+            async with mcp_session.ClientSession(reader, writer) as session:
+                await session.initialize()
+                tools_response = await session.list_tools()
+        return list(tools_response.tools)
+
+    # ------------------------------------------------------------------
+    # file source
+    # ------------------------------------------------------------------
+    def _from_file(self, source: dict) -> list[mcp_types.Tool]:
+        path = source.get("path")
+        if not path:
+            raise McpToolsError("tools_source.type 'file' requires a 'path'")
+        if not os.path.exists(path):
+            raise McpToolsError(f"Tools file not found: {path}")
+        try:
+            parsed = load_yaml_config(path)  # yaml.safe_load handles JSON too
+        except Exception as e:
+            raise McpToolsError(f"Could not parse tools file {path}: {e}") from e
+        if not parsed:
+            raise McpToolsError(f"Empty or unreadable tools file: {path}")
+        return self._to_tools(parsed)
+
+    @staticmethod
+    def _to_tools(raw) -> list[mcp_types.Tool]:
+        """Build ``mcp.types.Tool`` objects from a parsed tools spec."""
+        if isinstance(raw, dict):
+            tools = raw.get("tools")
+            if tools is None and "result" in raw:
+                tools = (raw.get("result") or {}).get("tools")
+        elif isinstance(raw, list):
+            tools = raw
+        else:
+            tools = None
+
+        if not tools:
+            raise McpToolsError("No tools found in spec")
+
+        built: list[mcp_types.Tool] = []
+        for t in tools:
+            if not isinstance(t, dict):
+                continue
+            schema = (
+                t.get("inputSchema")
+                or t.get("input_schema")
+                or t.get("parameters")
+                or {}
+            )
+            if not isinstance(schema, dict):
+                schema = {}
+            try:
+                built.append(
+                    mcp_types.Tool(
+                        name=t.get("name", ""),
+                        description=t.get("description") or t.get("title") or "",
+                        inputSchema=schema,
+                    )
+                )
+            except Exception as e:
+                raise McpToolsError(f"Invalid tool entry {t!r}: {e}") from e
+        if not built:
+            raise McpToolsError("Tools list contained no valid tool entries")
+        return built
diff --git a/evalbench/reporting/mcp_readability_csv.py b/evalbench/reporting/mcp_readability_csv.py
new file mode 100644
index 00000000..952ae8d3
--- /dev/null
+++ b/evalbench/reporting/mcp_readability_csv.py
@@ -0,0 +1,98 @@
+"""CSV writer for MCP style-guide compliance results.
+
+Writes one row per checked endpoint with the fixed metric schema requested for
+the Data Cloud MCP readability check, plus an ``environment`` column (after
+``endpoint_type``) so results can be sliced by release channel.
+
+Column order and types are stable:
+  product_name (str), endpoint_url (str), endpoint_type (str/enum name),
+  environment (str/enum name), check_timestamp (ISO-8601 str),
+  check_status (str/enum name), p0_issues (int), p1_issues (int),
+  p2_issues (int), total_tools (int), estimated_tokens (int),
+  token_budget_used_percent (float), compliance_score (int),
+  llm_feedback_json (str), llm_feedback_html (str), error_message (str)
+"""
+
+import logging
+import os
+import sys
+
+import pandas as pd
+
+
+COLUMNS = [
+    "product_name",
+    "endpoint_url",
+    "endpoint_type",
+    "environment",
+    "check_timestamp",
+    "check_status",
+    "p0_issues",
+    "p1_issues",
+    "p2_issues",
+    "total_tools",
+    "estimated_tokens",
+    "token_budget_used_percent",
+    "compliance_score",
+    "llm_feedback_json",
+    "llm_feedback_html",
+    "error_message",
+]
+
+_INT_COLUMNS = [
+    "p0_issues",
+    "p1_issues",
+    "p2_issues",
+    "total_tools",
+    "estimated_tokens",
+    "compliance_score",
+]
+_FLOAT_COLUMNS = ["token_budget_used_percent"]
+_STRING_COLUMNS = [
+    "product_name",
+    "endpoint_url",
+    "endpoint_type",
+    "environment",
+    "check_timestamp",
+    "check_status",
+    "llm_feedback_json",
+    "llm_feedback_html",
+    "error_message",
+]
+
+
+def _output_dir(output_directory: str) -> str:
+    # Match reporting/csv.py: when run as the gRPC service, write to the shared
+    # volume rather than the configured local directory.
+    if sys.argv and sys.argv[0].endswith("eval_server.py"):
+        return "/tmp_session_files/results"
+    return output_directory or "results"
+
+
+def write_compliance_csv(rows: list[dict], output_directory: str, job_id: str) -> str:
+    """Write compliance ``rows`` to ``<output_dir>/<job_id>/mcp_readability_compliance.csv``.
+
+    Returns the path written.
+    """
+    out_dir = _output_dir(output_directory)
+    directory = os.path.join(out_dir, job_id)
+    os.makedirs(directory, exist_ok=True)
+    file_path = os.path.join(directory, "mcp_readability_compliance.csv")
+
+    df = pd.DataFrame(rows, columns=COLUMNS)
+
+    # Enforce types so the CSV schema is stable regardless of row contents.
+    for col in _INT_COLUMNS:
+        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype("int64")
+    for col in _FLOAT_COLUMNS:
+        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0).astype("float64")
+    for col in _STRING_COLUMNS:
+        df[col] = df[col].fillna("").astype("string")
+
+    df.to_csv(file_path, index=False)
+    logging.info(
+        "Wrote MCP readability compliance CSV (%d rows) to %s",
+        len(df),
+        file_path,
+    )
+    return file_path
diff --git a/evalbench/scorers/mcp_style_compliance.py b/evalbench/scorers/mcp_style_compliance.py
new file mode 100644
index 00000000..e5c93cfc
--- /dev/null
+++ b/evalbench/scorers/mcp_style_compliance.py
@@ -0,0 +1,360 @@
+"""LLM-backed scorer that evaluates an MCP tool manifest against a style guide.
+
+Follows the same shape as :class:`scorers.llmrater.LLMRater`: constructed with
+``(config, global_models)`` and holds an LLM obtained via
+``generators.models.get_generator``. The orchestrator calls :meth:`evaluate`
+per endpoint with the tool man-page markup.
+
+The model reviews the tools from an LLM-agent-consumption perspective and returns
+a strict JSON object describing P0/P1/P2 findings, an overall compliance score,
+and any rules waived via the exceptions file. We parse and normalize that JSON
+defensively so a slightly malformed response degrades to an ERROR row rather than
+crashing the run.
+
+When a previous run's feedback is available, a dedicated second LLM pass
+reconciles the fresh review with it (preserving historical severity, dropping
+fixed issues, adding new ones) so results stay consistent run-to-run.
+"""
+
+import html
+import json
+import logging
+import re
+
+from generators.models import get_generator
+
+
+# Shared JSON output contract appended to both prompts (escaped for str.format).
+_OUTPUT_SCHEMA = """### OUTPUT
+Return ONLY a JSON object (no markdown, no prose) with exactly this shape:
+{{
+  "compliance_score": <integer 0-100, higher is better>,
+  "p0_issues": <integer>,
+  "p1_issues": <integer>,
+  "p2_issues": <integer>,
+  "findings": [
+    {{"severity": "P0|P1|P2", "rule_id": "<string>", "tool": "<tool name or ''>",
+      "message": "<what is wrong>", "suggestion": "<how to fix>"}}
+  ],
+  "waived": [
+    {{"rule_id": "<string>", "reason": "<reason>", "would_have_violated": <true|false>}}
+  ],
+  "summary": "<one-paragraph overall assessment>"
+}}
+The counts p0_issues/p1_issues/p2_issues MUST equal the number of findings of
+each severity."""
+
+
+PROMPT_TEMPLATE = (
+    """You are an expert on the Data Cloud MCP / OneMCP standard and a pragmatic
+API Developer Experience reviewer. Evaluate the MCP server's tool definitions
+(shown below as a man page) against the STYLE GUIDE and report every violation.
+
+Evaluate from the perspective of an LLM agent that must call these tools:
+- Will the model understand the terminology and the tool / parameter names?
+- Are the parameters too complex, too numerous, or under-described?
+- Is the agent forced to act like a computer -- e.g. formatting complex strings,
+  generating UUIDs, or calculating timestamps -- instead of expressing intent?
+Adopt a consultative, pragmatic tone (e.g. "Consider...", "Evaluate whether...").
+Do not be overly pedantic about minor wording when larger architectural blockers
+exist.
+
+Severity levels:
+- P0: critical violation (must fix; blocks compliance).
+- P1: major violation (should fix).
+- P2: minor / stylistic violation (nice to fix).
+
+How to assign severity and rule_id:
+- The STYLE GUIDE annotates each rule with its priority in an HTML comment next
+  to the section heading, e.g. `### Tool Names <!-- priority: p1 ... -->` or
+  `#### Safe Pagination <!-- priority: p0 -->`. Use that annotated priority as
+  the severity of any violation of that rule (p0 -> P0, p1 -> P1, p2 -> P2).
+- A heading may specify different priorities for different aspects, e.g.
+  `<!-- priority: p1 for <action>_<resource>, p2 for snake_case -->`. Honor that
+  split when classifying the specific violation.
+- Use the section heading text as the `rule_id` (e.g. "Tool Names",
+  "Use Human-Readable Time and Durations", "Concise Descriptions").
+- Only report violations of rules that actually apply to the given tools. Rules
+  about platform/registration/dashboards that cannot be judged from the tool
+  schema alone should not be flagged as violations.
+
+Avoid duplication (global vs per-tool):
+- If an issue is global or repeats across many tools (e.g. a convention violated
+  everywhere), report it ONCE with "tool" set to "" (empty string). Do NOT emit
+  the same global issue once per tool.
+- Report tool-specific issues with "tool" set to that tool's name.
+
+### STYLE GUIDE
+{style_guide}
+
+### PRODUCT
+{product_name}
+
+### TOOLS (man page)
+{tools_markup}
+
+### EXCEPTIONS (waived rules — DO NOT count these as issues)
+The following rules have been explicitly waived for this endpoint. Do not include
+them in p0_issues/p1_issues/p2_issues or in "findings". Instead list them under
+"waived" with their reason. If a waived rule would otherwise have been violated,
+note that in the waived entry.
+{exceptions}
+
+"""
+    + _OUTPUT_SCHEMA
+)
+
+
+# Second pass: reconcile the just-produced review with the previous run's review
+# so findings stay consistent over time (preserve historical severity, drop
+# fixed issues, add genuinely new ones). Both reviews share the schema above.
+CONSISTENCY_PROMPT_TEMPLATE = (
+    """You are maintaining run-to-run consistency for the Data Cloud MCP
+style-guide review of a single endpoint. You are given the STYLE GUIDE, the
+current tool manifest (man page), the review just produced for it ("LATEST
+REVIEW"), and the review from the previous run ("PREVIOUS REVIEW"). Both reviews
+use the JSON schema described under OUTPUT.
+
+Produce a single reconciled review:
+1. For each finding in PREVIOUS REVIEW, check it against the current manifest. If
+   the violation STILL EXISTS, keep it and PRESERVE its original severity and
+   wording. Drop findings that are now fixed.
+2. Add genuinely new violations from LATEST REVIEW that were missed previously.
+3. If a finding appears in both, prefer the PREVIOUS REVIEW's wording and
+   severity, unless LATEST REVIEW is significantly more precise about the same
+   issue.
+4. Do not downgrade or upgrade a still-valid finding's severity unless the STYLE
+   GUIDE's annotated priority for that rule has actually changed.
+5. Carry forward the "waived" entries that still apply.
+
+### STYLE GUIDE
+{style_guide}
+
+### TOOLS (man page)
+{tools_markup}
+
+### LATEST REVIEW (JSON)
+{latest_review}
+
+### PREVIOUS REVIEW (JSON)
+{previous_review}
+
+"""
+    + _OUTPUT_SCHEMA
+)
+
+
+class McpStyleComplianceScorer:
+    """Scores a tools spec against the MCP style guide using an LLM."""
+
+    def __init__(self, config: dict, global_models):
+        self.name = "mcp_style_compliance"
+        config = config or {}
+        self.model_config = config.get("model_config") or ""
+        if not self.model_config:
+            raise ValueError(
+                "model_config is required for the mcp_style_compliance scorer"
+            )
+        self.model = get_generator(global_models, self.model_config)
+
+    def evaluate(
+        self,
+        tools_markup: str,
+        style_guide: str,
+        product_name: str,
+        exceptions: list[dict] | None = None,
+        previous_feedback: str | None = None,
+    ) -> dict:
+        """Run the LLM compliance check and return a normalized feedback dict.
+
+        When ``previous_feedback`` (a prior run's ``llm_feedback_json``) is
+        supplied, a dedicated second pass reconciles the fresh review with it so
+        findings stay consistent run-to-run.
+        """
+        prompt = PROMPT_TEMPLATE.format(
+            style_guide=style_guide or "(no style guide provided)",
+            product_name=product_name or "(unknown)",
+            tools_markup=tools_markup or "(no tools)",
+            exceptions=json.dumps(exceptions or [], indent=2),
+        )
+        raw = self._generate(prompt)
+        feedback = self._parse(raw)
+
+        prev = (previous_feedback or "").strip()
+        if prev:
+            feedback = self._consistency_check(
+                tools_markup=tools_markup,
+                style_guide=style_guide,
+                latest=feedback,
+                previous=prev,
+            )
+        return feedback
+
+    def _consistency_check(
+        self, tools_markup: str, style_guide: str, latest: dict, previous: str
+    ) -> dict:
+        """Reconcile ``latest`` with the ``previous`` run's review (JSON string).
+
+        Returns the merged feedback, or ``latest`` unchanged if the pass fails.
+        """
+        try:
+            prompt = CONSISTENCY_PROMPT_TEMPLATE.format(
+                style_guide=style_guide or "(no style guide provided)",
+                tools_markup=tools_markup or "(no tools)",
+                latest_review=json.dumps(latest),
+                previous_review=previous,
+            )
+            return self._parse(self._generate(prompt))
+        except Exception as e:
+            logging.warning(
+                "mcp_style_compliance: consistency pass failed (%s); "
+                "using latest review.",
+                e,
+            )
+            return latest
+
+    def _generate(self, prompt: str) -> str:
+        """Generate the model response as raw JSON text.
+
+        Prefer Gemini's native JSON mode via the underlying genai client, which
+        guarantees syntactically valid JSON and -- crucially -- bypasses
+        ``GeminiGenerator.generate``'s SQL sanitizer (it strips backslashes and
+        collapses whitespace, corrupting JSON escapes). Falls back to the
+        generic ``generate`` for non-Gemini models.
+        """
+        client = getattr(self.model, "client", None)
+        caller = getattr(self.model, "_call_generate_content", None)
+        if client is not None and callable(caller):
+            try:
+                from google.genai import types
+
+                config = types.GenerateContentConfig(
+                    response_mime_type="application/json",
+                    temperature=0,
+                )
+                resp = caller(contents=prompt, config=config)
+                text = getattr(resp, "text", None)
+                if text:
+                    return text
+            except Exception as e:
+                logging.warning(
+                    "mcp_style_compliance: JSON-mode generation failed (%s); "
+                    "falling back to plain generate().",
+                    e,
+                )
+        return self.model.generate(prompt)
+
+    # ------------------------------------------------------------------
+    # parsing / rendering
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _extract_json(text: str) -> dict:
+        """Pull a JSON object out of a model response (handles code fences)."""
+        if not text:
+            raise ValueError("empty model response")
+        text = text.strip()
+        # Strip ```json ... ``` or ``` ... ``` fences.
+        fence = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
+        if fence:
+            text = fence.group(1).strip()
+        try:
+            return json.loads(text)
+        except json.JSONDecodeError:
+            # Fallback: grab the outermost {...} span.
+            start = text.find("{")
+            end = text.rfind("}")
+            if start != -1 and end != -1 and end > start:
+                return json.loads(text[start : end + 1])
+            raise
+
+    def _parse(self, raw: str) -> dict:
+        """Normalize the model output into a stable feedback dict."""
+        data = self._extract_json(raw)
+        findings = data.get("findings") or []
+        if not isinstance(findings, list):
+            findings = []
+
+        def _count(sev):
+            return sum(
+                1
+                for f in findings
+                if isinstance(f, dict)
+                and str(f.get("severity", "")).upper() == sev
+            )
+
+        # Prefer counts derived from findings (authoritative); fall back to the
+        # model-reported integers if findings are absent.
+        p0 = _count("P0") or _safe_int(data.get("p0_issues"))
+        p1 = _count("P1") or _safe_int(data.get("p1_issues"))
+        p2 = _count("P2") or _safe_int(data.get("p2_issues"))
+
+        return {
+            "compliance_score": _safe_int(data.get("compliance_score")),
+            "p0_issues": p0,
+            "p1_issues": p1,
+            "p2_issues": p2,
+            "findings": findings,
+            "waived": data.get("waived") or [],
+            "summary": data.get("summary", ""),
+        }
+
+    @staticmethod
+    def to_html(feedback: dict) -> str:
+        """Render feedback as a severity-grouped HTML fragment."""
+        if not feedback:
+            return ""
+        score = feedback.get("compliance_score", 0)
+        summary = html.escape(str(feedback.get("summary", "")))
+        parts = [
+            "<div class='mcp-compliance'>",
+            f"<h3>Compliance score: {score}</h3>",
+            f"<p>{summary}</p>",
+            "<p>P0: {p0} &nbsp; P1: {p1} &nbsp; P2: {p2}</p>".format(
+                p0=feedback.get("p0_issues", 0),
+                p1=feedback.get("p1_issues", 0),
+                p2=feedback.get("p2_issues", 0),
+            ),
+        ]
+        findings = feedback.get("findings") or []
+        if findings:
+            parts.append(
+                "<table border='1' cellpadding='4' cellspacing='0'>"
+                "<tr><th>Severity</th><th>Rule</th><th>Tool</th>"
+                "<th>Issue</th><th>Suggestion</th></tr>"
+            )
+            for f in findings:
+                if not isinstance(f, dict):
+                    continue
+                parts.append(
+                    "<tr><td>{sev}</td><td>{rule}</td><td>{tool}</td>"
+                    "<td>{msg}</td><td>{sug}</td></tr>".format(
+                        sev=html.escape(str(f.get("severity", ""))),
+                        rule=html.escape(str(f.get("rule_id", ""))),
+                        tool=html.escape(str(f.get("tool", ""))),
+                        msg=html.escape(str(f.get("message", ""))),
+                        sug=html.escape(str(f.get("suggestion", ""))),
+                    )
+                )
+            parts.append("</table>")
+        waived = feedback.get("waived") or []
+        if waived:
+            parts.append("<h4>Waived rules</h4><ul>")
+            for w in waived:
+                if not isinstance(w, dict):
+                    continue
+                parts.append(
+                    "<li>{rule}: {reason}</li>".format(
+                        rule=html.escape(str(w.get("rule_id", ""))),
+                        reason=html.escape(str(w.get("reason", ""))),
+                    )
+                )
+            parts.append("</ul>")
+        parts.append("</div>")
+        return "".join(parts)
+
+
+def _safe_int(value) -> int:
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return 0
diff --git a/evalbench/test/mcp_readability_test.py b/evalbench/test/mcp_readability_test.py
new file mode 100644
index 00000000..a4f07e34
--- /dev/null
+++ b/evalbench/test/mcp_readability_test.py
@@ -0,0 +1,413 @@
+import json
+import os
+import sys
+import tempfile
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Repo root (two levels up from evalbench/evalbench/test/) so dataset paths
+# resolve regardless of the pytest working directory.
+_REPO_ROOT = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
+
+
+def _ds(rel):
+    return os.path.join(_REPO_ROOT, rel)
+
+from evaluator.mcp_readability.enums import (
+    CheckStatus,
+    EndpointType,
+    Environment,
+    coerce_endpoint_type,
+    coerce_environment,
+)
+from evaluator.mcp_readability import exceptions as exc_mod
+from evaluator.mcp_readability.token_estimator import (
+    estimate_tokens,
+    token_budget_used_percent,
+)
+from mcp import types as mcp_types
+from generators.models.mcp_tools import McpToolsGenerator, McpToolsError
+from generators.models.mcp_tool_formatter import format_tools_to_man_page
+from scorers.mcp_style_compliance import McpStyleComplianceScorer
+from reporting.mcp_readability_csv import COLUMNS, write_compliance_csv
+
+
+# --------------------------------------------------------------------------
+# Enums
+# --------------------------------------------------------------------------
+def test_enum_coercion():
+    assert coerce_endpoint_type("REMOTE") is EndpointType.REMOTE
+    assert coerce_endpoint_type("local") is EndpointType.LOCAL
+    assert coerce_endpoint_type("bogus") is EndpointType.ENDPOINT_TYPE_UNSPECIFIED
+    assert coerce_endpoint_type(None) is EndpointType.ENDPOINT_TYPE_UNSPECIFIED
+    assert coerce_environment("autopush") is Environment.AUTOPUSH
+    assert coerce_environment("PROD") is Environment.PROD
+
+
+def test_check_status_members():
+    # check_status reflects whether the eval ran, not compliance findings.
+    assert {s.name for s in CheckStatus} == {
+        "CHECK_STATUS_UNSPECIFIED",
+        "SUCCESS",
+        "FETCH_ERROR",
+        "ANALYSIS_ERROR",
+        "INTERNAL_ERROR",
+    }
+
+
+# --------------------------------------------------------------------------
+# Token estimation
+# --------------------------------------------------------------------------
+def test_token_math():
+    assert estimate_tokens("") == 0
+    assert estimate_tokens("abcd") == 1
+    assert token_budget_used_percent(0, 0) == 0.0
+    assert token_budget_used_percent(250, 25000) == 1.0
+    assert token_budget_used_percent(210, 25000) == 0.84
+
+
+# --------------------------------------------------------------------------
+# Exceptions matching
+# --------------------------------------------------------------------------
+def test_exceptions_matching():
+    all_exc = [
+        {"endpoint_url": "https://a/mcp", "rule_id": "R1", "reason": "x"},
+        {"environment": "AUTOPUSH", "rule_id": "R2", "reason": "y"},
+        {"rule_id": "R3", "reason": "global"},  # match-all
+        {"reason": "no rule id, ignored"},
+    ]
+    ep = {"endpoint_url": "https://a/mcp", "environment": "PROD"}
+    matched = {e["rule_id"] for e in exc_mod.applicable_exceptions(ep, all_exc)}
+    assert matched == {"R1", "R3"}
+
+    ep2 = {"endpoint_url": "https://b/mcp", "environment": "AUTOPUSH"}
+    matched2 = {e["rule_id"] for e in exc_mod.applicable_exceptions(ep2, all_exc)}
+    assert matched2 == {"R2", "R3"}
+
+
+# --------------------------------------------------------------------------
+# URL sanitization
+# --------------------------------------------------------------------------
+def test_sanitize_url():
+    s = McpToolsGenerator.sanitize_url
+    assert s("example.com") == "https://example.com/mcp"
+    assert s("https://x.com/mcp") == "https://x.com/mcp"
+    assert s("https://x.com/mcp/") == "https://x.com/mcp"
+    assert s("localhost:8000") == "http://localhost:8000/mcp"
+    assert s("https://x.com") == "https://x.com/mcp"
+
+
+# --------------------------------------------------------------------------
+# Formatter (man-page markup)
+# --------------------------------------------------------------------------
+def test_formatter_renders_enum_default():
+    tool = mcp_types.Tool(
+        name="t",
+        description="d",
+        inputSchema={
+            "type": "object",
+            "properties": {
+                "mode": {
+                    "type": "string",
+                    "enum": ["a", "b"],
+                    "default": "a",
+                    "description": "the mode",
+                }
+            },
+            "required": ["mode"],
+        },
+    )
+    mpr = format_tools_to_man_page([tool])
+    assert mpr.total_tools == 1
+    assert "TOOL: t" in mpr.man_page
+    assert "enum:" in mpr.man_page and '"a"' in mpr.man_page
+    assert "default:" in mpr.man_page
+
+
+# --------------------------------------------------------------------------
+# Generator (file source)
+# --------------------------------------------------------------------------
+def test_generator_file_source():
+    gen = McpToolsGenerator({})
+    endpoint = {
+        "tools_source": {
+            "type": "file",
+            "path": _ds("datasets/mcp_readability/sample_tools.yaml"),
+        }
+    }
+    tools, mpr = gen.fetch_tools(endpoint, {})
+    assert mpr.total_tools == 3
+    assert all(isinstance(t, mcp_types.Tool) for t in tools)
+    assert {t.name for t in tools} == {"list_datasets", "RunQuery", "delete_table"}
+    assert "TOOL: RunQuery" in mpr.man_page
+
+
+def test_generator_missing_file_raises():
+    gen = McpToolsGenerator({})
+    with pytest.raises(McpToolsError):
+        gen.fetch_tools({"tools_source": {"type": "file", "path": "nope.yaml"}}, {})
+
+
+def test_generator_mcp_source_mocked():
+    """The live `mcp` path runs the SDK fetch; mock it so no network is needed."""
+    gen = McpToolsGenerator({})
+
+    async def _fake_fetch(self, url, headers):
+        assert url == "https://example.com/mcp"
+        return [
+            mcp_types.Tool(
+                name="t1",
+                description="d",
+                inputSchema={"type": "object", "properties": {}},
+            )
+        ]
+
+    with patch.object(McpToolsGenerator, "_async_fetch_tools", _fake_fetch):
+        endpoint = {"endpoint_url": "example.com", "tools_source": {"type": "mcp"}}
+        tools, mpr = gen.fetch_tools(endpoint, {})
+        assert [t.name for t in tools] == ["t1"]
+        assert mpr.total_tools == 1
+
+
+# --------------------------------------------------------------------------
+# Scorer parsing / html (no LLM call)
+# --------------------------------------------------------------------------
+def test_scorer_parse_and_html():
+    raw = """```json
+    {"compliance_score": 75, "findings": [
+       {"severity": "P0", "rule_id": "P0-X", "tool": "t", "message": "m", "suggestion": "s"},
+       {"severity": "P2", "rule_id": "P2-Y", "tool": "t", "message": "m", "suggestion": "s"}
+     ], "waived": [], "summary": "ok"}
+    ```"""
+    # Build a scorer without invoking the LLM constructor path.
+    scorer = McpStyleComplianceScorer.__new__(McpStyleComplianceScorer)
+    fb = scorer._parse(raw)
+    assert fb["p0_issues"] == 1
+    assert fb["p2_issues"] == 1
+    assert fb["compliance_score"] == 75
+    html = McpStyleComplianceScorer.to_html(fb)
+    assert "<table" in html and "P0-X" in html
+
+
+def test_scorer_consistency_second_pass():
+    """With previous_feedback, evaluate() runs a 2nd reconciliation pass."""
+
+    class _SeqLLM:
+        def __init__(self, responses):
+            self.responses = responses
+            self.calls = 0
+
+        def generate(self, prompt):
+            r = self.responses[min(self.calls, len(self.responses) - 1)]
+            self.calls += 1
+            return r
+
+    latest = json.dumps(
+        {"compliance_score": 50, "findings": [
+            {"severity": "P0", "rule_id": "New", "tool": "", "message": "m",
+             "suggestion": "s"}], "summary": "latest"}
+    )
+    merged = json.dumps(
+        {"compliance_score": 60, "findings": [
+            {"severity": "P1", "rule_id": "Old", "tool": "t", "message": "m",
+             "suggestion": "s"}], "summary": "merged"}
+    )
+    scorer = McpStyleComplianceScorer.__new__(McpStyleComplianceScorer)
+    scorer.model = _SeqLLM([latest, merged])
+
+    fb = scorer.evaluate(
+        tools_markup="x",
+        style_guide="g",
+        product_name="p",
+        previous_feedback='{"findings": [{"severity": "P1", "rule_id": "Old"}]}',
+    )
+    assert scorer.model.calls == 2  # review + consistency
+    assert fb["summary"] == "merged"
+    assert fb["p1_issues"] == 1 and fb["p0_issues"] == 0
+
+
+def test_scorer_no_previous_feedback_single_pass():
+    class _OneLLM:
+        def __init__(self):
+            self.calls = 0
+
+        def generate(self, prompt):
+            self.calls += 1
+            return json.dumps({"compliance_score": 90, "findings": [], "summary": "ok"})
+
+    scorer = McpStyleComplianceScorer.__new__(McpStyleComplianceScorer)
+    scorer.model = _OneLLM()
+    fb = scorer.evaluate(tools_markup="x", style_guide="g", product_name="p")
+    assert scorer.model.calls == 1
+    assert fb["compliance_score"] == 90
+
+
+# --------------------------------------------------------------------------
+# CSV reporter schema
+# --------------------------------------------------------------------------
+def test_csv_schema_and_types():
+    rows = [
+        {
+            "product_name": "p",
+            "endpoint_url": "u",
+            "endpoint_type": "REMOTE",
+            "environment": "PROD",
+            "check_timestamp": "2026-01-01T00:00:00",
+            "check_status": "SUCCESS",
+            "p0_issues": 0,
+            "p1_issues": 1,
+            "p2_issues": 2,
+            "total_tools": 3,
+            "estimated_tokens": 210,
+            "token_budget_used_percent": 0.84,
+            "compliance_score": 90,
+            "llm_feedback_json": "{}",
+            "llm_feedback_html": "<div></div>",
+            "error_message": "",
+        }
+    ]
+    with tempfile.TemporaryDirectory() as d:
+        path = write_compliance_csv(rows, d, "job1")
+        df = pd.read_csv(path)
+        assert list(df.columns) == COLUMNS
+        assert df.iloc[0]["p1_issues"] == 1
+        assert abs(df.iloc[0]["token_budget_used_percent"] - 0.84) < 1e-9
+
+
+# --------------------------------------------------------------------------
+# End-to-end orchestrator with stubbed LLM
+# --------------------------------------------------------------------------
+class _FakeLLM:
+    def generate(self, prompt):
+        return json.dumps(
+            {
+                "compliance_score": 80,
+                "findings": [
+                    {"severity": "P1", "rule_id": "P1-A", "tool": "RunQuery",
+                     "message": "m", "suggestion": "s"},
+                ],
+                "waived": [{"rule_id": "P2-TOOL-DESC-LENGTH", "reason": "r"}],
+                "summary": "fine",
+            }
+        )
+
+
+def test_orchestrator_end_to_end():
+    import yaml
+
+    with patch(
+        "scorers.mcp_style_compliance.get_generator", return_value=_FakeLLM()
+    ):
+        from evaluator import get_orchestrator
+
+        # Use a self-contained offline endpoints file (file source) so the test
+        # stays deterministic and independent of the production endpoint list.
+        with tempfile.TemporaryDirectory() as d:
+            ep_path = os.path.join(d, "endpoints.yaml")
+            with open(ep_path, "w") as f:
+                yaml.safe_dump(
+                    {
+                        "defaults": {
+                            "endpoint_type": "REMOTE",
+                            "tools_source": {"type": "file"},
+                        },
+                        "endpoints": [
+                            {
+                                "product_name": "Sample",
+                                "endpoint_url": "file://sample",
+                                "environment": "DEV",
+                                "tools_source": {
+                                    "type": "file",
+                                    "path": _ds(
+                                        "datasets/mcp_readability/sample_tools.yaml"
+                                    ),
+                                },
+                            }
+                        ],
+                    },
+                    f,
+                )
+            config = {
+                "orchestrator": "mcp_readability",
+                "endpoints_config": ep_path,
+                "style_guide": _ds("datasets/mcp_readability/style_guide.md"),
+                "exceptions_config": _ds("datasets/mcp_readability/exceptions.yaml"),
+                "tools_generator_config": _ds(
+                    "datasets/mcp_readability/tools_generator.yaml"
+                ),
+                "token_budget": 25000,
+                "scorers": {"mcp_style_compliance": {"model_config": "unused"}},
+                "runners": {"endpoint_runners": 2},
+                "reporting": {"csv": {"output_directory": d}},
+            }
+            orch = get_orchestrator(config, [], {})
+            orch.evaluate([])
+            job_id, _, a, b, c = orch.process()
+            assert (a, b, c) == (None, None, None)
+            df = pd.read_csv(
+                os.path.join(d, job_id, "mcp_readability_compliance.csv")
+            )
+            assert list(df.columns) == COLUMNS
+            row = df.iloc[0]
+            assert row["check_status"] == "SUCCESS"
+            assert row["endpoint_type"] == "REMOTE"  # inherited from defaults
+            assert int(row["total_tools"]) == 3
+            assert int(row["p1_issues"]) == 1
+            assert int(row["compliance_score"]) == 80
+
+
+def test_orchestrator_fetch_error():
+    import yaml
+
+    with patch(
+        "scorers.mcp_style_compliance.get_generator", return_value=_FakeLLM()
+    ):
+        from evaluator import get_orchestrator
+
+        with tempfile.TemporaryDirectory() as d:
+            ep_path = os.path.join(d, "endpoints.yaml")
+            with open(ep_path, "w") as f:
+                yaml.safe_dump(
+                    {
+                        "defaults": {"tools_source": {"type": "file"}},
+                        "endpoints": [
+                            {
+                                "product_name": "Bad",
+                                "endpoint_url": "x",
+                                "endpoint_type": "REMOTE",
+                                "environment": "PROD",
+                                "tools_source": {
+                                    "type": "file",
+                                    "path": os.path.join(d, "missing.yaml"),
+                                },
+                            }
+                        ],
+                    },
+                    f,
+                )
+            config = {
+                "orchestrator": "mcp_readability",
+                "endpoints_config": ep_path,
+                "style_guide": _ds("datasets/mcp_readability/style_guide.md"),
+                "tools_generator_config": _ds(
+                    "datasets/mcp_readability/tools_generator.yaml"
+                ),
+                "token_budget": 25000,
+                "scorers": {"mcp_style_compliance": {"model_config": "unused"}},
+                "reporting": {"csv": {"output_directory": d}},
+            }
+            orch = get_orchestrator(config, [], {})
+            orch.evaluate([])
+            job_id, _, _, _, _ = orch.process()
+            df = pd.read_csv(
+                os.path.join(d, job_id, "mcp_readability_compliance.csv")
+            )
+            row = df.iloc[0]
+            assert row["check_status"] == "FETCH_ERROR"
+            assert "not found" in str(row["error_message"]).lower()
diff --git a/pyproject.toml b/pyproject.toml
index 6777fb57..bb0ba1fa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,6 +64,10 @@ dependencies = [
     # (urllib3.contrib.pyopenssl.inject_into_urllib3). Lift when google-auth
     # stops using the deprecated inject pattern.
     "pyOpenSSL<26.2",
+    # MCP readability check: official MCP SDK + async HTTP transport.
+    "mcp",
+    "httpx",
+    "anyio",
 ]
 
 [tool.uv.sources]