diff --git a/apps/web/astro.config.mjs b/apps/web/astro.config.mjs index 6f475ca7e..944e03427 100644 --- a/apps/web/astro.config.mjs +++ b/apps/web/astro.config.mjs @@ -38,7 +38,7 @@ export default defineConfig({ sidebar: [ { label: 'Getting Started', autogenerate: { directory: 'docs/getting-started' } }, { label: 'Evaluation', autogenerate: { directory: 'docs/evaluation' } }, - { label: 'Evaluators', autogenerate: { directory: 'docs/evaluators' } }, + { label: 'Graders', autogenerate: { directory: 'docs/graders' } }, { label: 'Targets', autogenerate: { directory: 'docs/targets' } }, { label: 'Tools', autogenerate: { directory: 'docs/tools' } }, { label: 'Guides', autogenerate: { directory: 'docs/guides' } }, diff --git a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx index 73b12b881..ed01ab3e5 100644 --- a/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx +++ b/apps/web/src/content/docs/docs/evaluation/batch-cli.mdx @@ -14,14 +14,14 @@ Use batch CLI evaluation when: - An external tool processes multiple inputs in a single invocation (e.g., AML screening, bulk classification) - The runner reads the eval YAML directly to extract all tests - Output is JSONL with records keyed by test `id` -- Each test has its own evaluator to validate its corresponding output record +- Each test has its own grader to validate its corresponding output record ## Execution Flow 1. **AgentV** invokes the batch runner once, passing `--eval ` and `--output ` 2. **Batch runner** reads the eval YAML, extracts all tests, processes them, and writes JSONL output keyed by `id` 3. **AgentV** parses the JSONL and routes each record to its matching test by `id` -4. **Per-test evaluators** validate the output for each test independently +4. **Per-test graders** validate the output for each test independently ## Eval File Structure @@ -109,7 +109,7 @@ JSONL where each line is a JSON object with an `id` matching a test: {"id": "case-002", "text": "{\"decision\": \"REVIEW\", ...}"} ``` -The `id` field must match the test `id` for AgentV to route output to the correct evaluator. +The `id` field must match the test `id` for AgentV to route output to the correct grader. ### Output with Tool Trajectory @@ -138,11 +138,11 @@ To enable `tool_trajectory` evaluation, include `output` with `tool_calls`: } ``` -AgentV extracts tool calls directly from `output[].tool_calls[]` for `tool_trajectory` evaluators. +AgentV extracts tool calls directly from `output[].tool_calls[]` for `tool_trajectory` graders. -## Evaluator Implementation +## Grader Implementation -Each test has its own evaluator that validates the batch runner output. The evaluator receives the standard `code_grader` input via stdin. +Each test has its own grader that validates the batch runner output. The grader receives the standard `code_grader` input via stdin. **Input (stdin):** ```json @@ -164,7 +164,7 @@ Each test has its own evaluator that validates the batch runner output. The eval } ``` -### Example Evaluator +### Example Grader ```typescript import fs from 'node:fs'; @@ -233,7 +233,7 @@ expected_output: reasons: [] ``` -The evaluator extracts these fields and compares them against the parsed candidate output. +The grader extracts these fields and compares them against the parsed candidate output. ## Target Configuration @@ -259,7 +259,7 @@ Key settings: ## Best Practices -1. **Use unique test IDs** -- the batch runner and AgentV use `id` to route outputs to the correct evaluator +1. 
**Use unique test IDs** -- the batch runner and AgentV use `id` to route outputs to the correct grader 2. **Structured input** -- put structured data in `user.content` for the runner to extract 3. **Structured expected_output** -- define expected output as objects for easy comparison 4. **Deterministic runners** -- batch runners should produce consistent output for reliable testing diff --git a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx index 25e7eec21..a8601a214 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-cases.mdx @@ -5,7 +5,7 @@ sidebar: order: 2 --- -Tests are individual test entries within an evaluation file. Each test defines input messages, expected outcomes, and optional evaluator overrides. +Tests are individual test entries within an evaluation file. Each test defines input messages, expected outcomes, and optional grader overrides. ## Basic Structure @@ -29,9 +29,9 @@ tests: | `expected_output` | No | Expected response for comparison (string, object, or message array). Alias: `expected_output` | | `execution` | No | Per-case execution overrides (for example `target`, `skip_defaults`) | | `workspace` | No | Per-case workspace config (overrides suite-level) | -| `metadata` | No | Arbitrary key-value pairs passed to evaluators and workspace scripts | +| `metadata` | No | Arbitrary key-value pairs passed to graders and workspace scripts | | `rubrics` | No | Structured evaluation criteria | -| `assertions` | No | Per-test evaluators | +| `assertions` | No | Per-test graders | ## Input @@ -55,7 +55,7 @@ When suite-level `input` is defined in the eval file, those messages are prepend ## Expected Output -Optional reference response for comparison by evaluators. A string expands to a single assistant message: +Optional reference response for comparison by graders. A string expands to a single assistant message: ```yaml expected_output: "42" @@ -71,7 +71,7 @@ expected_output: ## Per-Case Execution Overrides -Override the default target or evaluators for specific tests: +Override the default target or graders for specific tests: ```yaml tests: @@ -87,7 +87,7 @@ tests: prompt: ./graders/depth.md ``` -Per-case `assertions` evaluators are **merged** with root-level `assertions` evaluators — test-specific evaluators run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`: +Per-case `assertions` graders are **merged** with root-level `assertions` graders — test-specific graders run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`: ```yaml assertions: @@ -162,11 +162,11 @@ Operational checkout state belongs under `workspace.repos[].checkout.base_commit ## Per-Test Assertions -The `assertions` field defines evaluators directly on a test. It supports both deterministic assertion types and LLM-based rubric evaluation. +The `assertions` field defines graders directly on a test. It supports both deterministic assertion types and LLM-based rubric evaluation. 
### Deterministic Assertions -These evaluators run without an LLM call and produce binary (0 or 1) scores: +These graders run without an LLM call and produce binary (0 or 1) scores: | Type | Value | Description | |------|-------|-------------| @@ -251,7 +251,7 @@ tests: value: ["true/false", "boolean", "expected value"] ``` -Assertion evaluators auto-generate a `name` when one is not provided (e.g., `contains-DENIED`, `is_json`). +Assertion graders auto-generate a `name` when one is not provided (e.g., `contains-DENIED`, `is_json`). ### Rubric Assertions @@ -283,7 +283,7 @@ tests: ### Required Gates -Any evaluator in `assertions` can be marked as `required`. When a required evaluator fails, the overall test verdict is `fail` regardless of the aggregate score. +Any grader in `assertions` can be marked as `required`. When a required grader fails, the overall test verdict is `fail` regardless of the aggregate score. | Value | Behavior | |-------|----------| @@ -303,23 +303,23 @@ assertions: weight: 1.0 ``` -Required gates are evaluated after all evaluators run. If any required evaluator falls below its threshold, the verdict is forced to `fail`. +Required gates are evaluated after all graders run. If any required grader falls below its threshold, the verdict is forced to `fail`. ### Assertions Merge Behavior `assertions` can be defined at both suite and test levels: -- Per-test `assertions` evaluators run first. -- Suite-level `assertions` evaluators are appended automatically. +- Per-test `assertions` graders run first. +- Suite-level `assertions` graders are appended automatically. - Set `execution.skip_defaults: true` on a test to skip suite-level defaults. ## How `criteria` and `assertions` Interact -The `criteria` field is a **data field** that describes what the response should accomplish. It is not an evaluator itself — how it gets used depends on whether `assertions` is present. +The `criteria` field is a **data field** that describes what the response should accomplish. It is not a grader itself — how it gets used depends on whether `assertions` is present. ### No `assertions` — implicit LLM grader -When a test has no `assertions` field, a default `llm-grader` evaluator runs automatically and uses `criteria` as the evaluation prompt: +When a test has no `assertions` field, a default `llm-grader` grader runs automatically and uses `criteria` as the evaluation prompt: ```yaml tests: @@ -342,14 +342,14 @@ tests: input: Generate the spreadsheet report ``` -### `assertions` present — explicit evaluators only +### `assertions` present — explicit graders only -When `assertions` is defined, only the declared evaluators run. No implicit grader is added. Graders that are declared (such as `llm-grader`, `code-grader`, or `rubrics`) receive `criteria` as input automatically. +When `assertions` is defined, only the declared graders run. No implicit grader is added. Graders that are declared (such as `llm-grader`, `code-grader`, or `rubrics`) receive `criteria` as input automatically. -If `assertions` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: +If `assertions` contains only deterministic graders (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: ``` -Warning: Test 'my-test': criteria is defined but no evaluator in assertions +Warning: Test 'my-test': criteria is defined but no grader in assertions will evaluate it.
Add 'type: llm-grader' to assertions, or remove criteria if it is documentation-only. ``` @@ -367,7 +367,7 @@ tests: value: "fix" ``` -When you need a custom file conversion for only one grader, add `preprocessors` directly to that evaluator: +When you need a custom file conversion for only one grader, add `preprocessors` directly to that grader: ```yaml preprocessors: @@ -389,7 +389,7 @@ tests: ## Metadata -Pass additional context to evaluators via the `metadata` field: +Pass additional context to graders via the `metadata` field: ```yaml tests: diff --git a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx index cf98867cb..9b68cfaf1 100644 --- a/apps/web/src/content/docs/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/docs/evaluation/eval-files.mdx @@ -5,7 +5,7 @@ sidebar: order: 1 --- -Evaluation files define the test cases, targets, and evaluators for an evaluation run. AgentV supports two formats: YAML and JSONL. +Evaluation files define the test cases, targets, and graders for an evaluation run. AgentV supports two formats: YAML and JSONL. ## Suites @@ -41,7 +41,7 @@ tests: | `execution` | Default execution config (`target`, `fail_on_error`, `threshold`, etc.) | | `workspace` | Suite-level workspace config — inline object or string path to an [external workspace file](/docs/guides/workspace-pool/#external-workspace-config) | | `tests` | Array of individual tests, or a string path to an external file | -| `assertions` | Suite-level evaluators appended to each test unless `execution.skip_defaults: true` is set on the test | +| `assertions` | Suite-level graders appended to each test unless `execution.skip_defaults: true` is set on the test | | `input` | Suite-level input messages prepended to each test's input unless `execution.skip_defaults: true` is set on the test | ### Metadata Fields @@ -76,7 +76,7 @@ tests: ### Suite-level Assertions -The `assertions` field is the canonical way to define suite-level evaluators. Suite-level assertions are appended to every test's evaluators unless a test sets `execution.skip_defaults: true`. +The `assertions` field is the canonical way to define suite-level graders. Suite-level assertions are appended to every test's graders unless a test sets `execution.skip_defaults: true`. ```yaml description: API response validation @@ -92,7 +92,7 @@ tests: input: Check API health ``` -`assertions` supports all evaluator types, including deterministic assertion types (`contains`, `regex`, `is_json`, `equals`) and `rubrics`. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for per-test assertions usage. +`assertions` supports all grader types, including deterministic assertion types (`contains`, `regex`, `is_json`, `equals`) and `rubrics`. See [Tests](/docs/evaluation/eval-cases/#per-test-assertions) for per-test assertions usage. 
### Assertion Includes diff --git a/apps/web/src/content/docs/docs/evaluation/examples.mdx b/apps/web/src/content/docs/docs/evaluation/examples.mdx index c28aedc10..53ce262c6 100644 --- a/apps/web/src/content/docs/docs/evaluation/examples.mdx +++ b/apps/web/src/content/docs/docs/evaluation/examples.mdx @@ -69,7 +69,7 @@ tests: ``` ```` -## Multi-Evaluator +## Multi-Grader Combine a code grader and an LLM grader on the same test: @@ -86,7 +86,7 @@ tests: - name: json_format_validator type: code-grader command: [uv, run, validate_json.py] - cwd: ./evaluators + cwd: ./graders - name: content_evaluator type: llm-grader prompt: ./graders/semantic_correctness.md @@ -363,11 +363,11 @@ tests: - The batch runner reads the eval YAML via `--eval` flag and outputs JSONL keyed by `id` - Put structured data in `user.content` as objects for the runner to extract - Use `expected_output` with object fields for structured expected output -- Each test has its own evaluator to validate its portion of the output +- Each test has its own grader to validate its portion of the output ## Suite-level Input -Share a common prompt or system instruction across all tests. Suite-level `input` messages are prepended to each test's input — like suite-level `assertions` for evaluators: +Share a common prompt or system instruction across all tests. Suite-level `input` messages are prepended to each test's input — like suite-level `assertions` for graders: ```yaml description: Travel assistant evaluation @@ -418,11 +418,11 @@ See the [suite-level-input example](https://github.com/EntityProcess/agentv/tree - Show the pattern, not rigid templates - Allow for natural language variation - Focus on semantic correctness over exact matching -- Evaluators handle the actual validation logic +- Graders handle the actual validation logic ## Showcases For complete end-to-end workflows that combine multiple features, see the showcases in [`examples/showcase/`](https://github.com/EntityProcess/agentv/tree/main/examples/showcase): -- **[Multi-Model Benchmark](https://github.com/EntityProcess/agentv/tree/main/examples/showcase/multi-model-benchmark)** — targets matrix × weighted metrics × trials × compare workflow. Runs the same tests against multiple models, scores with weighted evaluators, measures variability, and compares results side-by-side. +- **[Multi-Model Benchmark](https://github.com/EntityProcess/agentv/tree/main/examples/showcase/multi-model-benchmark)** — targets matrix × weighted metrics × trials × compare workflow. Runs the same tests against multiple models, scores with weighted graders, measures variability, and compares results side-by-side. - **[Export Screening](https://github.com/EntityProcess/agentv/tree/main/examples/showcase/export-screening)** — classification eval with confusion matrix metrics and CI gating. diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index 9408efd79..4f9f5587e 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -22,7 +22,7 @@ tests: - States time complexity ``` -All strings are collected into a single rubrics evaluator automatically. +All strings are collected into a single rubrics grader automatically. ### Full form for advanced options @@ -120,9 +120,9 @@ score = sum(criterion_score / 10 * weight) / sum(total_weights) ## Authoring Rubrics -Write rubric criteria directly in `assertions`. 
If you want help choosing between plain assertions, deterministic evaluators, and rubric or LLM-based grading, use the `agentv-eval-writer` skill. Keep the evaluator choice driven by the criteria rather than one fixed recipe. +Write rubric criteria directly in `assertions`. If you want help choosing between plain assertions, deterministic graders, and rubric or LLM-based grading, use the `agentv-eval-writer` skill. Keep the grader choice driven by the criteria rather than one fixed recipe. -## Combining with Other Evaluators +## Combining with Other Graders Rubrics work alongside code and LLM graders: diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b0e46c911..b4c333625 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -75,7 +75,7 @@ agentv eval --dry-run evals/my-eval.yaml ``` :::note -Dry-run returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic. +Dry-run returns mock responses that don't match grader output schemas. Use it only for testing harness flow, not grader logic. ::: ### Custom Output Directory @@ -163,7 +163,7 @@ Each eval test case produces a trace with: - **LLM call spans** (`chat `) — model name, token usage (input/output/cached) - **Tool call spans** (`execute_tool `) — tool name, arguments, results (with `--otel-capture-content`) - **Turn spans** (`agentv.turn.N`) — groups messages by conversation turn (with `--otel-group-turns`) -- **Evaluator events** — per-grader scores attached to the root span +- **Grader events** — per-grader scores attached to the root span :::tip[Claude provider + trace-claude-code plugin] When using the Claude provider, AgentV injects `CC_PARENT_SPAN_ID` and `CC_ROOT_SPAN_ID` into the Claude subprocess. If the [trace-claude-code](https://github.com/braintrustdata/braintrust-claude-plugin) plugin is installed, it attaches Claude Code CLI-level tool spans (Read, Write, Bash, etc.) as children of the AgentV eval trace, giving you full visibility into both the eval framework and the agent's internal actions. @@ -331,14 +331,14 @@ This is the same interface that agent-orchestrated evals use — the EVAL.yaml t ## Offline Grading -Grade existing agent sessions without re-running them. Import a transcript, then run deterministic evaluators: +Grade existing agent sessions without re-running them. Import a transcript, then run deterministic graders: ```bash # List sessions and import one agentv import claude --list agentv import claude --session-id -# Run evaluators against the imported transcript +# Run graders against the imported transcript agentv eval evals/my-eval.yaml --transcript .agentv/transcripts/claude-.jsonl ``` diff --git a/apps/web/src/content/docs/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/docs/evaluation/sdk.mdx index 3bddcbb7c..6f92029ae 100644 --- a/apps/web/src/content/docs/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/docs/evaluation/sdk.mdx @@ -90,7 +90,7 @@ export default defineCodeGrader(({ trace, outputText }) => ({ `defineCodeGrader` graders are referenced in YAML with `type: code-grader` and `command: [bun, run, grader.ts]`. `defineAssertion` uses convention-based discovery instead — just place in `.agentv/assertions/` and reference by name. 
-For detailed patterns, input/output contracts, and language-agnostic examples, see [Code Graders](/docs/evaluators/code-graders/). +For detailed patterns, input/output contracts, and language-agnostic examples, see [Code Graders](/docs/graders/code-graders/). ## Programmatic API diff --git a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx index ec1fcd2ec..6a32e5a6b 100644 --- a/apps/web/src/content/docs/docs/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/docs/getting-started/quickstart.mdx @@ -72,5 +72,5 @@ Results appear in `.agentv/results/runs//index.jsonl` with scores, re - Learn about [eval file formats](/docs/evaluation/eval-files/) - Configure [targets](/docs/targets/configuration/) for different providers -- Create [custom evaluators](/docs/evaluators/custom-evaluators/) +- Create [custom graders](/docs/graders/custom-graders/) - If setup drifts, rerun: `agentv init` diff --git a/apps/web/src/content/docs/docs/evaluators/code-graders.mdx b/apps/web/src/content/docs/docs/graders/code-graders.mdx similarity index 100% rename from apps/web/src/content/docs/docs/evaluators/code-graders.mdx rename to apps/web/src/content/docs/docs/graders/code-graders.mdx diff --git a/apps/web/src/content/docs/docs/evaluators/composite.mdx b/apps/web/src/content/docs/docs/graders/composite.mdx similarity index 74% rename from apps/web/src/content/docs/docs/evaluators/composite.mdx rename to apps/web/src/content/docs/docs/graders/composite.mdx index 38e249f58..86c1c4413 100644 --- a/apps/web/src/content/docs/docs/evaluators/composite.mdx +++ b/apps/web/src/content/docs/docs/graders/composite.mdx @@ -1,15 +1,15 @@ --- -title: Composite Evaluators -description: Combine multiple evaluators with aggregation strategies for multi-criteria evaluation. +title: Composite Graders +description: Combine multiple graders with aggregation strategies for multi-criteria evaluation. sidebar: order: 4 --- -Composite evaluators combine multiple evaluators and aggregate their results into a single score. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution. +Composite graders combine multiple graders and aggregate their results into a single score. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution. ## Basic Structure -A composite evaluator wraps two or more sub-evaluators and an aggregator that determines the final score: +A composite grader wraps two or more sub-graders and an aggregator that determines the final score: ```yaml assertions: @@ -29,10 +29,10 @@ assertions: evaluator_2: 0.4 ``` -Each sub-evaluator runs independently, then the aggregator combines their results. -Use `assertions` for composite members. `evaluators` is still accepted for backward compatibility. +Each sub-grader runs independently, then the aggregator combines their results. +Use `assertions` for composite members. `evaluators` is still accepted for backward compatibility. -If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across evaluators. Use `composite` when you need a custom aggregation strategy (`threshold`, `code_grader`, `llm_grader`) or nested evaluator groups. +If you only need weighted-average aggregation, a plain test-level `assertions` list already computes a weighted mean across graders.
Use `composite` when you need a custom aggregation strategy (`threshold`, `code_grader`, `llm_grader`) or nested grader groups. ## Aggregator Types @@ -48,7 +48,7 @@ aggregator: quality: 0.7 # 70% weight ``` -If weights are omitted, all evaluators receive equal weight (1.0). +If weights are omitted, all graders receive equal weight (1.0). This is equivalent to averaging all member scores. The score is calculated as: @@ -59,16 +59,16 @@ final_score = sum(score_i * weight_i) / sum(weight_i) ### Code Grader Aggregator -Run a custom command to decide the final score based on all evaluator results: +Run a custom command to decide the final score based on all grader results: ```yaml aggregator: type: code-grader path: node ./scripts/safety-gate.js - cwd: ./evaluators # optional working directory + cwd: ./graders # optional working directory ``` -The command receives the evaluator results on stdin and must print a result to stdout. +The command receives the grader results on stdin and must print a result to stdout. **Input (stdin):** ```json @@ -92,7 +92,7 @@ The command receives the evaluator results on stdin and must print a result to s ### LLM Grader Aggregator -Use an LLM to resolve conflicts or make nuanced decisions across evaluator results: +Use an LLM to resolve conflicts or make nuanced decisions across grader results: ```yaml aggregator: @@ -100,7 +100,7 @@ aggregator: prompt: ./prompts/conflict-resolution.md ``` -Inside the prompt file, use the `{{EVALUATOR_RESULTS_JSON}}` variable to inject the JSON results from all child evaluators. +Inside the prompt file, use the `{{EVALUATOR_RESULTS_JSON}}` variable to inject the JSON results from all child graders. ## Patterns @@ -130,7 +130,7 @@ tests: path: ./scripts/safety-gate.js ``` -The `safety-gate.js` command can return a score of 0.0 whenever the safety evaluator fails, regardless of the quality score. +The `safety-gate.js` command can return a score of 0.0 whenever the safety grader fails, regardless of the quality score. ### Multi-Criteria Weighted @@ -191,7 +191,7 @@ Composites can contain other composites for hierarchical evaluation: ## Result Structure -Composite evaluators return nested `scores`, giving full visibility into each sub-evaluator: +Composite graders return nested `scores`, giving full visibility into each sub-grader: ```json { @@ -227,12 +227,12 @@ Composite evaluators return nested `scores`, giving full visibility into each su } ``` -Assertions from sub-evaluators are prefixed with the evaluator name (e.g., `[safety]`) in the top-level `assertions` array. +Assertions from sub-graders are prefixed with the grader name (e.g., `[safety]`) in the top-level `assertions` array. ## Best Practices -1. **Name evaluators clearly** -- names appear in results and debugging output, so use descriptive labels like `safety` or `correctness` rather than `eval_1`. +1. **Name graders clearly** -- names appear in results and debugging output, so use descriptive labels like `safety` or `correctness` rather than `eval_1`. 2. **Use safety gates for critical checks** -- do not let high quality scores override safety failures. A code grader aggregator can enforce hard gates. 3. **Balance weights thoughtfully** -- consider which aspects matter most for your use case and assign weights accordingly. 4. **Keep nesting shallow** -- deep nesting makes debugging harder. Two levels of composites is usually sufficient. -5. **Test aggregators independently** -- verify custom aggregation logic with unit tests before wiring it into a composite evaluator. 
+5. **Test aggregators independently** -- verify custom aggregation logic with unit tests before wiring it into a composite grader. diff --git a/apps/web/src/content/docs/docs/evaluators/custom-assertions.mdx b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx similarity index 96% rename from apps/web/src/content/docs/docs/evaluators/custom-assertions.mdx rename to apps/web/src/content/docs/docs/graders/custom-assertions.mdx index 5987a0dae..7879b3fae 100644 --- a/apps/web/src/content/docs/docs/evaluators/custom-assertions.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-assertions.mdx @@ -18,7 +18,7 @@ AgentV provides two SDK functions for custom evaluation logic: **Use `defineAssertion()`** when you want a named assertion type that can be referenced across eval files without specifying a command path. It uses a simplified result contract focused on `pass` and optional `score`. -**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the evaluator is a one-off grader tied to a specific eval. See [Code Graders](/docs/evaluators/code-graders/) for details. +**Use `defineCodeGrader()`** when you need full control over scoring with explicit `assertions` arrays, or when the grader is a one-off grader tied to a specific eval. See [Code Graders](/docs/graders/code-graders/) for details. Both functions handle stdin/stdout JSON parsing, snake_case-to-camelCase conversion, Zod validation, and error handling automatically. @@ -243,4 +243,4 @@ npm install @agentv/eval agentv eval evals/dataset.eval.yaml ``` -Each test produces scores from both the built-in `contains` assertion and your custom `word-count` assertion. Results appear in the output JSONL with each evaluator's score in the `scores[]` array. +Each test produces scores from both the built-in `contains` assertion and your custom `word-count` assertion. Results appear in the output JSONL with each grader's score in the `scores[]` array. diff --git a/apps/web/src/content/docs/docs/evaluators/custom-evaluators.mdx b/apps/web/src/content/docs/docs/graders/custom-graders.mdx similarity index 68% rename from apps/web/src/content/docs/docs/evaluators/custom-evaluators.mdx rename to apps/web/src/content/docs/docs/graders/custom-graders.mdx index b7c847acf..8c958eba0 100644 --- a/apps/web/src/content/docs/docs/evaluators/custom-evaluators.mdx +++ b/apps/web/src/content/docs/docs/graders/custom-graders.mdx @@ -1,23 +1,23 @@ --- -title: Custom Evaluators +title: Custom Graders description: Patterns for building custom evaluation logic sidebar: order: 3 --- -AgentV supports multiple evaluator types that can be combined for comprehensive evaluation. +AgentV supports multiple grader types that can be combined for comprehensive evaluation. 
-## Evaluator Types +## Grader Types | Type | Description | Use Case | |------|-------------|----------| | `code_grader` | Deterministic command (Python/TS/any) | Exact matching, format validation, programmatic checks | | `llm_grader` | LLM-based evaluation with custom prompt | Semantic evaluation, nuance, subjective quality | -| `rubrics` | Structured rubric evaluator via `assertions` | Multi-criterion grading with weights | +| `rubrics` | Structured rubric grader via `assertions` | Multi-criterion grading with weights | -## Referencing Evaluators +## Referencing Graders -Evaluators are configured using `assertions` — either top-level (applies to all tests) or per-test: +Graders are configured using `assertions` — either top-level (applies to all tests) or per-test: ### Top-Level (Default for All Tests) @@ -30,7 +30,7 @@ assertions: tests: - id: test-1 - # Uses the top-level evaluator + # Uses the top-level grader ... ``` @@ -47,9 +47,9 @@ tests: command: [./validators/check_json.py] ``` -## Combining Evaluators +## Combining Graders -Use multiple evaluators on the same case for comprehensive scoring: +Use multiple graders on the same case for comprehensive scoring: ```yaml tests: @@ -70,21 +70,21 @@ tests: prompt: ./graders/code_quality.md ``` -Each evaluator produces its own score. Results appear in `scores[]` in the output JSONL. +Each grader produces its own score. Results appear in `scores[]` in the output JSONL. -For multiple evaluators in `assertions`, the test score is the weighted mean: +For multiple graders in `assertions`, the test score is the weighted mean: ``` final_score = sum(score_i * weight_i) / sum(weight_i) ``` If `weight` is omitted, it defaults to `1.0` (equal weighting). -If any evaluator has `required: true` (or `required: `) and scores below its required threshold, the overall test score is forced to `0`. +If any grader has `required: true` (or `required: `) and scores below its required threshold, the overall test score is forced to `0`. ## Best Practices - **Use code graders for deterministic checks** — exact value matching, format validation, schema compliance - **Use LLM graders for semantic evaluation** — meaning, quality, helpfulness - **Use rubrics for structured multi-criteria grading** — when you need weighted, itemized scoring -- **Combine evaluator types** for comprehensive coverage +- **Combine grader types** for comprehensive coverage - **Test code graders locally** before running full evaluations diff --git a/apps/web/src/content/docs/docs/evaluators/execution-metrics.mdx b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx similarity index 88% rename from apps/web/src/content/docs/docs/evaluators/execution-metrics.mdx rename to apps/web/src/content/docs/docs/graders/execution-metrics.mdx index 9e0affa6c..7b12d3f69 100644 --- a/apps/web/src/content/docs/docs/evaluators/execution-metrics.mdx +++ b/apps/web/src/content/docs/docs/graders/execution-metrics.mdx @@ -5,11 +5,11 @@ sidebar: order: 5 --- -AgentV provides built-in evaluators for checking execution metrics against thresholds. These are useful for enforcing efficiency constraints without writing custom code. +AgentV provides built-in graders for checking execution metrics against thresholds. These are useful for enforcing efficiency constraints without writing custom code. ## execution_metrics -The `execution_metrics` evaluator provides declarative threshold-based checks on multiple metrics in a single evaluator. 
+The `execution_metrics` grader provides declarative threshold-based checks on multiple metrics in a single grader. ```yaml assertions: @@ -72,9 +72,9 @@ assertions: exploration_tolerance: 0.2 # Allow ±20% variance ``` -## Single-Metric Evaluators +## Single-Metric Graders -For simple single-threshold checks, AgentV also provides dedicated evaluators: +For simple single-threshold checks, AgentV also provides dedicated graders: ### latency @@ -108,15 +108,15 @@ Fails if total token usage exceeds the threshold. ## When to Use Each -| Scenario | Recommended Evaluator | +| Scenario | Recommended Grader | |----------|----------------------| | Check multiple metrics at once | `execution_metrics` | | Simple single-threshold check | `latency`, `cost`, or `token_usage` | | Complex custom formulas | `code_grader` with custom command | -## Combining with Other Evaluators +## Combining with Other Graders -Execution metrics work well alongside semantic evaluators: +Execution metrics work well alongside semantic graders: ```yaml tests: diff --git a/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx similarity index 99% rename from apps/web/src/content/docs/docs/evaluators/llm-graders.mdx rename to apps/web/src/content/docs/docs/graders/llm-graders.mdx index 12f54ffe3..3f9cd969c 100644 --- a/apps/web/src/content/docs/docs/evaluators/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -171,7 +171,7 @@ tests: Resolution order: -- per-evaluator `preprocessors` override suite-level entries +- per-grader `preprocessors` override suite-level entries - if no preprocessor matches, AgentV falls back to a UTF-8 text read - if the fallback read looks binary or invalid, the grader receives a warning note instead of failing the test run diff --git a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx b/apps/web/src/content/docs/docs/graders/structured-data.mdx similarity index 81% rename from apps/web/src/content/docs/docs/evaluators/structured-data.mdx rename to apps/web/src/content/docs/docs/graders/structured-data.mdx index b6d8e0f23..41af50f5a 100644 --- a/apps/web/src/content/docs/docs/evaluators/structured-data.mdx +++ b/apps/web/src/content/docs/docs/graders/structured-data.mdx @@ -1,11 +1,11 @@ --- -title: Structured Data & Metrics Evaluators -description: Built-in evaluators for JSON field comparison and performance gates (latency, cost, token usage). +title: Structured Data & Metrics Graders +description: Built-in graders for JSON field comparison and performance gates (latency, cost, token usage). sidebar: order: 6 --- -Built-in evaluators for grading structured outputs and gating on execution metrics: +Built-in graders for grading structured outputs and gating on execution metrics: - `field_accuracy` -- compare JSON fields against ground truth - `latency` -- gate on response time @@ -14,7 +14,7 @@ Built-in evaluators for grading structured outputs and gating on execution metri ## Ground Truth -Put the expected structured output in the test case `expected_output` (as an object or message array). Evaluators read expected values from there. +Put the expected structured output in the test case `expected_output` (as an object or message array). Graders read expected values from there. 
```yaml tests: @@ -54,7 +54,7 @@ assertions: | `date` | Compares dates after parsing | `formats` -- list of accepted date formats | | `numeric_tolerance` | Numeric compare within tolerance | `tolerance` -- absolute threshold; `relative: true` for relative tolerance | -For fuzzy string matching, use a `code_grader` evaluator (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`. +For fuzzy string matching, use a `code_grader` grader (e.g. Levenshtein distance) instead of adding a fuzzy mode to `field_accuracy`. ### Aggregation @@ -99,9 +99,9 @@ assertions: # max_output: 2000 ``` -## Combining with Composite Evaluators +## Combining with Composite Graders -Use a `composite` evaluator to produce a single "release gate" score from multiple checks: +Use a `composite` grader to produce a single "release gate" score from multiple checks: ```yaml assertions: diff --git a/apps/web/src/content/docs/docs/evaluators/tool-trajectory.mdx b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx similarity index 90% rename from apps/web/src/content/docs/docs/evaluators/tool-trajectory.mdx rename to apps/web/src/content/docs/docs/graders/tool-trajectory.mdx index 1950edfbd..5ce1a6568 100644 --- a/apps/web/src/content/docs/docs/evaluators/tool-trajectory.mdx +++ b/apps/web/src/content/docs/docs/graders/tool-trajectory.mdx @@ -1,11 +1,11 @@ --- -title: Tool Trajectory Evaluators +title: Tool Trajectory Graders description: Validate that agents use the right tools in the right order with argument matching and latency assertions. sidebar: order: 5 --- -Tool trajectory evaluators validate that an agent used the expected tools during execution. They work with trace data returned by agent providers (codex, vscode, cli with trace support). +Tool trajectory graders validate that an agent used the expected tools during execution. They work with trace data returned by agent providers (codex, vscode, cli with trace support). ## Modes @@ -127,7 +127,7 @@ Example: 3 expected tools with 2 latency assertions = 5 total assertion entries ## Trace Data Format -Tool trajectory evaluators require trace data from the agent provider. Providers return `output` containing `tool_calls`: +Tool trajectory graders require trace data from the agent provider. Providers return `output` containing `tool_calls`: ```json { @@ -151,7 +151,7 @@ Tool trajectory evaluators require trace data from the agent provider. Providers } ``` -The evaluator extracts tool calls from `output[].tool_calls[]`. The `tool` and `input` fields are required. Optional fields: +The grader extracts tool calls from `output[].tool_calls[]`. The `tool` and `input` fields are required. Optional fields: - `id` and `timestamp` — for debugging - `duration_ms` — required if using `max_duration_ms` latency assertions @@ -172,7 +172,7 @@ agentv eval evals/test.yaml --dump-traces agentv eval evals/test.yaml --include-trace ``` -Use `--dump-traces` to inspect actual traces and understand agent behavior before writing evaluators. +Use `--dump-traces` to inspect actual traces and understand agent behavior before writing graders. ## Complete Examples @@ -255,7 +255,7 @@ tests: ## Best Practices 1. **Start with `any_order`**, then tighten to `in_order` or `exact` as needed. -2. **Combine with other evaluators** — use tool trajectory for execution validation and LLM graders for output quality. -3. **Inspect traces first** with `--dump-traces` to understand agent behavior before writing evaluators. +2. 
**Combine with other graders** — use tool trajectory for execution validation and LLM graders for output quality. +3. **Inspect traces first** with `--dump-traces` to understand agent behavior before writing graders. 4. **Use generous latency thresholds** to avoid flaky tests from timing variance. 5. **Use code graders for custom validation** — write custom tool validation scripts when built-in modes are insufficient. diff --git a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx index 84b4c6311..f4058dcf6 100644 --- a/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/docs/guides/agent-eval-layers.mdx @@ -1,11 +1,11 @@ --- title: Agent Evaluation Layers -description: A four-layer taxonomy for evaluating AI agents — Reasoning, Action, End-to-End, and Safety — mapped to AgentV evaluators. +description: A four-layer taxonomy for evaluating AI agents — Reasoning, Action, End-to-End, and Safety — mapped to AgentV graders. sidebar: order: 1 --- -A practical taxonomy for structuring agent evaluations. Each layer targets a different dimension of agent behavior, and maps directly to AgentV evaluators you can drop into an `EVAL.yaml`. +A practical taxonomy for structuring agent evaluations. Each layer targets a different dimension of agent behavior, and maps directly to AgentV graders you can drop into an `EVAL.yaml`. ## Layer 1: Reasoning @@ -13,7 +13,7 @@ A practical taxonomy for structuring agent evaluations. Each layer targets a dif Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based graders that inspect the agent's reasoning trace. -| Concern | AgentV evaluator | +| Concern | AgentV grader | |---------|-----------------| | Plan quality & coherence | `rubrics` | | Workspace-aware auditing | `rubrics` with `required: true` criteria | @@ -38,7 +38,7 @@ assertions: Covers tool call correctness, argument validity, execution path, and redundancy. Use trajectory validators and execution metrics for deterministic checks. -| Concern | AgentV evaluator | +| Concern | AgentV grader | |---------|-----------------| | Tool sequence | `tool_trajectory` (`in_order`, `exact`) | | Minimum tool usage | `tool_trajectory` (`any_order`) | @@ -70,7 +70,7 @@ assertions: Covers task completion, output correctness, step efficiency, latency, and cost. Combine outcome-focused graders with deterministic assertions and execution budgets. -| Concern | AgentV evaluator | +| Concern | AgentV grader | |---------|-----------------| | Output correctness | `rubrics`, `equals`, `contains`, `regex` | | Structured data accuracy | `field_accuracy` | @@ -100,11 +100,11 @@ assertions: Covers prompt injection resilience, policy adherence, bias, and content safety. Use the `negate` flag to assert that unsafe behaviors do **not** occur. 
-| Concern | AgentV evaluator | +| Concern | AgentV grader | |---------|-----------------| | Content safety | `rubrics` | | Policy enforcement | `code_grader` with policy command | -| "Must NOT" assertions | Any evaluator with `negate: true` | +| "Must NOT" assertions | Any grader with `negate: true` | ```yaml # Layer 4: Safety — verify the agent doesn't do harmful things diff --git a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx index d93639150..846915036 100644 --- a/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx +++ b/apps/web/src/content/docs/docs/guides/agent-skills-evals.mdx @@ -15,7 +15,7 @@ AgentV natively supports `evals.json`. You can run Agent Skills evals directly: agentv eval evals.json --target claude ``` -When you need AgentV's power features (deterministic evaluators, composite scoring, multi-turn conversations, workspace isolation), you can graduate to EVAL.yaml. +When you need AgentV's power features (deterministic graders, composite scoring, multi-turn conversations, workspace isolation), you can graduate to EVAL.yaml. ## Quick start @@ -81,14 +81,14 @@ AgentV resolves these paths and copies the files into the workspace before the a ## Offline grading (no API keys) -Grade existing agent sessions offline using `agentv import` to convert transcripts, then run deterministic evaluators: +Grade existing agent sessions offline using `agentv import` to convert transcripts, then run deterministic graders: ```bash # Import a Claude Code session transcript agentv import claude --list agentv import claude --session-id -# Run deterministic evaluators against the imported transcript +# Run deterministic graders against the imported transcript agentv eval evals.json --target copilot-log ``` @@ -145,10 +145,10 @@ The generated YAML includes comments about available AgentV features you can use ```yaml # Converted from Agent Skills evals.json # AgentV features you can add: -# - type: is_json, contains, regex for deterministic evaluators +# - type: is_json, contains, regex for deterministic graders # - type: code-grader for custom scoring scripts # - Multi-turn conversations via input message arrays -# - Composite evaluators with weighted scoring +# - Composite graders with weighted scoring # - Workspace isolation with repos and hooks tests: @@ -185,8 +185,8 @@ Use `evals.json` when: Switch to EVAL.yaml when you need: -- **Deterministic evaluators**: `contains`, `regex`, `equals`, `is-json` — faster and cheaper than LLM graders -- **Composite scoring**: Weighted evaluators with custom aggregation +- **Deterministic graders**: `contains`, `regex`, `equals`, `is-json` — faster and cheaper than LLM graders +- **Composite scoring**: Weighted graders with custom aggregation - **Multi-turn conversations**: Multi-message input sequences - **Workspace isolation**: Sandboxed file systems per test case - **Tool trajectory evaluation**: Assert on the sequence of tool calls diff --git a/apps/web/src/content/docs/docs/guides/autoevals-integration.mdx b/apps/web/src/content/docs/docs/guides/autoevals-integration.mdx index a0f012062..cb5401419 100644 --- a/apps/web/src/content/docs/docs/guides/autoevals-integration.mdx +++ b/apps/web/src/content/docs/docs/guides/autoevals-integration.mdx @@ -1,6 +1,6 @@ --- title: Autoevals Integration -description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as code_grader evaluators in AgentV. 
+description: Use Braintrust's open-source autoevals scorers (Factuality, Faithfulness, etc.) as code_grader graders in AgentV. sidebar: order: 2 --- @@ -13,7 +13,7 @@ sidebar: - Works standalone — no Braintrust platform account required - Uses any OpenAI-compatible endpoint for LLM-based scorers -- Integrates with AgentV via `code_grader` evaluator type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout +- Integrates with AgentV via `code_grader` grader type: wrap any autoevals scorer in a command that reads stdin and writes the AgentV grader result to stdout ## Installation @@ -125,8 +125,8 @@ from autoevals import Faithfulness data = json.load(sys.stdin) -evaluator = Faithfulness() -result = evaluator( +grader = Faithfulness() +result = grader( input=data.get("input_text", ""), output=data.get("output_text", ""), expected=data.get("reference_answer", ""), diff --git a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx index a251af482..c97c46ece 100644 --- a/apps/web/src/content/docs/docs/guides/evaluation-types.mdx +++ b/apps/web/src/content/docs/docs/guides/evaluation-types.mdx @@ -64,7 +64,7 @@ AgentV's eval tooling is designed for **execution quality**: - **`EVAL.yaml`** — define test cases with inputs, expected outputs, and assertions - **`evals.json`** — lightweight skill evaluation format (prompt/expected-output pairs) - **`agentv eval`** — execute evaluations and collect results -- **Evaluators** — `llm-grader`, `code-grader`, `tool-trajectory`, `rubrics`, `contains`, `regex`, and others all measure execution behavior +- **Graders** — `llm-grader`, `code-grader`, `tool-trajectory`, `rubrics`, `contains`, `regex`, and others all measure execution behavior These tools assume the skill is already loaded and invoked. They measure what happens *after* routing, not the routing decision itself. diff --git a/apps/web/src/content/docs/docs/guides/human-review.mdx b/apps/web/src/content/docs/docs/guides/human-review.mdx index 018aca2a0..339011358 100644 --- a/apps/web/src/content/docs/docs/guides/human-review.mdx +++ b/apps/web/src/content/docs/docs/guides/human-review.mdx @@ -5,7 +5,7 @@ sidebar: order: 6 --- -Human review sits between automated scoring and the next iteration. Automated evaluators catch regressions and enforce thresholds, but a human reviewer spots score-behavior mismatches, qualitative regressions, and cases where a grader is too strict or too lenient. +Human review sits between automated scoring and the next iteration. Automated graders catch regressions and enforce thresholds, but a human reviewer spots score-behavior mismatches, qualitative regressions, and cases where a grader is too strict or too lenient. ## When to review @@ -14,7 +14,7 @@ Review after every eval run where you plan to iterate on the skill or agent. The 1. **Run evals** — `agentv eval EVAL.yaml` or `agentv eval evals.json` 2. **Inspect results** — open the HTML report or scan the results JSONL 3. **Write feedback** — create `feedback.json` alongside the results -4. **Iterate** — use the feedback to guide prompt changes, evaluator tuning, or test case additions +4. **Iterate** — use the feedback to guide prompt changes, grader tuning, or test case additions 5. **Re-run** — verify improvements in the next eval run Skip the review step for routine CI gate runs where you only need pass/fail. @@ -27,7 +27,7 @@ Skip the review step for routine CI gate runs where you only need pass/fail. 
| **False positive** | A `contains` check passes on a coincidental substring match | | **False negative** | An LLM grader penalizes a correct answer that uses different phrasing | | **Qualitative regression** | Scores stay the same but tone, formatting, or helpfulness degrades | -| **Evaluator miscalibration** | A code grader is too strict on whitespace; a rubric is too lenient on accuracy | +| **Grader miscalibration** | A code grader is too strict on whitespace; a rubric is too lenient on accuracy | | **Flaky results** | The same test produces wildly different scores across runs | ## How to review @@ -118,7 +118,7 @@ The `feedback.json` file is a structured annotation of a single eval run. It rec | `test_id` | `string` | yes | Matches the test `id` from the eval file | | `verdict` | `enum` | yes | One of: `acceptable`, `needs_improvement`, `incorrect`, `flaky` | | `notes` | `string` | no | Free-form reviewer notes | -| `evaluator_overrides` | `object` | no | Keyed by evaluator name — reviewer annotations on specific evaluator results | +| `evaluator_overrides` | `object` | no | Keyed by grader name — reviewer annotations on specific grader results | | `workspace_notes` | `string` | no | Notes about workspace state (relevant for workspace evaluations) | ### Verdict values @@ -130,9 +130,9 @@ The `feedback.json` file is a structured annotation of a single eval run. It rec | `incorrect` | The output is wrong, regardless of what the automated score says | | `flaky` | Results are inconsistent across runs — investigate non-determinism | -### Evaluator overrides (workspace evaluations) +### Grader overrides (workspace evaluations) -For workspace evaluations with multiple evaluators (code graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific evaluator results: +For workspace evaluations with multiple graders (code graders, LLM graders, tool trajectory checks), the `evaluator_overrides` field lets the reviewer annotate specific grader results: ```json { @@ -147,7 +147,7 @@ For workspace evaluations with multiple evaluators (code graders, LLM graders, t } ``` -Keys use the format `evaluator-type:evaluator-name` to match the evaluators defined in `assertions` blocks. +Keys use the format `grader-type:grader-name` to match the graders defined in `assertions` blocks. ## Storing feedback across iterations @@ -181,7 +181,7 @@ Define tests (EVAL.yaml / evals.json) ↓ Write feedback.json ↓ - Tune prompts / evaluators / test cases + Tune prompts / graders / test cases ↓ Re-run evals ↓ diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx index 00ebd4fbb..6cc7bddc6 100644 --- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx @@ -126,11 +126,11 @@ Or grade existing sessions offline (no API keys required): agentv import claude --list agentv import claude --session-id -# Run deterministic evaluators against the imported transcript +# Run deterministic graders against the imported transcript agentv eval evals.json --target copilot-log ``` -Offline grading is useful when you want to evaluate skills with agents that don't have a direct API integration — import the session transcript and run deterministic evaluators. 
+Offline grading is useful when you want to evaluate skills with agents that don't have a direct API integration — import the session transcript and run deterministic graders. ## Step 4: Compare Results @@ -239,7 +239,7 @@ tests: ``` After converting, you can: -- Replace `llm-grader` assertions with faster deterministic evaluators (`contains`, `regex`, `equals`) +- Replace `llm-grader` assertions with faster deterministic graders (`contains`, `regex`, `equals`) - Add `workspace` configuration for file-system isolation - Use `code-grader` for custom scoring logic - Define `tool-trajectory` assertions to check tool usage patterns @@ -329,7 +329,7 @@ The optimizer uses the same core loop described in this guide but automates the Its bundled scripts map directly onto the workflow stages: - `run-eval.ts` and `compare-runs.ts` run and compare evaluations while still delegating to `agentv` -- `run-loop.ts` repeats the evaluation loop without moving evaluator logic into the script layer +- `run-loop.ts` repeats the evaluation loop without moving grader logic into the script layer - `aggregate-benchmark.ts` and `generate-report.ts` summarize AgentV artifacts into review-friendly output - `improve-description.ts` proposes follow-up description experiments once execution quality is stable diff --git a/apps/web/src/content/docs/docs/index.mdx b/apps/web/src/content/docs/docs/index.mdx index 895d15adf..a2a2e3226 100644 --- a/apps/web/src/content/docs/docs/index.mdx +++ b/apps/web/src/content/docs/docs/index.mdx @@ -15,7 +15,7 @@ AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents lo - **No server** — just install and run - **Version-controlled** — YAML evaluation files live in Git alongside your code - **CI/CD ready** — run evaluations in your pipeline without external API calls -- **Multiple evaluator types** — code validators, LLM graders, custom Python/TypeScript +- **Multiple grader types** — code validators, LLM graders, custom Python/TypeScript ## How AgentV Compares @@ -27,7 +27,7 @@ AgentV is a CLI-first AI agent evaluation framework. It evaluates your agents lo | **CLI-first** | Yes | No | Limited | Limited | | **CI/CD ready** | Yes | Requires API calls | Requires API calls | Requires API calls | | **Version control** | Yes (YAML in Git) | No | No | No | -| **Evaluators** | Code + LLM + Custom | LLM only | LLM + Code | LLM only | +| **Graders** | Code + LLM + Custom | LLM only | LLM + Code | LLM only | ## Core Concepts @@ -38,14 +38,14 @@ AgentV is a CLI-first AI agent evaluation framework. 
It evaluates your agents lo - **Eval files** — YAML or JSONL definitions of test cases - **Tests** — Individual test entries with input messages and expected outcomes - **Targets** — The agent or LLM provider being evaluated -- **Evaluators** — Code graders (Python/TypeScript) or LLM graders that score responses +- **Graders** — Code graders (Python/TypeScript) or LLM graders that score responses - **Rubrics** — Structured criteria with weights for grading - **Results** — JSONL output with scores, reasoning, and execution traces ## Features - **Multi-objective scoring**: Correctness, latency, cost, safety in one run -- **Multiple evaluator types**: Code validators, LLM graders, custom Python/TypeScript +- **Multiple grader types**: Code validators, LLM graders, custom Python/TypeScript - **Built-in targets**: VS Code Copilot, Codex CLI, Pi Coding Agent, Azure OpenAI, local CLI agents - **Structured evaluation**: Rubric-based grading with weights and requirements - **Batch evaluation**: Run hundreds of test cases in parallel diff --git a/apps/web/src/content/docs/docs/reference/comparison.mdx b/apps/web/src/content/docs/docs/reference/comparison.mdx index 93a751c96..a354160c1 100644 --- a/apps/web/src/content/docs/docs/reference/comparison.mdx +++ b/apps/web/src/content/docs/docs/reference/comparison.mdx @@ -23,7 +23,7 @@ agentv eval evals/my-agent.yaml ### Agent Control — Govern -Runtime guardrails. Intercepts agent actions (tool calls, API requests) and evaluates them against configurable policies. Deny, steer, warn, or log — without changing agent code. Pluggable evaluators with confidence scoring. +Runtime guardrails. Intercepts agent actions (tool calls, API requests) and evaluates them against configurable policies. Deny, steer, warn, or log — without changing agent code. Pluggable graders with confidence scoring. ### Langfuse — Observe @@ -73,11 +73,11 @@ This maps to how traditional software works: **Agent Control** handles: - Runtime policy enforcement (deny/steer/warn/log) - Pre/post execution evaluation of agent actions -- Pluggable evaluators (regex, JSON, SQL, LLM-based) +- Pluggable graders (regex, JSON, SQL, LLM-based) - Centralized control plane with dashboard **Langfuse** handles: - Production tracing with agent-native observation types - Live evaluation automation on trace ingestion -- Score ingestion from external evaluators +- Score ingestion from external graders - Team dashboards and debugging diff --git a/apps/web/src/content/docs/docs/targets/coding-agents.mdx b/apps/web/src/content/docs/docs/targets/coding-agents.mdx index f8daba364..51eaab50c 100644 --- a/apps/web/src/content/docs/docs/targets/coding-agents.mdx +++ b/apps/web/src/content/docs/docs/targets/coding-agents.mdx @@ -5,7 +5,7 @@ sidebar: order: 3 --- -Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` (also accepts `judge_target` for backward compatibility) to run LLM-based evaluators. +Coding agent targets evaluate AI coding assistants and CLI-based agents. These targets require a `grader_target` (also accepts `judge_target` for backward compatibility) to run LLM-based graders. 
## Prompt format diff --git a/apps/web/src/content/docs/docs/targets/configuration.mdx b/apps/web/src/content/docs/docs/targets/configuration.mdx index 6b0d8aee6..92befdd63 100644 --- a/apps/web/src/content/docs/docs/targets/configuration.mdx +++ b/apps/web/src/content/docs/docs/targets/configuration.mdx @@ -79,7 +79,7 @@ tests: ## Grader Target -Agent targets that need LLM-based evaluation specify a `grader_target` (also accepts `judge_target` for backward compatibility) — the LLM used to run LLM grader evaluators: +Agent targets that need LLM-based evaluation specify a `grader_target` (also accepts `judge_target` for backward compatibility) — the LLM used to run LLM graders: ```yaml targets: diff --git a/apps/web/src/content/docs/docs/tools/convert.mdx b/apps/web/src/content/docs/docs/tools/convert.mdx index 24ad82814..6c40fd5b1 100644 --- a/apps/web/src/content/docs/docs/tools/convert.mdx +++ b/apps/web/src/content/docs/docs/tools/convert.mdx @@ -35,7 +35,7 @@ Converts an [Agent Skills `evals.json`](/docs/guides/agent-skills-evals) file in - Maps `prompt` → `input` message array - Maps `expected_output` → `expected_output` -- Maps `assertions` → `assertions` evaluators (llm-grader) +- Maps `assertions` → `assertions` graders (llm-grader) - Resolves `files[]` paths relative to the evals.json directory - Adds TODO comments for AgentV-specific features (workspace setup, code graders, rubrics) diff --git a/apps/web/src/content/docs/docs/tools/import.mdx b/apps/web/src/content/docs/docs/tools/import.mdx index 1948ba735..6cf90af38 100644 --- a/apps/web/src/content/docs/docs/tools/import.mdx +++ b/apps/web/src/content/docs/docs/tools/import.mdx @@ -133,7 +133,7 @@ Token usage is aggregated from the final cumulative value per LLM request. Durat ## Workflow -Import a session, then run evaluators against it: +Import a session, then run graders against it: ```bash # 1. List sessions and pick one agentv import claude --list # 2. Import a session by ID agentv import claude --session-id 4c4f9e4e-e6f1-490b-a1b1-9aef543ebf22 -# 3. Run evaluators against the imported transcript +# 3. Run graders against the imported transcript agentv eval evals/my-eval.yaml --transcript .agentv/transcripts/claude-4c4f9e4e.jsonl ``` diff --git a/apps/web/src/content/docs/docs/tools/studio.mdx b/apps/web/src/content/docs/docs/tools/studio.mdx index c67821a71..1df0b42a2 100644 --- a/apps/web/src/content/docs/docs/tools/studio.mdx +++ b/apps/web/src/content/docs/docs/tools/studio.mdx @@ -56,7 +56,7 @@ agentv studio .agentv/results/runs/2026-03-30T11-45-56-989Z - **Recent Runs** — table of all evaluation runs with source badge (`local` / `remote`), target, experiment, timestamp, test count, pass rate, and mean score - **Experiments** — group and compare runs by experiment name - **Targets** — group runs by target (model/agent) -- **Run Detail** — drill into a run to see per-test results, scores, and evaluator output +- **Run Detail** — drill into a run to see per-test results, scores, and grader output - **Human Review** — add feedback annotations to individual test results - **Analytics** — two modes: an aggregated experiment × target matrix, and a per-run view for selecting individual runs to compare side-by-side with optional retroactive tags.
Includes a collapsible charts section with baseline comparison analytics - **Remote Results** — sync and browse runs pushed from other machines or CI (see [Remote Results](#remote-results)) diff --git a/apps/web/src/content/docs/docs/tools/validate.mdx b/apps/web/src/content/docs/docs/tools/validate.mdx index c91c27aa6..f8a4f32d8 100644 --- a/apps/web/src/content/docs/docs/tools/validate.mdx +++ b/apps/web/src/content/docs/docs/tools/validate.mdx @@ -23,7 +23,7 @@ agentv validate evals/**/*.yaml - YAML/JSONL syntax - Required fields (id, input, criteria) -- Evaluator references (command paths, prompt files) +- Grader references (command paths, prompt files) - Target references match entries in `targets.yaml` - Rubric structure and field types