From dd891c14c412bcd9700a40b4a56976c24ae9e63d Mon Sep 17 00:00:00 2001 From: Christopher Date: Thu, 9 Apr 2026 06:05:45 +0000 Subject: [PATCH] docs: rename trace command to inspect --- .../docs/docs/evaluation/running-evals.mdx | 6 ++-- .../content/docs/docs/guides/human-review.mdx | 2 +- .../guides/skill-improvement-workflow.mdx | 2 +- .../docs/tools/{trace.mdx => inspect.mdx} | 22 ++++++------- .../agentv-dev/skills/agentv-bench/SKILL.md | 2 +- .../skills/agentv-eval-writer/SKILL.md | 2 +- .../skills/agentv-trace-analyst/SKILL.md | 32 +++++++++---------- 7 files changed, 34 insertions(+), 34 deletions(-) rename apps/web/src/content/docs/docs/tools/{trace.mdx => inspect.mdx} (80%) diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index b1a2ced9a..341b71d30 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -107,13 +107,13 @@ result-oriented workflows. For full-fidelity span inspection, export OTLP JSON e ```bash # Summary-level inspection from the run manifest -agentv trace stats .agentv/results/runs//index.jsonl +agentv inspect stats .agentv/results/runs//index.jsonl # Full-fidelity OTLP JSON trace (importable by OTel backends like Jaeger, Grafana) agentv eval evals/my-eval.yaml --otel-file traces/eval.otlp.json -# Inspect the OTLP trace export -agentv trace show traces/eval.otlp.json --tree +# Inspect the OTLP export +agentv inspect show traces/eval.otlp.json --tree ``` `index.jsonl` contains aggregate metrics such as score, latency, cost, token usage, and summary diff --git a/apps/web/src/content/docs/docs/guides/human-review.mdx b/apps/web/src/content/docs/docs/guides/human-review.mdx index ae737bd5f..018aca2a0 100644 --- a/apps/web/src/content/docs/docs/guides/human-review.mdx +++ b/apps/web/src/content/docs/docs/guides/human-review.mdx @@ -38,7 +38,7 @@ For workspace evaluations (EVAL.yaml), use the trace viewer: ```bash # View traces from a specific run -agentv trace show results/2026-03-14T10-32-00_claude/index.jsonl +agentv inspect show results/2026-03-14T10-32-00_claude/index.jsonl # View the HTML report (if generated via #562) open results/2026-03-14T10-32-00_claude/report.html diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx index 42496afbe..00ebd4fbb 100644 --- a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx @@ -156,7 +156,7 @@ Look for: Use trace inspection to understand why specific cases failed: ```bash -agentv trace show +agentv inspect show ``` When reviewing failures, categorize them: diff --git a/apps/web/src/content/docs/docs/tools/trace.mdx b/apps/web/src/content/docs/docs/tools/inspect.mdx similarity index 80% rename from apps/web/src/content/docs/docs/tools/trace.mdx rename to apps/web/src/content/docs/docs/tools/inspect.mdx index d4813c043..c7c66c9cf 100644 --- a/apps/web/src/content/docs/docs/tools/trace.mdx +++ b/apps/web/src/content/docs/docs/tools/inspect.mdx @@ -1,11 +1,11 @@ --- -title: Trace -description: Inspect and analyze evaluation traces from the CLI +title: Inspect +description: Inspect and analyze evaluation results from the CLI sidebar: order: 5 --- -The `trace` command provides headless trace inspection and analysis — no server or dashboard needed. +The `inspect` command provides headless trace inspection and analysis — no server or dashboard needed. Supported sources: @@ -17,22 +17,22 @@ For full tool-call inspection, prefer OTLP JSON exports over eval manifests. ## Subcommands -### `trace list` +### `inspect list` Enumerate canonical evaluation run workspaces from `.agentv/results/runs/`. ```bash -agentv trace list [--limit N] [--format json|table] +agentv inspect list [--limit N] [--format json|table] ``` Shows filename, test count, pass rate, average score, file size, and timestamp for each run workspace. -### `trace show` +### `inspect show` Display evaluation results with trace details. ```bash -agentv trace show [--test-id ] [--tree] [--format json|table] +agentv inspect show [--test-id ] [--tree] [--format json|table] ``` | Option | Description | @@ -58,12 +58,12 @@ Scores: response_quality 75% | routing_accuracy 100% Falls back to a flat summary when output messages are not present in the run workspace. -### `trace stats` +### `inspect stats` Compute summary statistics (percentiles) across evaluation results. ```bash -agentv trace stats [--group-by target|suite|test-id] [--format json|table] +agentv inspect stats [--group-by target|suite|test-id] [--format json|table] ``` | Option | Description | @@ -90,11 +90,11 @@ All commands support `--format json` for piping to `jq`: ```bash # Find tests costing more than $0.10 -agentv trace show trace.otlp.json --format json \ +agentv inspect show trace.otlp.json --format json \ | jq '[.[] | select(.cost_usd > 0.10) | {test_id, score, cost: .cost_usd}]' # Compare providers -agentv trace stats .agentv/results/runs//index.jsonl --group-by target --format json \ +agentv inspect stats .agentv/results/runs//index.jsonl --group-by target --format json \ | jq '.groups[] | {label, score_mean: .metrics.score.mean}' ``` diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 162b2dbd4..fe8952102 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -263,7 +263,7 @@ Read the JSONL results and look for: Use CLI tools for deeper investigation: ```bash -agentv trace # Detailed execution trace inspection +agentv inspect # Detailed execution trace inspection agentv compare # Structured diff between runs ``` diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 05d2e3403..90bdd697b 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -681,7 +681,7 @@ After running evals, perform a human review before iterating. Create `feedback.j Use `evaluator_overrides` for workspace evaluations to annotate specific grader results (e.g., "code-grader was too strict"). Use `workspace_notes` for observations about workspace state. -Review workflow: run evals → inspect results (`agentv trace show`) → write feedback → tune prompts/graders → re-run. +Review workflow: run evals → inspect results (`agentv inspect show`) → write feedback → tune prompts/graders → re-run. Full guide: https://agentv.dev/guides/human-review/ diff --git a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md index 2f75cedc2..5cfea496d 100644 --- a/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-trace-analyst/SKILL.md @@ -1,7 +1,7 @@ --- name: agentv-trace-analyst description: >- - Analyze AgentV evaluation traces and result JSONL files using `agentv trace` and `agentv compare` CLI commands. + Analyze AgentV evaluation traces and result JSONL files using `agentv inspect` and `agentv compare` CLI commands. Use when asked to inspect AgentV eval results, find regressions between AgentV evaluation runs, identify failure patterns in AgentV trace data, analyze tool trajectories, or compute cost/latency/score statistics from AgentV result files. @@ -11,19 +11,19 @@ description: >- # AgentV Trace Analyst -Analyze evaluation traces headlessly using `agentv trace` primitives and `jq`. +Analyze evaluation traces headlessly using `agentv inspect` primitives and `jq`. ## Primitives ```bash # List result files (most recent first) -agentv trace list [--limit N] [--format json|table] +agentv inspect list [--limit N] [--format json|table] # Show results with trace details -agentv trace show [--test-id ] [--tree] [--format json|table] +agentv inspect show [--test-id ] [--tree] [--format json|table] # Percentile statistics -agentv trace stats [--group-by target|suite|test-id] [--format json|table] +agentv inspect stats [--group-by target|suite|test-id] [--format json|table] # A/B comparison between runs agentv compare [--threshold 0.1] [--format json|table] @@ -34,7 +34,7 @@ agentv compare [--threshold 0.1] [--format js ### 1. Discover results ```bash -agentv trace list +agentv inspect list ``` Pick the result file to analyze. Most recent is first. @@ -42,7 +42,7 @@ Pick the result file to analyze. Most recent is first. ### 2. Get overview ```bash -agentv trace stats +agentv inspect stats ``` Read the percentile table. Key signals: @@ -54,7 +54,7 @@ Read the percentile table. Key signals: ### 3. Investigate failures ```bash -agentv trace show --format json | jq '[.[] | select(.score < 0.8) | {test_id, score, assertions: [.assertions[] | select(.passed | not)], trace: {tools: (.trace.tool_calls | keys)}, duration_ms, cost_usd}]' +agentv inspect show --format json | jq '[.[] | select(.score < 0.8) | {test_id, score, assertions: [.assertions[] | select(.passed | not)], trace: {tools: (.trace.tool_calls | keys)}, duration_ms, cost_usd}]' ``` For each failing test, examine: @@ -67,10 +67,10 @@ For each failing test, examine: ```bash # Flat view with trace summary -agentv trace show --test-id +agentv inspect show --test-id # Tree view (if output messages available) -agentv trace show --test-id --tree +agentv inspect show --test-id --tree ``` The tree view shows the agent's execution path — LLM calls interspersed with tool invocations. Look for: @@ -93,10 +93,10 @@ Look for: ```bash # By target provider -agentv trace stats --group-by target +agentv inspect stats --group-by target # By suite -agentv trace stats --group-by suite +agentv inspect stats --group-by suite ``` Compare providers side-by-side: which is cheaper, faster, more accurate? @@ -107,19 +107,19 @@ All commands support `--format json` for piping to `jq`: ```bash # Top 3 most expensive tests -agentv trace show --format json \ +agentv inspect show --format json \ | jq 'sort_by(-.cost_usd) | .[0:3] | .[] | {test_id, cost: .cost_usd, score}' # Tests where token usage exceeds 10k -agentv trace show --format json \ +agentv inspect show --format json \ | jq '[.[] | select(.token_usage.input + .token_usage.output > 10000) | {test_id, tokens: (.token_usage.input + .token_usage.output)}]' # Score distribution by suite -agentv trace show --format json \ +agentv inspect show --format json \ | jq 'group_by(.suite) | .[] | {suite: .[0].suite, count: length, avg_score: ([.[].score] | add / length)}' # Tool usage frequency across all tests -agentv trace show --format json \ +agentv inspect show --format json \ | jq '[.[].trace.tool_calls // {} | to_entries[]] | group_by(.key) | .[] | {tool: .[0].key, total_calls: ([.[].value] | add)}' # Find regressions > 0.1 between two runs