Merged
25 changes: 10 additions & 15 deletions AGENTS.md
@@ -334,7 +334,7 @@ Unit tests alone are insufficient for grader changes. After implementing or modi

4. **Update baseline files** if output format changes (e.g., type name renames). Baseline files live alongside eval YAML files as `*.baseline.jsonl` and contain expected `scores[].type` values. There are 30+ baseline files across `examples/`.

5. **Note:** `--dry-run` returns schema-valid mock responses (`{}` as output, zeroed `tokenUsage`). Built-in graders will not crash, but scores are meaningless. Use it for testing harness flow, not grader logic.
5. **Note:** `--dry-run` returns schema-valid mock responses for both agent output and grader evaluation (score=1, empty assertions/checks). Built-in LLM graders run without parse errors but scores are meaningless. Use it for end-to-end harness testing including grader plumbing.
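The new mock shape can be sanity-checked offline. A minimal sketch in plain TypeScript — hand-rolled required-key checks standing in for the real zod schemas in `llm-grader.ts`, with the key requirements taken from the schema comments elsewhere in this PR:

```typescript
// Hand-rolled checks, NOT the real zod schemas. Required keys per the PR's
// schema comments: freeform -> score; rubric -> checks + overall_reasoning;
// score-range -> checks.
const mock = JSON.parse(
  '{"score":1,"assertions":[],"checks":[],"overall_reasoning":"dry-run mock"}',
);

const satisfiesFreeform = typeof mock.score === 'number';
const satisfiesRubric =
  Array.isArray(mock.checks) && typeof mock.overall_reasoning === 'string';
const satisfiesScoreRange = Array.isArray(mock.checks);

console.log(satisfiesFreeform, satisfiesRubric, satisfiesScoreRange);
// → true true true
```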

### Completing Work — E2E Checklist

@@ -406,7 +406,7 @@ Types: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore`

When working on a GitHub issue, **ALWAYS** follow this workflow:

1. **Claim the issue** — prevents other agents from duplicating work:
1. **Claim the issue** — prevents other agents from duplicating work by stamping Agent ID and setting status on the project board:
```bash
# Load AGENT_ID from .env; if not set, ask the user or default to <harness>-<model>
# Harness = the coding tool (claude-code, opencode, codex-cli, cursor, etc.)
@@ -419,15 +419,13 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
echo "AGENT_ID is not set. Ask the user for an agent identifier, or default to devbox2-codex in this environment (otherwise use <harness>-<model>)."
fi

# Check if already claimed
gh issue view <number> --json labels --jq '.labels[].name' | grep -q "in-progress" && echo "SKIP — already claimed" && exit 1

# Claim it — label + project roadmap status
gh issue edit <number> --add-label "in-progress"
# Check if already claimed via project board status
ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <number> and .content.repository == "EntityProcess/agentv") | .id')
CURRENT_STATUS=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <number> and .content.repository == "EntityProcess/agentv") | .status')
[ "$CURRENT_STATUS" = "In Progress" ] && echo "SKIP — already claimed" && exit 1

# Update project roadmap: ensure the issue is on the AgentV OSS board,
# then set status to "In progress" and stamp Agent ID
ITEM_ID=$(gh project item-list 1 --owner EntityProcess --format json | jq -r '.items[] | select(.content.number == <number> and .content.repository == "EntityProcess/agentv") | .id')
# then set status to "In Progress" and stamp Agent ID
if [ -z "$ITEM_ID" ] || [ "$ITEM_ID" = "null" ]; then
ITEM_ID=$(gh project item-add 1 --owner EntityProcess --url "https://github.com/EntityProcess/agentv/issues/<number>" --format json | jq -r '.id')
fi
@@ -436,7 +434,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
gh project item-edit --project-id PVT_kwDOAIbbRc4BSmjF --id "$ITEM_ID" --field-id PVTF_lADOAIbbRc4BSmjFzhAHSnk --text "$AGENT_ID"
fi
```
If the issue has the `in-progress` label, **do not work on it** — pick a different issue.
If the issue has project board status "In Progress", **do not work on it** — pick a different issue.

2. **Update local `main` to the latest `origin/main`** before branching:
```bash
@@ -468,7 +466,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
1. Run unit tests.
2. Execute every test plan item from the issue/PR checklist, mark each `[x]`, and paste CLI output as evidence.
3. Manual red/green UAT with before/after evidence.
4. **After e2e passes**, spawn a final subagent code review pass and address or call out any findings. Do NOT run the code review before e2e — if e2e fails you'll need to fix it first, which invalidates the review.
4. **After e2e passes**, spawn a final subagent code review pass and address or call out any findings — **unless the change is focused** (single-responsibility, well-tested, no architectural impact), in which case this step may be skipped. Do NOT run the code review before e2e — if e2e fails you'll need to fix it first, which invalidates the review.
5. CI pipeline passes (all checks green).
6. No merge conflicts with `main`.

@@ -481,17 +479,14 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
- Remove the local worktree created for the issue
- Confirm the primary checkout is back on an up-to-date `main`

The `in-progress` label stays on the issue until the PR is merged and the issue is closed. Do not remove it manually.

**IMPORTANT:** Never push directly to `main`. Always use branches and PRs.

### Tracker Conventions

- The roadmap project is the source of truth for prioritization.
- The roadmap project is the source of truth for prioritization and claim status — use it, not labels.
- Issues in the roadmap are prioritized; issues outside it are not.
- `bug` marks defects.
- Issues without `bug` are non-bug work by default.
- `in-progress` marks an issue as claimed by an agent — do not start work on it.
- `core`, `wui`, and `tui` are area labels.
- Keep issue bodies focused on the handoff contract: objective, design latitude, acceptance signals, non-goals, and related links.
- Do not put priority metadata in issue bodies.
4 changes: 3 additions & 1 deletion apps/cli/src/commands/eval/run-eval.ts
@@ -589,7 +589,9 @@ async function prepareFileMetadata(params: {
name: `${targetDefinition.name}-dry-run`,
graderTarget: undefined,
config: {
response: '{"answer":"Mock dry-run response"}',
// Schema-valid grader response so --dry-run works end-to-end with LLM graders.
// Satisfies freeform (score), rubric (checks, overall_reasoning), and score-range (checks) without real LLM calls.
response: '{"score":1,"assertions":[],"checks":[],"overall_reasoning":"dry-run mock"}',
delayMs: options.dryRunDelay,
delayMinMs: options.dryRunDelayMin,
delayMaxMs: options.dryRunDelayMax,
15 changes: 13 additions & 2 deletions apps/cli/src/commands/eval/targets.ts
@@ -13,6 +13,17 @@ const ANSI_YELLOW = '\u001b[33m';
const ANSI_RED = '\u001b[31m';
const ANSI_RESET = '\u001b[0m';

/**
* Dry-run mock response: satisfies all LLM grader schemas (freeform, rubric, score-range)
* so that --dry-run works end-to-end including graders without real LLM calls.
*
* - freeformEvaluationSchema: "score" (required), "assertions" (optional)
* - rubricEvaluationSchema: "checks" (required), "overall_reasoning" (required)
* - scoreRangeEvaluationSchema: "checks" (required), "overall_reasoning" (optional)
*/
const DRY_RUN_MOCK_RESPONSE =
'{"score":1,"assertions":[],"checks":[],"overall_reasoning":"dry-run mock"}';

function isTTY(): boolean {
return process.stdout.isTTY ?? false;
}
@@ -183,7 +194,7 @@ export async function selectTarget(options: TargetSelectionOptions): Promise<Tar
name: `${targetDefinition.name}-dry-run`,
graderTarget: undefined,
config: {
response: '{"answer":"Mock dry-run response"}',
response: DRY_RUN_MOCK_RESPONSE,
delayMs: dryRunDelay,
delayMinMs: dryRunDelayMin,
delayMaxMs: dryRunDelayMax,
@@ -307,7 +318,7 @@ export async function selectMultipleTargets(
name: `${targetDefinition.name}-dry-run`,
graderTarget: undefined,
config: {
response: '{"answer":"Mock dry-run response"}',
response: DRY_RUN_MOCK_RESPONSE,
delayMs: dryRunDelay,
delayMinMs: dryRunDelayMin,
delayMaxMs: dryRunDelayMax,
14 changes: 14 additions & 0 deletions apps/cli/src/index.ts
@@ -1,4 +1,7 @@
import path from 'node:path';
import { loadConfig, runBeforeSessionHook } from '@agentv/core';
import { binary, run, subcommands } from 'cmd-ts';
import { findRepoRoot } from './commands/eval/shared.js';

import packageJson from '../package.json' with { type: 'json' };
import { compareCommand } from './commands/compare/index.js';
@@ -130,5 +133,16 @@ export async function runCli(argv: string[] = process.argv): Promise<void> {
});

const processedArgv = preprocessArgv(argv);

// Run before_session hook once at startup, before any command executes.
// Uses cwd as the search root for .agentv/config.yaml.
const cwd = process.cwd();
const repoRoot = await findRepoRoot(cwd);
const sessionConfig = await loadConfig(path.join(cwd, '_'), repoRoot);
const beforeSessionCommand = sessionConfig?.hooks?.before_session;
if (beforeSessionCommand) {
runBeforeSessionHook(beforeSessionCommand);
}

await run(binary(app), processedArgv);
}
39 changes: 39 additions & 0 deletions docs/plans/pre-run-hook.md
@@ -0,0 +1,39 @@
# Plan: hooks.pre_run — pre-eval environment injection

Issue: #1149

## Problem

Users who fetch secrets at runtime (e.g. from Azure Key Vault, AWS Secrets Manager) must wrap the
`agentv` CLI in a project-level script. A generic `hooks.pre_run` config option removes this need.

## Implementation

### Files changed

| File | Change |
|------|--------|
| `packages/core/src/evaluation/hooks.ts` | New — `parseEnvOutput()` and `runPreRunHook()` |
| `packages/core/src/evaluation/config.ts` | Add `hooks.preRun` to Zod schema + TS type |
| `packages/core/src/evaluation/loaders/config-loader.ts` | Add `HooksConfig` type + `parseHooksConfig()` + wire into `loadConfig()` |
| `packages/core/src/index.ts` | Export `runPreRunHook`, `parseEnvOutput` |
| `apps/cli/src/commands/eval/run-eval.ts` | Import `runPreRunHook`; call before `normalizeOptions` |
| `plugins/agentv-dev/skills/agentv-eval-writer/references/config-schema.json` | Add `hooks.pre_run` to JSON schema |
| `packages/core/test/evaluation/hooks.test.ts` | New — unit tests for `parseEnvOutput` |

### Precedence

YAML config (`hooks.pre_run`) takes precedence over TS config (`hooks.preRun`). This matches the
existing pattern for other settings.

### Env var injection rules

- Parses `export KEY="value"` (shell export) and `KEY=value` (dotenv) from stdout
- Existing env vars are NOT overwritten — process.env always wins
- Non-zero exit throws, aborting the eval
- Stderr forwarded to process.stderr so users see hook output

## Docs to update before merge

- `apps/web/src/content/docs/` — add `hooks.pre_run` to the config reference page
- Delete this plan file
14 changes: 14 additions & 0 deletions packages/core/src/evaluation/config.ts
@@ -78,6 +78,20 @@ const AgentVConfigSchema = z.object({
maxDurationMs: z.number().int().min(0).optional(),
})
.optional(),

/** Lifecycle hooks */
hooks: z
.object({
/**
* Shell command to run once at agentv startup, before any command executes.
* stdout is parsed for env var exports (`KEY=value` or `export KEY="value"`)
* and injected into process.env. Keys already set in the environment are
* not overwritten — existing env always takes priority.
* stderr is forwarded to the user. Non-zero exit aborts with an error.
*/
beforeSession: z.string().optional(),
})
.optional(),
});
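For reference, a project would opt in via `.agentv/config.yaml`. A hypothetical example (the script path is invented, and the snake_case key `before_session` is assumed to be the YAML spelling of the `beforeSession` schema field, matching the `hooks?.before_session` read in `index.ts`):

```yaml
# .agentv/config.yaml — hypothetical; fetch-secrets.sh prints KEY=value lines
hooks:
  before_session: ./scripts/fetch-secrets.sh
```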

/**
1 change: 1 addition & 0 deletions packages/core/src/evaluation/graders/index.ts
@@ -52,6 +52,7 @@
substituteVariables,
freeformEvaluationSchema,
rubricEvaluationSchema,
scoreRangeEvaluationSchema,
} from './llm-grader.js';
export type { LlmGraderOptions } from './llm-grader.js';

2 changes: 1 addition & 1 deletion packages/core/src/evaluation/graders/llm-grader.ts
@@ -147,7 +147,7 @@ const scoreRangeEvaluationSchema = z.object({
overall_reasoning: z.string().describe('Overall assessment summary (1-2 sentences)').optional(),
});

export { freeformEvaluationSchema, rubricEvaluationSchema };
export { freeformEvaluationSchema, rubricEvaluationSchema, scoreRangeEvaluationSchema };

interface StructuredGenerationResult {
readonly text: string;
125 changes: 125 additions & 0 deletions packages/core/src/evaluation/hooks.ts
@@ -0,0 +1,125 @@
/**
* Session hook execution for AgentV.
*
* Runs a shell command once at agentv startup and injects exported environment
* variables into the current process. This lets projects fetch secrets at
* runtime (e.g. from a vault) without needing a wrapper script.
*
* ## How it works
*
* 1. The command is run via `sh -c` (or `cmd /c` on Windows).
* 2. stdout is captured and parsed for env var exports.
* 3. stderr is forwarded to the process stderr so the user sees output.
* 4. Non-zero exit aborts with a clear error.
* 5. Parsed keys are injected into `process.env` — only for keys not already
* set, so existing env always wins.
*
* ## Supported output formats
*
* Both shell-export and dotenv formats are accepted:
* export KEY="value" (shell export — quotes optional)
* KEY=value (dotenv — no export prefix)
*
* Lines that don't match either pattern are silently ignored.
*
* @module
*/

import { spawnSync } from 'node:child_process';

const ANSI_YELLOW = '\u001b[33m';
const ANSI_RESET = '\u001b[0m';

/**
* Parse env var lines from hook stdout.
*
* Accepts:
* export KEY="value" → { KEY: "value" }
* export KEY=value → { KEY: "value" }
* KEY=value → { KEY: "value" }
*
* Strips surrounding single or double quotes from values.
* Skips lines with empty keys or values that look like shell syntax.
*/
export function parseEnvOutput(stdout: string): Record<string, string> {
const result: Record<string, string> = {};

for (const line of stdout.split('\n')) {
const trimmed = line.trim();
if (!trimmed || trimmed.startsWith('#')) continue;

// Match: [export ]KEY=value
const match = trimmed.match(/^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$/);
if (!match) continue;

const key = match[1];
let value = match[2];

// Strip surrounding quotes (single or double)
if (
(value.startsWith('"') && value.endsWith('"')) ||
(value.startsWith("'") && value.endsWith("'"))
) {
value = value.slice(1, -1);
}

if (key) {
result[key] = value;
}
}

return result;
}
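For illustration, the parsing rules above can be exercised standalone. The function body below is copied from this diff; the sample hook output is invented:

```typescript
// Copy of parseEnvOutput from packages/core/src/evaluation/hooks.ts,
// repeated here only so the example is self-contained.
function parseEnvOutput(stdout: string): Record<string, string> {
  const result: Record<string, string> = {};
  for (const line of stdout.split('\n')) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith('#')) continue;
    // Match: [export ]KEY=value
    const match = trimmed.match(/^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=(.*)$/);
    if (!match) continue;
    const key = match[1];
    let value = match[2];
    // Strip surrounding quotes (single or double)
    if (
      (value.startsWith('"') && value.endsWith('"')) ||
      (value.startsWith("'") && value.endsWith("'"))
    ) {
      value = value.slice(1, -1);
    }
    if (key) {
      result[key] = value;
    }
  }
  return result;
}

// Invented hook output mixing progress noise, both supported formats,
// and a comment line.
const hookOutput = [
  'Fetching secrets from vault...', // no KEY=value shape — ignored
  'export API_KEY="s3cr3t"',        // shell export, quotes stripped
  'REGION=us-east-1',               // dotenv style
  '# comment line',                 // ignored
].join('\n');

console.log(parseEnvOutput(hookOutput));
// → { API_KEY: 's3cr3t', REGION: 'us-east-1' }
```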

/**
* Run the before_session hook command and inject exported env vars into process.env.
*
* - Runs via shell (`sh -c` on POSIX, `cmd /c` on Windows)
* - Captured stdout is parsed for env vars; stderr is forwarded to process.stderr
* - Non-zero exit throws an Error with the command and exit code
* - Keys already set in process.env are NOT overwritten
*
* @param command Shell command string to execute
*/
export function runBeforeSessionHook(command: string): void {
const isWindows = process.platform === 'win32';
const shell = isWindows ? 'cmd' : 'sh';
const shellFlag = isWindows ? '/c' : '-c';

console.log(`${ANSI_YELLOW}Running before_session hook: ${command}${ANSI_RESET}`);

const result = spawnSync(shell, [shellFlag, command], {
encoding: 'utf8',
// Do not inherit stdio — capture stdout for parsing, forward stderr manually
stdio: ['ignore', 'pipe', 'pipe'],
});

// Forward stderr so the user can see hook output (warnings, progress, etc.)
if (result.stderr) {
process.stderr.write(result.stderr);
}

if (result.error) {
throw new Error(`before_session hook failed to start: ${result.error.message}`);
}

if (result.status !== 0) {
throw new Error(
`before_session hook exited with code ${result.status ?? 'unknown'}: ${command}`,
);
}

const vars = parseEnvOutput(result.stdout ?? '');
let injected = 0;

for (const [key, value] of Object.entries(vars)) {
if (process.env[key] === undefined) {
process.env[key] = value;
injected++;
}
}

if (injected > 0) {
console.log(`before_session hook injected ${injected} environment variable(s).`);
}
}
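The injection step above can be sketched in isolation. `injectHookVars` is a hypothetical stand-in that applies the same rule to a plain object instead of mutating `process.env` directly:

```typescript
// Hypothetical helper mirroring the injection loop in runBeforeSessionHook:
// keys already present win; only missing keys are injected.
function injectHookVars(
  env: Record<string, string | undefined>,
  hookVars: Record<string, string>,
): number {
  let injected = 0;
  for (const [key, value] of Object.entries(hookVars)) {
    if (env[key] === undefined) {
      env[key] = value;
      injected++;
    }
  }
  return injected;
}

// API_KEY is already set in the environment, so the hook's value is ignored;
// REGION is missing, so it gets injected.
const env: Record<string, string | undefined> = { API_KEY: 'from-shell' };
const injected = injectHookVars(env, {
  API_KEY: 'from-hook',
  REGION: 'us-east-1',
});

console.log(injected, env.API_KEY, env.REGION);
// → 1 from-shell us-east-1
```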