# Structure Evaluation Datasets

Organise evaluation data so each dataset file tests one concern and each metric is declared exactly once — in `agent.yaml`, not in the JSONL.

## The rule: one dataset, one concern

Metrics are declared at the dataset level in the manifest, not per sample in the JSONL file.
A dataset file contains only data — inputs, expected outputs, and optional context.

This means: **if you need different metrics, use different dataset files.**

```yaml
spec:
  evaluation:
    framework: ragas
    datasets:
      - name: rag-quality
        path: $file:evals/rag.jsonl
        metrics: [faithfulness, context_recall, answer_relevancy]

      - name: safety
        path: $file:evals/safety.jsonl
        metrics: [toxicity, bias]

      - name: accuracy
        path: $file:evals/accuracy.jsonl
        metrics: [answer_similarity, hallucination]

    thresholds:
      faithfulness: 0.80
      context_recall: 0.75
      answer_relevancy: 0.75
      toxicity: 0.90
      bias: 0.85
      answer_similarity: 0.80
      hallucination: 0.05
    ciGate: true
```

## JSONL sample format

Each line in a dataset file is a JSON object. All fields except `input` and `expected` are optional.

```jsonl
{"input": "What is RAG?", "expected": "Retrieval Augmented Generation", "context": ["RAG combines a retrieval step..."], "tags": ["basics"]}
{"input": "How does vector search work?", "expected": "By comparing embedding distances", "context": ["Vectors are high-dimensional..."], "reference_contexts": ["Embeddings encode semantic meaning..."], "tags": ["rag", "advanced"], "metadata": {"difficulty": "medium"}}
```

| Field | Required | Description |
|---|---|---|
| `input` | yes | User query sent to the agent |
| `expected` | yes | Expected output — used for `answer_similarity` and `string_match` scoring |
| `context` | for RAG metrics | Retrieved chunks the agent used. Required for `faithfulness`, `context_precision`, `hallucination` |
| `reference_contexts` | for `context_recall` | Ground-truth relevant chunks. Required for `context_recall` |
| `tags` | no | Labels for filtering with `--tag` |
| `metadata` | no | Arbitrary key/value pairs reported in output (e.g. `{"difficulty": "hard", "source": "prod-logs"}`) |

## Which metrics need which fields

| Metric | `context` | `reference_contexts` |
|---|---|---|
| `answer_similarity` | no | no |
| `answer_relevancy` | no | no |
| `hallucination` | yes | no |
| `faithfulness` | yes | no |
| `context_precision` | yes | no |
| `context_recall` | yes | yes |
| `toxicity` | no | no |
| `bias` | no | no |

If a dataset declares a RAG metric but its samples have no `context` field, the evaluation framework will error or return meaningless scores. Splitting by concern prevents this.
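
A quick pre-flight check can catch that mismatch before a run. This is a minimal sketch using `jq` against the `evals/rag.jsonl` path from the manifest above; it is a local sanity check, not an `agentspec` feature:

```bash
# Print rag-quality samples missing the `context` field required by
# faithfulness / context_precision / hallucination (ideally prints nothing).
jq -c 'select(has("context") | not)' evals/rag.jsonl

# context_recall additionally needs the ground-truth chunks.
jq -c 'select(has("reference_contexts") | not)' evals/rag.jsonl
```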

## Running a dataset

```bash
# Run all samples
agentspec evaluate agent.yaml --url http://localhost:4000 --dataset rag-quality

# Run 20 random samples
agentspec evaluate agent.yaml --url http://localhost:4000 --dataset rag-quality --sample-size 20

# Run only samples tagged "advanced"
agentspec evaluate agent.yaml --url http://localhost:4000 --dataset rag-quality --tag advanced

# Machine-readable output
agentspec evaluate agent.yaml --url http://localhost:4000 --dataset safety --json
```

The command exits with code `1` when `ciGate: true` and any metric falls below its threshold.
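
Because of that exit code, the command can gate a pipeline directly. The sketch below is one possible CI step, assuming the agent is already serving at `http://localhost:4000`; the results filenames are illustrative, not something the tool produces on its own:

```bash
#!/usr/bin/env bash
# Stop on the first dataset that misses a threshold (requires ciGate: true).
set -euo pipefail

agentspec evaluate agent.yaml --url http://localhost:4000 --dataset rag-quality --json > rag-results.json
agentspec evaluate agent.yaml --url http://localhost:4000 --dataset safety --json > safety-results.json
```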

## Recommended file layout

```
evals/
  rag.jsonl         # faithfulness, context_recall, answer_relevancy
  safety.jsonl      # toxicity, bias
  accuracy.jsonl    # answer_similarity, hallucination
  regression.jsonl  # string_match on known Q&A pairs (no context needed)
```

One JSONL per concern keeps datasets independently runnable, independently versionable, and easy to extend without touching other test suites.
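
When you do want to exercise every concern in one go, the split files make that a short loop. A sketch under the layout above, assuming each file has a matching dataset entry in `agent.yaml` (the `regression` dataset name is hypothetical here):

```bash
# Run every suite; stop at the first one that fails its thresholds.
for ds in rag-quality safety accuracy regression; do
  agentspec evaluate agent.yaml --url http://localhost:4000 --dataset "$ds" || exit 1
done
```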

## See also

- [`agentspec evaluate` CLI reference](../reference/cli.md#agentspec-evaluate)
- [Probe coverage & evidence tiers](../concepts/probe-coverage.md)
- [CI integration](./ci-integration.md)