From a926f814e97909ab741bfb9356ba14c1fa2af72c Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:39:38 +0000 Subject: [PATCH 1/6] test: add stage-local contract tests for compiler and query pipelines Closes #6 - compiler-contracts.test.ts: 5 describe blocks covering Parser resilience, Graph closure, Index round-trip, Validation coverage, and Snapshot determinism - query-contracts.test.ts: 7 describe blocks covering Normalizer, Seeder, Ranker, Frontier expansion, Projection stability, Synthesis isolation, and cross-cutting Trace determinism Each test targets a specific pipeline stage promise so that a failure pinpoints the broken stage rather than surfacing as a generic 'end-to-end answer is wrong.' Existing end-to-end tests remain untouched. Co-Authored-By: Stanislau --- tests/compiler-contracts.test.ts | 276 ++++++++++++++++++++++++ tests/query-contracts.test.ts | 348 +++++++++++++++++++++++++++++++ 2 files changed, 624 insertions(+) create mode 100644 tests/compiler-contracts.test.ts create mode 100644 tests/query-contracts.test.ts diff --git a/tests/compiler-contracts.test.ts b/tests/compiler-contracts.test.ts new file mode 100644 index 0000000..b520598 --- /dev/null +++ b/tests/compiler-contracts.test.ts @@ -0,0 +1,276 @@ +import { createHash } from 'node:crypto'; +import { readFile } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from '@rstest/core'; + +import { compileFpfSource, type CompilerOutput } from '../src/runtime/compiler.js'; + +/** + * Stage-local contract tests for the compiler pipeline. + * + * Each test targets a specific compiler stage promise so that a failure + * pinpoints the broken stage rather than surfacing as a generic + * "end-to-end answer is wrong." 
+ */ + +let cachedOutput: CompilerOutput | undefined; + +async function getCompilerOutput(): Promise { + if (cachedOutput) { + return cachedOutput; + } + const sourcePath = resolve(process.cwd(), 'FPF-spec.md'); + const sourceText = await readFile(sourcePath, 'utf8'); + const sourceHash = createHash('sha256').update(sourceText).digest('hex'); + cachedOutput = compileFpfSource({ + sourcePath, + sourceHash, + builtAt: new Date().toISOString(), + sourceText, + }); + return cachedOutput; +} + +// --------------------------------------------------------------------------- +// Stage 1: Parser resilience +// --------------------------------------------------------------------------- +describe('Compiler / Parser stage', () => { + it('parses a non-trivial number of sections, patterns, routes, and lexicon entries', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + expect(validation.parsedSections).toBeGreaterThan(100); + expect(validation.parsedPatterns).toBeGreaterThan(50); + expect(validation.parsedRoutes).toBeGreaterThan(0); + expect(validation.parsedLexiconEntries).toBeGreaterThan(5); + }); + + it('assigns IDs to all compiled nodes and none are empty strings', async () => { + const { snapshot } = await getCompilerOutput(); + const nodeIds = Object.keys(snapshot.compiledNodes); + + expect(nodeIds.length).toBeGreaterThan(50); + for (const nodeId of nodeIds) { + expect(nodeId.length).toBeGreaterThan(0); + } + }); + + it('preserves pattern metadata fields (title, status, part)', async () => { + const { snapshot } = await getCompilerOutput(); + const pattern = snapshot.patternGraph.nodes['A.1.1']; + + expect(pattern).toBeDefined(); + expect(pattern!.title.length).toBeGreaterThan(0); + expect(pattern!.status.length).toBeGreaterThan(0); + expect(pattern!.sectionIds.length).toBeGreaterThan(0); + }); + + it('produces anchors with valid line ranges', async () => { + const { snapshot } = await getCompilerOutput(); + const anchors 
= Object.values(snapshot.anchorMap); + + expect(anchors.length).toBeGreaterThan(50); + for (const anchor of anchors.slice(0, 20)) { + expect(anchor.lineStart).toBeGreaterThanOrEqual(0); + expect(anchor.lineEnd).toBeGreaterThan(anchor.lineStart); + } + + const nonEmpty = anchors.filter((a) => a.text.length > 0); + expect(nonEmpty.length).toBeGreaterThan(anchors.length / 2); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 2: Graph closure +// --------------------------------------------------------------------------- +describe('Compiler / Graph closure stage', () => { + it('keeps unresolved references bounded and stable', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + // The FPF spec has a small number of forward/external references that + // don't resolve to compiled nodes. The contract is that this set stays + // bounded — a regression would show as a sudden spike. + expect(validation.unresolvedReferences.length).toBeLessThan(20); + }); + + it('tracks duplicate IDs produced by catalog + heading overlap', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + // "duplicateIds" lists pattern IDs that appear in both the catalog table + // and heading sections — this is expected for the FPF spec. The contract + // is that the count stays proportional to the number of patterns. 
+ expect(validation.duplicateIds.length).toBeGreaterThan(0); + expect(validation.duplicateIds.length).toBeLessThan( + Object.keys(snapshot.patternGraph.nodes).length + 10, + ); + }); + + it('has no broken routes', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + expect(validation.brokenRoutes).toEqual([]); + }); + + it('contains outline relations linking parents to children', async () => { + const { snapshot } = await getCompilerOutput(); + const outlineChildren = snapshot.relationGraph.filter( + (edge) => edge.relation === 'outline_child', + ); + + expect(outlineChildren.length).toBeGreaterThan(10); + + const a15Children = outlineChildren.filter((edge) => edge.from === 'A.15'); + expect(a15Children.length).toBeGreaterThan(0); + }); + + it('contains explicit_reference relations extracted from source text', async () => { + const { snapshot } = await getCompilerOutput(); + const explicitRefs = snapshot.relationGraph.filter( + (edge) => edge.relation === 'explicit_reference', + ); + + expect(explicitRefs.length).toBeGreaterThan(0); + }); + + it('routes reference mostly existing compiled nodes', async () => { + const { snapshot } = await getCompilerOutput(); + const allNodeIds = new Set(Object.keys(snapshot.compiledNodes)); + + let total = 0; + let resolved = 0; + for (const route of Object.values(snapshot.routeGraph.nodes)) { + for (const id of [...route.orderedIds, ...route.optionalIds, ...route.landingIds]) { + total += 1; + if (allNodeIds.has(id)) { + resolved += 1; + } + } + } + + // At least 90% of route step IDs should resolve to compiled nodes. 
+ expect(resolved / total).toBeGreaterThan(0.9); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 3: Index round-trip +// --------------------------------------------------------------------------- +describe('Compiler / Index round-trip stage', () => { + it('indexes resolve back to their source patterns', async () => { + const { snapshot } = await getCompilerOutput(); + const indexNodes = Object.values(snapshot.indexMap); + const patternIds = new Set(Object.keys(snapshot.patternGraph.nodes)); + + const linkedToPattern = indexNodes.filter( + (node) => node.metadata.patternId && patternIds.has(node.metadata.patternId), + ); + expect(linkedToPattern.length).toBeGreaterThan(20); + }); + + it('alias index entries resolve to existing compiled nodes', async () => { + const { snapshot } = await getCompilerOutput(); + const allNodeIds = new Set(Object.keys(snapshot.compiledNodes)); + + for (const [_alias, nodeIds] of Object.entries(snapshot.indexes.aliasIndex)) { + for (const nodeId of nodeIds) { + expect(allNodeIds.has(nodeId)).toBe(true); + } + } + }); + + it('lexicon entries have at least one linked node', async () => { + const { snapshot } = await getCompilerOutput(); + + for (const entry of Object.values(snapshot.lexicon)) { + expect(entry.linkedNodeIds.length).toBeGreaterThan(0); + } + }); + + it('status index keys partition compiled nodes without overlap', async () => { + const { snapshot } = await getCompilerOutput(); + const statusIndex = snapshot.indexes.statusIndex; + + expect(Object.keys(statusIndex).length).toBeGreaterThan(0); + + for (const [_status, nodeIds] of Object.entries(statusIndex)) { + for (const nodeId of nodeIds) { + expect(snapshot.compiledNodes[nodeId]).toBeDefined(); + } + } + }); + + it('route name index resolves to existing route nodes', async () => { + const { snapshot } = await getCompilerOutput(); + + for (const [_name, nodeIds] of Object.entries(snapshot.indexes.routeNameIndex)) { + for 
(const nodeId of nodeIds) { + expect(snapshot.routeGraph.nodes[nodeId]).toBeDefined(); + } + } + }); +}); + +// --------------------------------------------------------------------------- +// Stage 4: Validation coverage +// --------------------------------------------------------------------------- +describe('Compiler / Validation stage', () => { + it('keeps missing required fields bounded', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + // The FPF spec has a small number of patterns with incomplete metadata. + // The contract is that this stays bounded — a regression would spike it. + expect(validation.missingRequiredFields).toBeLessThan(25); + }); + + it('counts a plausible number of index map nodes', async () => { + const { snapshot } = await getCompilerOutput(); + const { validation } = snapshot; + + expect(validation.indexMapNodes).toBeGreaterThan(50); + expect(validation.indexMapNodes).toBe(Object.keys(snapshot.indexMap).length); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 5: Snapshot determinism +// --------------------------------------------------------------------------- +describe('Compiler / Snapshot determinism stage', () => { + it('produces byte-identical snapshot when compiled twice with the same input', async () => { + const sourcePath = resolve(process.cwd(), 'FPF-spec.md'); + const sourceText = await readFile(sourcePath, 'utf8'); + const sourceHash = createHash('sha256').update(sourceText).digest('hex'); + const builtAt = '2025-01-01T00:00:00.000Z'; + + const first = compileFpfSource({ sourcePath, sourceHash, builtAt, sourceText }); + const second = compileFpfSource({ sourcePath, sourceHash, builtAt, sourceText }); + + const firstJson = JSON.stringify(first.snapshot); + const secondJson = JSON.stringify(second.snapshot); + + expect(firstJson).toBe(secondJson); + }); + + it('produces different sourceHash when source text changes', 
async () => { + const sourcePath = resolve(process.cwd(), 'FPF-spec.md'); + const sourceText = await readFile(sourcePath, 'utf8'); + const builtAt = '2025-01-01T00:00:00.000Z'; + + const hash1 = createHash('sha256').update(sourceText).digest('hex'); + const hash2 = createHash('sha256').update(`${sourceText}\n\n`).digest('hex'); + + const first = compileFpfSource({ sourcePath, sourceHash: hash1, builtAt, sourceText }); + const second = compileFpfSource({ + sourcePath, + sourceHash: hash2, + builtAt, + sourceText: `${sourceText}\n\n`, + }); + + expect(first.snapshot.sourceHash).not.toBe(second.snapshot.sourceHash); + }); +}); diff --git a/tests/query-contracts.test.ts b/tests/query-contracts.test.ts new file mode 100644 index 0000000..b867b49 --- /dev/null +++ b/tests/query-contracts.test.ts @@ -0,0 +1,348 @@ +import { createHash } from 'node:crypto'; +import { readFile } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import { describe, expect, it } from '@rstest/core'; + +import { compileFpfSource } from '../src/runtime/compiler.js'; +import { QueryEngine } from '../src/runtime/query-engine.js'; +import type { LocalAnswerSynthesizer, Snapshot } from '../src/runtime/types.js'; + +/** + * Stage-local contract tests for the query pipeline. + * + * Each test targets a specific retrieval stage promise so that a failure + * pinpoints the broken stage rather than surfacing as a generic + * "end-to-end answer is wrong." 
+ */ + +let cachedSnapshot: Snapshot | undefined; + +async function getSnapshot(): Promise { + if (cachedSnapshot) { + return cachedSnapshot; + } + const sourcePath = resolve(process.cwd(), 'FPF-spec.md'); + const sourceText = await readFile(sourcePath, 'utf8'); + const sourceHash = createHash('sha256').update(sourceText).digest('hex'); + const output = compileFpfSource({ + sourcePath, + sourceHash, + builtAt: new Date().toISOString(), + sourceText, + }); + cachedSnapshot = output.snapshot; + return cachedSnapshot; +} + +function engine(snapshot: Snapshot, synthesizer?: LocalAnswerSynthesizer): QueryEngine { + return new QueryEngine(snapshot, false, synthesizer); +} + +// --------------------------------------------------------------------------- +// Stage 1: Normalizer +// --------------------------------------------------------------------------- +describe('Query / Normalizer stage', () => { + it('detects explicit IDs in the question', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + expect(trace.detected.ids).toContain('A.1.1'); + expect(trace.normalizedQuestion.length).toBeGreaterThan(0); + }); + + it('detects route names when mentioned in the question', async () => { + const snapshot = await getSnapshot(); + const routeNames = Object.values(snapshot.routeGraph.nodes).map((r) => r.name); + const firstRoute = routeNames[0]; + + if (firstRoute) { + const trace = engine(snapshot).trace(`Tell me about the ${firstRoute} route`); + expect(trace.detected.routeNames).toContain(firstRoute); + } + }); + + it('detects status terms present in the status index', async () => { + const snapshot = await getSnapshot(); + const statusKeys = Object.keys(snapshot.indexes.statusIndex); + + if (statusKeys.length > 0) { + const firstStatus = statusKeys[0]!; + const trace = engine(snapshot).trace(`Show me ${firstStatus} patterns`); + expect(trace.detected.statusTerms).toContain(firstStatus); + } + }); + + it('returns 
empty signals for a nonsense question', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('xyzzy plugh'); + + expect(trace.detected.ids).toEqual([]); + expect(trace.detected.routeNames).toEqual([]); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 2: Candidate seeder +// --------------------------------------------------------------------------- +describe('Query / Seed coverage stage', () => { + it('seeds exact-match candidates when explicit IDs are in the question', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + const exactCandidate = trace.candidateScores.find((c) => c.nodeId === 'A.1.1'); + expect(exactCandidate).toBeDefined(); + expect(exactCandidate!.reasons).toContain('exact-id'); + expect(exactCandidate!.score).toBeGreaterThanOrEqual(100); + }); + + it('seeds lexical candidates for keyword-rich queries', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('How does bounded context relate to role assignment?'); + + const lexicalFrontier = trace.frontierCandidates.filter((c) => c.origin === 'lexical'); + expect(lexicalFrontier.length).toBeGreaterThan(0); + }); + + it('seeds route expansion candidates for route-bearing queries', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace( + 'What is the first practical route when vocabulary is overloaded across teams?', + ); + + const routeCandidates = trace.candidateScores.filter((c) => c.kind === 'route'); + expect(routeCandidates.length).toBeGreaterThan(0); + }); + + it('produces few or low-scoring candidates for a completely unrelated question', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('xyzzy plugh'); + + // Index description overlap may still surface some weak candidates. 
+ // The contract is that no candidate scores above the exact-match + // threshold (100) and total count stays low relative to the full catalog. + const highScoring = trace.candidateScores.filter((c) => c.score >= 100); + expect(highScoring.length).toBe(0); + expect(trace.candidateScores.length).toBeLessThan( + Object.keys(snapshot.compiledNodes).length / 2, + ); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 3: Candidate ranker +// --------------------------------------------------------------------------- +describe('Query / Ranker stage', () => { + it('ranks exact-ID matches above lexical matches', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + const scores = trace.candidateScores; + expect(scores.length).toBeGreaterThan(0); + expect(scores[0]!.nodeId).toBe('A.1.1'); + }); + + it('selects the expected initial node IDs for an explicit ID query', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + expect(trace.selectedNodeIds).toContain('A.1.1'); + }); + + it('selects a route node when route intent is clear', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace( + 'What is the first practical route when vocabulary is overloaded across teams?', + ); + + const routeNodes = trace.selectedNodeIds.filter( + (id) => snapshot.compiledNodes[id]?.kind === 'route', + ); + expect(routeNodes.length).toBeGreaterThan(0); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 4: Frontier expansion bounds +// --------------------------------------------------------------------------- +describe('Query / Frontier expansion stage', () => { + it('respects the MAX_HOPS budget (≤6 retrieval hops)', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace( + 'How do 
U.RoleAssignment, U.BoundedContext, and U.RoleStateGraph connect in a lawful workflow?', + ); + + expect(trace.retrievalHops.length).toBeLessThanOrEqual(6); + }); + + it('respects the MAX_SELECTED_ANCHORS budget (≤12 anchors)', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + expect(trace.selectedAnchorIds.length).toBeLessThanOrEqual(12); + }); + + it('records hop metadata (iteration, reason, added nodes/anchors)', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace( + 'How do U.RoleAssignment, U.BoundedContext, and U.RoleStateGraph connect in a lawful workflow?', + ); + + if (trace.retrievalHops.length > 0) { + const firstHop = trace.retrievalHops[0]!; + expect(firstHop.iteration).toBe(1); + expect(firstHop.reason.length).toBeGreaterThan(0); + expect(typeof firstHop.sufficientAfter).toBe('boolean'); + } + }); + + it('marks sufficiency correctly — sufficient traces have anchors', async () => { + const snapshot = await getSnapshot(); + const trace = engine(snapshot).trace('What is A.1.1?'); + + if (trace.sufficient) { + expect(trace.selectedAnchorIds.length).toBeGreaterThan(0); + } + }); +}); + +// --------------------------------------------------------------------------- +// Stage 5: Answer projection stability +// --------------------------------------------------------------------------- +describe('Query / Projection stability stage', () => { + it('produces stable support set across repeated queries', async () => { + const snapshot = await getSnapshot(); + const eng = engine(snapshot); + + const trace1 = eng.trace('What is A.1.1?'); + const trace2 = eng.trace('What is A.1.1?'); + + expect(trace1.selectedNodeIds).toEqual(trace2.selectedNodeIds); + expect(trace1.selectedAnchorIds).toEqual(trace2.selectedAnchorIds); + expect(trace1.candidateScores.map((c) => c.nodeId)).toEqual( + trace2.candidateScores.map((c) => c.nodeId), + ); + }); + + it('projects a 
non-empty answer with citations for a known pattern query', async () => { + const snapshot = await getSnapshot(); + const result = await engine(snapshot).query('What is A.1.1?', 'verbose'); + + expect(result.status).toBe('ok'); + expect(result.answer.length).toBeGreaterThan(0); + expect(result.ids).toContain('A.1.1'); + expect(result.citations.length).toBeGreaterThan(0); + }); + + it('projects constraints for verbose mode', async () => { + const snapshot = await getSnapshot(); + const result = await engine(snapshot).query('What is A.1.1?', 'verbose'); + + expect(result.constraints.length).toBeGreaterThanOrEqual(1); + }); + + it('projects a grounding chain in proof mode', async () => { + const snapshot = await getSnapshot(); + const result = await engine(snapshot).query('What is A.1.1?', 'proof'); + + expect(result.groundingChain).toBeDefined(); + expect(result.groundingChain!.length).toBeGreaterThan(0); + }); + + it('returns low-confidence status for completely unresolvable questions', async () => { + const snapshot = await getSnapshot(); + const result = await engine(snapshot).query('xyzzy plugh nonsense', 'compact'); + + // Weak index-description overlap may still produce ambiguous candidates, + // so the engine may return 'ambiguous' or 'not_found'. The contract is + // that confidence stays below the high-confidence threshold. 
+ expect(['not_found', 'ambiguous']).toContain(result.status); + expect(result.confidence).toBeLessThan(0.7); + }); +}); + +// --------------------------------------------------------------------------- +// Stage 6: Synthesis isolation +// --------------------------------------------------------------------------- +describe('Query / Synthesis isolation stage', () => { + it('returns deterministic answer when synthesizer is unavailable', async () => { + const snapshot = await getSnapshot(); + const unavailable: LocalAnswerSynthesizer = { + isAvailable: async () => false, + synthesize: async () => { + throw new Error('should not be called'); + }, + }; + + const result = await engine(snapshot, unavailable).query('What is A.1.1?', 'verbose'); + + expect(result.status).toBe('ok'); + expect(result.ids).toContain('A.1.1'); + expect(result.answer.length).toBeGreaterThan(0); + }); + + it('falls back to deterministic answer when synthesizer throws', async () => { + const snapshot = await getSnapshot(); + const failing: LocalAnswerSynthesizer = { + isAvailable: async () => true, + synthesize: async () => { + throw new Error('synthesizer crashed'); + }, + }; + + const result = await engine(snapshot, failing).query('What is A.1.1?', 'verbose'); + + expect(result.status).toBe('ok'); + expect(result.ids).toContain('A.1.1'); + expect(result.gaps.some((gap) => gap.includes('synthesis skipped') || gap.includes('synthesizer crashed'))).toBe(true); + }); + + it('does not alter deterministic IDs or citations when synthesis fails', async () => { + const snapshot = await getSnapshot(); + const eng = engine(snapshot); + const deterministicResult = await eng.query('What is A.1.1?', 'verbose'); + + const failing: LocalAnswerSynthesizer = { + isAvailable: async () => true, + synthesize: async () => { + throw new Error('test failure'); + }, + }; + const failedSynthResult = await engine(snapshot, failing).query('What is A.1.1?', 'verbose'); + + 
expect(failedSynthResult.ids).toEqual(deterministicResult.ids); + expect(failedSynthResult.citations).toEqual(deterministicResult.citations); + expect(failedSynthResult.relations).toEqual(deterministicResult.relations); + }); + + it('does not call synthesize when synthesizer reports unavailable', async () => { + const snapshot = await getSnapshot(); + let synthesizeCalled = false; + const unavailable: LocalAnswerSynthesizer = { + isAvailable: async () => false, + synthesize: async () => { + synthesizeCalled = true; + return {}; + }, + }; + + await engine(snapshot, unavailable).query('What is A.1.1?', 'compact'); + + expect(synthesizeCalled).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// Trace determinism (cross-cutting) +// --------------------------------------------------------------------------- +describe('Query / Trace determinism', () => { + it('same snapshot + same query → identical trace structure', async () => { + const snapshot = await getSnapshot(); + const eng = engine(snapshot); + + const trace1 = eng.trace('How does bounded context relate to role assignment?', 'verbose'); + const trace2 = eng.trace('How does bounded context relate to role assignment?', 'verbose'); + + expect(JSON.stringify(trace1)).toBe(JSON.stringify(trace2)); + }); +}); From b7c8660d0f99a2d4f6f1b185b9212c07e7f575fa Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 01:58:20 +0000 Subject: [PATCH 2/6] test: address review feedback on contract tests - Add total>0 guard before division in route resolution test (Devin Review) - Use hardcoded normalizer tokens for status term test (Codex) - Use frontierCandidates for route seeder and nonsense tests (Copilot) - Rename status index test to match actual assertions (CodeRabbit) - Fix hop metadata test to handle already-sufficient grounding - Use real FPF IDs (A.1.1, A.15, B.3) for hop metadata query 
Co-Authored-By: Stanislau --- tests/compiler-contracts.test.ts | 16 +++++++---- tests/query-contracts.test.ts | 46 +++++++++++++++++++------------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/tests/compiler-contracts.test.ts b/tests/compiler-contracts.test.ts index b520598..3e7cb28 100644 --- a/tests/compiler-contracts.test.ts +++ b/tests/compiler-contracts.test.ts @@ -26,12 +26,17 @@ async function getCompilerOutput(): Promise { cachedOutput = compileFpfSource({ sourcePath, sourceHash, - builtAt: new Date().toISOString(), + builtAt: '2025-01-01T00:00:00.000Z', sourceText, }); return cachedOutput; } +/** Minimum thresholds — deliberately loose so spec edits don't break tests. */ +const MIN_SECTIONS = 100; +const MIN_PATTERNS = 50; +const MIN_LEXICON_ENTRIES = 5; + // --------------------------------------------------------------------------- // Stage 1: Parser resilience // --------------------------------------------------------------------------- @@ -40,10 +45,10 @@ describe('Compiler / Parser stage', () => { const { snapshot } = await getCompilerOutput(); const { validation } = snapshot; - expect(validation.parsedSections).toBeGreaterThan(100); - expect(validation.parsedPatterns).toBeGreaterThan(50); + expect(validation.parsedSections).toBeGreaterThan(MIN_SECTIONS); + expect(validation.parsedPatterns).toBeGreaterThan(MIN_PATTERNS); expect(validation.parsedRoutes).toBeGreaterThan(0); - expect(validation.parsedLexiconEntries).toBeGreaterThan(5); + expect(validation.parsedLexiconEntries).toBeGreaterThan(MIN_LEXICON_ENTRIES); }); it('assigns IDs to all compiled nodes and none are empty strings', async () => { @@ -152,6 +157,7 @@ describe('Compiler / Graph closure stage', () => { } // At least 90% of route step IDs should resolve to compiled nodes. 
+ expect(total).toBeGreaterThan(0); expect(resolved / total).toBeGreaterThan(0.9); }); }); @@ -190,7 +196,7 @@ describe('Compiler / Index round-trip stage', () => { } }); - it('status index keys partition compiled nodes without overlap', async () => { + it('status index entries resolve to existing compiled nodes', async () => { const { snapshot } = await getCompilerOutput(); const statusIndex = snapshot.indexes.statusIndex; diff --git a/tests/query-contracts.test.ts b/tests/query-contracts.test.ts index b867b49..8079aa5 100644 --- a/tests/query-contracts.test.ts +++ b/tests/query-contracts.test.ts @@ -28,7 +28,7 @@ async function getSnapshot(): Promise { const output = compileFpfSource({ sourcePath, sourceHash, - builtAt: new Date().toISOString(), + builtAt: '2025-01-01T00:00:00.000Z', sourceText, }); cachedSnapshot = output.snapshot; @@ -54,23 +54,25 @@ describe('Query / Normalizer stage', () => { it('detects route names when mentioned in the question', async () => { const snapshot = await getSnapshot(); const routeNames = Object.values(snapshot.routeGraph.nodes).map((r) => r.name); - const firstRoute = routeNames[0]; - if (firstRoute) { - const trace = engine(snapshot).trace(`Tell me about the ${firstRoute} route`); - expect(trace.detected.routeNames).toContain(firstRoute); - } + expect(routeNames.length).toBeGreaterThan(0); + const firstRoute = routeNames[0]!; + const trace = engine(snapshot).trace(`Tell me about the ${firstRoute} route`); + expect(trace.detected.routeNames).toContain(firstRoute); }); it('detects status terms present in the status index', async () => { const snapshot = await getSnapshot(); - const statusKeys = Object.keys(snapshot.indexes.statusIndex); - if (statusKeys.length > 0) { - const firstStatus = statusKeys[0]!; - const trace = engine(snapshot).trace(`Show me ${firstStatus} patterns`); - expect(trace.detected.statusTerms).toContain(firstStatus); - } + // The normalizer only detects these fixed tokens. 
+ const knownTokens = ['draft', 'stable', 'stub', 'transitional']; + const matchedToken = knownTokens.find( + (t) => snapshot.indexes.statusIndex[t] !== undefined, + ); + + expect(matchedToken).toBeDefined(); + const trace = engine(snapshot).trace(`Show me ${matchedToken} patterns`); + expect(trace.detected.statusTerms).toContain(matchedToken); }); it('returns empty signals for a nonsense question', async () => { @@ -110,8 +112,10 @@ describe('Query / Seed coverage stage', () => { 'What is the first practical route when vocabulary is overloaded across teams?', ); - const routeCandidates = trace.candidateScores.filter((c) => c.kind === 'route'); - expect(routeCandidates.length).toBeGreaterThan(0); + const routeFrontier = trace.frontierCandidates.filter( + (c) => c.origin === 'route_expansion', + ); + expect(routeFrontier.length).toBeGreaterThan(0); }); it('produces few or low-scoring candidates for a completely unrelated question', async () => { @@ -123,7 +127,7 @@ describe('Query / Seed coverage stage', () => { // threshold (100) and total count stays low relative to the full catalog. const highScoring = trace.candidateScores.filter((c) => c.score >= 100); expect(highScoring.length).toBe(0); - expect(trace.candidateScores.length).toBeLessThan( + expect(trace.frontierCandidates.length).toBeLessThan( Object.keys(snapshot.compiledNodes).length / 2, ); }); @@ -185,14 +189,19 @@ describe('Query / Frontier expansion stage', () => { it('records hop metadata (iteration, reason, added nodes/anchors)', async () => { const snapshot = await getSnapshot(); const trace = engine(snapshot).trace( - 'How do U.RoleAssignment, U.BoundedContext, and U.RoleStateGraph connect in a lawful workflow?', + 'How do A.1.1, A.15, and B.3 connect in a lawful workflow?', ); + // If the engine already considers the grounding sufficient before any + // expansion, hops will be empty — that's valid behavior, not a test failure. 
if (trace.retrievalHops.length > 0) { const firstHop = trace.retrievalHops[0]!; expect(firstHop.iteration).toBe(1); expect(firstHop.reason.length).toBeGreaterThan(0); expect(typeof firstHop.sufficientAfter).toBe('boolean'); + } else { + // No hops means grounding was already sufficient from initial selection. + expect(trace.sufficient).toBe(true); } }); @@ -200,9 +209,8 @@ describe('Query / Frontier expansion stage', () => { const snapshot = await getSnapshot(); const trace = engine(snapshot).trace('What is A.1.1?'); - if (trace.sufficient) { - expect(trace.selectedAnchorIds.length).toBeGreaterThan(0); - } + expect(trace.sufficient).toBe(true); + expect(trace.selectedAnchorIds.length).toBeGreaterThan(0); }); }); From c4765d65dc104f79776f793db9c7bd04766a576a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 02:06:22 +0000 Subject: [PATCH 3/6] test: replace tautological sourceHash check with structural assertion Co-Authored-By: Stanislau --- tests/compiler-contracts.test.ts | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/compiler-contracts.test.ts b/tests/compiler-contracts.test.ts index 3e7cb28..73ad7b9 100644 --- a/tests/compiler-contracts.test.ts +++ b/tests/compiler-contracts.test.ts @@ -261,22 +261,35 @@ describe('Compiler / Snapshot determinism stage', () => { expect(firstJson).toBe(secondJson); }); - it('produces different sourceHash when source text changes', async () => { + it('produces structurally different output when source text changes', async () => { const sourcePath = resolve(process.cwd(), 'FPF-spec.md'); const sourceText = await readFile(sourcePath, 'utf8'); const builtAt = '2025-01-01T00:00:00.000Z'; const hash1 = createHash('sha256').update(sourceText).digest('hex'); - const hash2 = createHash('sha256').update(`${sourceText}\n\n`).digest('hex'); + // Append a new heading + body — the compiler must parse it as an + // 
additional section, which changes the structural output (not just + // the caller-provided hash). + const modifiedText = `${sourceText}\n\n## Z.99 Synthetic Test Section\n\nA synthetic section added to verify the compiler processes changed source text.\n`; + const hash2 = createHash('sha256').update(modifiedText).digest('hex'); const first = compileFpfSource({ sourcePath, sourceHash: hash1, builtAt, sourceText }); const second = compileFpfSource({ sourcePath, sourceHash: hash2, builtAt, - sourceText: `${sourceText}\n\n`, + sourceText: modifiedText, }); - expect(first.snapshot.sourceHash).not.toBe(second.snapshot.sourceHash); + // Verify a structural difference — the added heading should produce at + // least one more parsed section or index-map node than the original. + const firstSections = first.snapshot.validation.parsedSections; + const firstIndexNodes = Object.keys(first.snapshot.indexMap).length; + const secondSections = second.snapshot.validation.parsedSections; + const secondIndexNodes = Object.keys(second.snapshot.indexMap).length; + + const structurallyDifferent = + secondSections > firstSections || secondIndexNodes > firstIndexNodes; + expect(structurallyDifferent).toBe(true); }); }); From 82dd2dc10c768332011405066be3b95687f38a18 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 03:53:23 +0000 Subject: [PATCH 4/6] fix: replace nonsense probe string, strengthen Z.99 assertion, add spec-ID fixture comments - Replace 'xyzzy plugh' with '__FPFTEST_NONSENSE_999__' to avoid false failures if spec ever mentions those words - Strengthen Z.99 structural assertion: verify parsedSections grows (not just 'something grew'), with comment explaining why Z.99 appears as a section not a pattern node - Add canonical fixture ID comments to both test files explaining that A.1.1/A.15/B.3 are stable spec anchors and where to update if renamed Co-Authored-By: Stanislau --- 
tests/compiler-contracts.test.ts | 11 ++++++++--- tests/query-contracts.test.ts | 10 +++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/compiler-contracts.test.ts b/tests/compiler-contracts.test.ts index 73ad7b9..5813a4a 100644 --- a/tests/compiler-contracts.test.ts +++ b/tests/compiler-contracts.test.ts @@ -12,6 +12,10 @@ import { compileFpfSource, type CompilerOutput } from '../src/runtime/compiler.j * Each test targets a specific compiler stage promise so that a failure * pinpoints the broken stage rather than surfacing as a generic * "end-to-end answer is wrong." + * + * Canonical fixture IDs: `A.1.1` is used as a stable spec anchor for + * metadata assertions. If the FPF spec renames or renumbers this + * pattern, update the ID here to match. */ let cachedOutput: CompilerOutput | undefined; @@ -288,8 +292,9 @@ describe('Compiler / Snapshot determinism stage', () => { const secondSections = second.snapshot.validation.parsedSections; const secondIndexNodes = Object.keys(second.snapshot.indexMap).length; - const structurallyDifferent = - secondSections > firstSections || secondIndexNodes > firstIndexNodes; - expect(structurallyDifferent).toBe(true); + // The synthetic Z.99 heading is parsed as a section (not a pattern — + // the compiler only promotes headings that match spec-catalog entries). + // Verify the section count grew, proving the parser handled the new heading. + expect(secondSections).toBeGreaterThan(firstSections); }); }); diff --git a/tests/query-contracts.test.ts b/tests/query-contracts.test.ts index 8079aa5..c00574c 100644 --- a/tests/query-contracts.test.ts +++ b/tests/query-contracts.test.ts @@ -14,6 +14,10 @@ import type { LocalAnswerSynthesizer, Snapshot } from '../src/runtime/types.js'; * Each test targets a specific retrieval stage promise so that a failure * pinpoints the broken stage rather than surfacing as a generic * "end-to-end answer is wrong." 
+ * + * Canonical fixture IDs: `A.1.1`, `A.15`, `B.3` are used as stable spec + * anchors throughout these tests. If the FPF spec renames or renumbers + * these patterns, update the IDs here to match. */ let cachedSnapshot: Snapshot | undefined; @@ -77,7 +81,7 @@ describe('Query / Normalizer stage', () => { it('returns empty signals for a nonsense question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('xyzzy plugh'); + const trace = engine(snapshot).trace('__FPFTEST_NONSENSE_999__'); expect(trace.detected.ids).toEqual([]); expect(trace.detected.routeNames).toEqual([]); @@ -120,7 +124,7 @@ describe('Query / Seed coverage stage', () => { it('produces few or low-scoring candidates for a completely unrelated question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('xyzzy plugh'); + const trace = engine(snapshot).trace('__FPFTEST_NONSENSE_999__'); // Index description overlap may still surface some weak candidates. // The contract is that no candidate scores above the exact-match @@ -259,7 +263,7 @@ describe('Query / Projection stability stage', () => { it('returns low-confidence status for completely unresolvable questions', async () => { const snapshot = await getSnapshot(); - const result = await engine(snapshot).query('xyzzy plugh nonsense', 'compact'); + const result = await engine(snapshot).query('__FPFTEST_NONSENSE_999__', 'compact'); // Weak index-description overlap may still produce ambiguous candidates, // so the engine may return 'ambiguous' or 'not_found'. 
The contract is From 7a69ba0159c6cfa70a3bf37dbf367a01f387d414 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 03:56:05 +0000 Subject: [PATCH 5/6] fix: remove unused firstIndexNodes/secondIndexNodes variables Co-Authored-By: Stanislau --- tests/compiler-contracts.test.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/compiler-contracts.test.ts b/tests/compiler-contracts.test.ts index 5813a4a..5ecfbb7 100644 --- a/tests/compiler-contracts.test.ts +++ b/tests/compiler-contracts.test.ts @@ -288,9 +288,7 @@ describe('Compiler / Snapshot determinism stage', () => { // Verify a structural difference — the added heading should produce at // least one more parsed section or index-map node than the original. const firstSections = first.snapshot.validation.parsedSections; - const firstIndexNodes = Object.keys(first.snapshot.indexMap).length; const secondSections = second.snapshot.validation.parsedSections; - const secondIndexNodes = Object.keys(second.snapshot.indexMap).length; // The synthetic Z.99 heading is parsed as a section (not a pattern — // the compiler only promotes headings that match spec-catalog entries). From 13f7af7e5481addd25c284d0e7263e698517cd07 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 14 Apr 2026 05:43:04 +0000 Subject: [PATCH 6/6] refactor: call stage modules directly in query contract tests Address venikman's review: replace QueryEngine.trace()/query() with direct imports of normalizeQuery, seedCandidates, rankCandidates, expandGrounding, buildPatternAnswer, synthesizeAnswer, confidenceFromTrace, and gapsFromTrace. Each describe block now targets its stage function in isolation so a regression in one stage cannot masquerade as a failure in another. 
Co-Authored-By: Stanislau --- tests/query-contracts.test.ts | 345 +++++++++++++++++++++++++--------- 1 file changed, 258 insertions(+), 87 deletions(-) diff --git a/tests/query-contracts.test.ts b/tests/query-contracts.test.ts index c00574c..174c646 100644 --- a/tests/query-contracts.test.ts +++ b/tests/query-contracts.test.ts @@ -5,15 +5,26 @@ import { resolve } from 'node:path'; import { describe, expect, it } from '@rstest/core'; import { compileFpfSource } from '../src/runtime/compiler.js'; -import { QueryEngine } from '../src/runtime/query-engine.js'; -import type { LocalAnswerSynthesizer, Snapshot } from '../src/runtime/types.js'; +import { normalizeQuery } from '../src/runtime/query-normalizer.js'; +import { seedCandidates } from '../src/runtime/candidate-seeder.js'; +import { isAmbiguous, rankCandidates } from '../src/runtime/candidate-ranker.js'; +import { expandGrounding } from '../src/runtime/frontier-expander.js'; +import { + buildPatternAnswer, + confidenceFromTrace, + gapsFromTrace, + prepareSynthesisSlices, +} from '../src/runtime/answer-projector.js'; +import { synthesizeAnswer } from '../src/runtime/synthesis-adapter.js'; +import { MAX_EXCLUDED } from '../src/runtime/constants.js'; +import type { CompiledNode, LocalAnswerSynthesizer, Snapshot, TraceResult } from '../src/runtime/types.js'; /** * Stage-local contract tests for the query pipeline. * - * Each test targets a specific retrieval stage promise so that a failure - * pinpoints the broken stage rather than surfacing as a generic - * "end-to-end answer is wrong." + * Each describe block targets a single stage function imported directly + * from its module so that a regression in one stage cannot masquerade + * as a failure in another. * * Canonical fixture IDs: `A.1.1`, `A.15`, `B.3` are used as stable spec * anchors throughout these tests. 
If the FPF spec renames or renumbers @@ -39,20 +50,80 @@ async function getSnapshot(): Promise<Snapshot> { return cachedSnapshot; } -function engine(snapshot: Snapshot, synthesizer?: LocalAnswerSynthesizer): QueryEngine { - return new QueryEngine(snapshot, false, synthesizer); +/** + * Assemble a TraceResult from stage outputs, mirroring QueryEngine.trace(). + * Used by projection and synthesis tests so they can feed stage outputs + * forward without routing through QueryEngine. + */ +function assembleTrace( + question: string, + mode: 'compact' | 'verbose' | 'proof', + snapshot: Snapshot, +): TraceResult { + const normalized = normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); + const grounding = expandGrounding( + question, + ranking.candidates, + ranking.initialNodeIds, + ranking.initialAnchorIds, + seeding.frontierCandidates, + seeding.frontierKeys, + snapshot, + ); + + const selectedNodeIds = grounding.selectedNodeIds; + const excludedNodeIds = ranking.candidates + .map((c) => c.nodeId) + .filter((nodeId) => !selectedNodeIds.includes(nodeId)) + .slice(0, MAX_EXCLUDED); + const status = + selectedNodeIds.length === 0 + ? 'not_found' + : ranking.routeWins + ? 'ok' + : isAmbiguous(question, ranking.candidates) + ?
'ambiguous' + : 'ok'; + + return { + mode, + question, + normalizedQuestion: normalized.normalizedQuestion, + detected: normalized.detected, + candidateScores: ranking.candidates.slice(0, 16), + frontierCandidates: seeding.frontierCandidates, + graphExpansions: grounding.graphExpansions, + selectedNodeIds, + selectedAnchorIds: grounding.selectedAnchorIds, + excludedNodeIds, + followedReferences: grounding.followedReferences, + retrievalHops: grounding.retrievalHops, + sessionApplied: seeding.sessionApplied, + sessionReusedNodeIds: [], + sessionMateriallyChanged: false, + sufficient: grounding.sufficient, + routeWins: ranking.routeWins, + status, + snapshot: { + sourceHash: snapshot.sourceHash, + builtAt: snapshot.builtAt, + rebuilt: false, + }, + }; } // --------------------------------------------------------------------------- -// Stage 1: Normalizer +// Stage 1: Normalizer (normalizeQuery) // --------------------------------------------------------------------------- describe('Query / Normalizer stage', () => { it('detects explicit IDs in the question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const normalized = normalizeQuery('What is A.1.1?', snapshot); - expect(trace.detected.ids).toContain('A.1.1'); - expect(trace.normalizedQuestion.length).toBeGreaterThan(0); + expect(normalized.detected.ids).toContain('A.1.1'); + expect(normalized.normalizedQuestion.length).toBeGreaterThan(0); }); it('detects route names when mentioned in the question', async () => { @@ -61,42 +132,42 @@ describe('Query / Normalizer stage', () => { expect(routeNames.length).toBeGreaterThan(0); const firstRoute = routeNames[0]!; - const trace = engine(snapshot).trace(`Tell me about the ${firstRoute} route`); - expect(trace.detected.routeNames).toContain(firstRoute); + const normalized = normalizeQuery(`Tell me about the ${firstRoute} route`, snapshot); + expect(normalized.detected.routeNames).toContain(firstRoute); }); 
it('detects status terms present in the status index', async () => { const snapshot = await getSnapshot(); - // The normalizer only detects these fixed tokens. const knownTokens = ['draft', 'stable', 'stub', 'transitional']; const matchedToken = knownTokens.find( (t) => snapshot.indexes.statusIndex[t] !== undefined, ); expect(matchedToken).toBeDefined(); - const trace = engine(snapshot).trace(`Show me ${matchedToken} patterns`); - expect(trace.detected.statusTerms).toContain(matchedToken); + const normalized = normalizeQuery(`Show me ${matchedToken} patterns`, snapshot); + expect(normalized.detected.statusTerms).toContain(matchedToken); }); it('returns empty signals for a nonsense question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('__FPFTEST_NONSENSE_999__'); + const normalized = normalizeQuery('__FPFTEST_NONSENSE_999__', snapshot); - expect(trace.detected.ids).toEqual([]); - expect(trace.detected.routeNames).toEqual([]); + expect(normalized.detected.ids).toEqual([]); + expect(normalized.detected.routeNames).toEqual([]); }); }); // --------------------------------------------------------------------------- -// Stage 2: Candidate seeder +// Stage 2: Candidate seeder (seedCandidates) // --------------------------------------------------------------------------- describe('Query / Seed coverage stage', () => { it('seeds exact-match candidates when explicit IDs are in the question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const normalized = normalizeQuery('What is A.1.1?', snapshot); + const seeding = seedCandidates(normalized, snapshot); - const exactCandidate = trace.candidateScores.find((c) => c.nodeId === 'A.1.1'); + const exactCandidate = seeding.candidateMap.get('A.1.1'); expect(exactCandidate).toBeDefined(); expect(exactCandidate!.reasons).toContain('exact-id'); expect(exactCandidate!.score).toBeGreaterThanOrEqual(100); @@ -104,19 +175,22 @@ 
describe('Query / Seed coverage stage', () => { it('seeds lexical candidates for keyword-rich queries', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('How does bounded context relate to role assignment?'); + const normalized = normalizeQuery('How does bounded context relate to role assignment?', snapshot); + const seeding = seedCandidates(normalized, snapshot); - const lexicalFrontier = trace.frontierCandidates.filter((c) => c.origin === 'lexical'); + const lexicalFrontier = seeding.frontierCandidates.filter((c) => c.origin === 'lexical'); expect(lexicalFrontier.length).toBeGreaterThan(0); }); it('seeds route expansion candidates for route-bearing queries', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace( + const normalized = normalizeQuery( 'What is the first practical route when vocabulary is overloaded across teams?', + snapshot, ); + const seeding = seedCandidates(normalized, snapshot); - const routeFrontier = trace.frontierCandidates.filter( + const routeFrontier = seeding.frontierCandidates.filter( (c) => c.origin === 'route_expansion', ); expect(routeFrontier.length).toBeGreaterThan(0); @@ -124,46 +198,49 @@ describe('Query / Seed coverage stage', () => { it('produces few or low-scoring candidates for a completely unrelated question', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('__FPFTEST_NONSENSE_999__'); + const normalized = normalizeQuery('__FPFTEST_NONSENSE_999__', snapshot); + const seeding = seedCandidates(normalized, snapshot); - // Index description overlap may still surface some weak candidates. - // The contract is that no candidate scores above the exact-match - // threshold (100) and total count stays low relative to the full catalog. 
- const highScoring = trace.candidateScores.filter((c) => c.score >= 100); + const highScoring = Array.from(seeding.candidateMap.values()).filter((c) => c.score >= 100); expect(highScoring.length).toBe(0); - expect(trace.frontierCandidates.length).toBeLessThan( + expect(seeding.frontierCandidates.length).toBeLessThan( Object.keys(snapshot.compiledNodes).length / 2, ); }); }); // --------------------------------------------------------------------------- -// Stage 3: Candidate ranker +// Stage 3: Candidate ranker (rankCandidates) // --------------------------------------------------------------------------- describe('Query / Ranker stage', () => { it('ranks exact-ID matches above lexical matches', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const normalized = normalizeQuery('What is A.1.1?', snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates('What is A.1.1?', seeding.candidateMap, snapshot); - const scores = trace.candidateScores; - expect(scores.length).toBeGreaterThan(0); - expect(scores[0]!.nodeId).toBe('A.1.1'); + expect(ranking.candidates.length).toBeGreaterThan(0); + expect(ranking.candidates[0]!.nodeId).toBe('A.1.1'); }); it('selects the expected initial node IDs for an explicit ID query', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const normalized = normalizeQuery('What is A.1.1?', snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates('What is A.1.1?', seeding.candidateMap, snapshot); - expect(trace.selectedNodeIds).toContain('A.1.1'); + expect(ranking.initialNodeIds).toContain('A.1.1'); }); it('selects a route node when route intent is clear', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace( - 'What is the first practical route when vocabulary is overloaded across teams?', - ); + const 
question = 'What is the first practical route when vocabulary is overloaded across teams?'; + const normalized = normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); - const routeNodes = trace.selectedNodeIds.filter( + expect(ranking.routeWins).toBe(true); + const routeNodes = ranking.initialNodeIds.filter( (id) => snapshot.compiledNodes[id]?.kind === 'route', ); expect(routeNodes.length).toBeGreaterThan(0); @@ -171,63 +248,103 @@ describe('Query / Ranker stage', () => { }); // --------------------------------------------------------------------------- -// Stage 4: Frontier expansion bounds +// Stage 4: Frontier expansion (expandGrounding) // --------------------------------------------------------------------------- describe('Query / Frontier expansion stage', () => { it('respects the MAX_HOPS budget (≤6 retrieval hops)', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace( - 'How do U.RoleAssignment, U.BoundedContext, and U.RoleStateGraph connect in a lawful workflow?', + const question = 'How do U.RoleAssignment, U.BoundedContext, and U.RoleStateGraph connect in a lawful workflow?'; + const normalized = normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); + const grounding = expandGrounding( + question, + ranking.candidates, + ranking.initialNodeIds, + ranking.initialAnchorIds, + seeding.frontierCandidates, + seeding.frontierKeys, + snapshot, ); - expect(trace.retrievalHops.length).toBeLessThanOrEqual(6); + expect(grounding.retrievalHops.length).toBeLessThanOrEqual(6); }); it('respects the MAX_SELECTED_ANCHORS budget (≤12 anchors)', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const question = 'What is A.1.1?'; + const normalized = 
normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); + const grounding = expandGrounding( + question, + ranking.candidates, + ranking.initialNodeIds, + ranking.initialAnchorIds, + seeding.frontierCandidates, + seeding.frontierKeys, + snapshot, + ); - expect(trace.selectedAnchorIds.length).toBeLessThanOrEqual(12); + expect(grounding.selectedAnchorIds.length).toBeLessThanOrEqual(12); }); it('records hop metadata (iteration, reason, added nodes/anchors)', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace( - 'How do A.1.1, A.15, and B.3 connect in a lawful workflow?', + const question = 'How do A.1.1, A.15, and B.3 connect in a lawful workflow?'; + const normalized = normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); + const grounding = expandGrounding( + question, + ranking.candidates, + ranking.initialNodeIds, + ranking.initialAnchorIds, + seeding.frontierCandidates, + seeding.frontierKeys, + snapshot, ); - // If the engine already considers the grounding sufficient before any - // expansion, hops will be empty — that's valid behavior, not a test failure. - if (trace.retrievalHops.length > 0) { - const firstHop = trace.retrievalHops[0]!; + if (grounding.retrievalHops.length > 0) { + const firstHop = grounding.retrievalHops[0]!; expect(firstHop.iteration).toBe(1); expect(firstHop.reason.length).toBeGreaterThan(0); expect(typeof firstHop.sufficientAfter).toBe('boolean'); } else { - // No hops means grounding was already sufficient from initial selection. 
- expect(trace.sufficient).toBe(true); + expect(grounding.sufficient).toBe(true); } }); it('marks sufficiency correctly — sufficient traces have anchors', async () => { const snapshot = await getSnapshot(); - const trace = engine(snapshot).trace('What is A.1.1?'); + const question = 'What is A.1.1?'; + const normalized = normalizeQuery(question, snapshot); + const seeding = seedCandidates(normalized, snapshot); + const ranking = rankCandidates(question, seeding.candidateMap, snapshot); + const grounding = expandGrounding( + question, + ranking.candidates, + ranking.initialNodeIds, + ranking.initialAnchorIds, + seeding.frontierCandidates, + seeding.frontierKeys, + snapshot, + ); - expect(trace.sufficient).toBe(true); - expect(trace.selectedAnchorIds.length).toBeGreaterThan(0); + expect(grounding.sufficient).toBe(true); + expect(grounding.selectedAnchorIds.length).toBeGreaterThan(0); }); }); // --------------------------------------------------------------------------- -// Stage 5: Answer projection stability +// Stage 5: Answer projection (buildPatternAnswer / buildRouteAnswer / confidenceFromTrace) // --------------------------------------------------------------------------- describe('Query / Projection stability stage', () => { - it('produces stable support set across repeated queries', async () => { + it('produces stable support set across repeated stage invocations', async () => { const snapshot = await getSnapshot(); - const eng = engine(snapshot); - const trace1 = eng.trace('What is A.1.1?'); - const trace2 = eng.trace('What is A.1.1?'); + const trace1 = assembleTrace('What is A.1.1?', 'compact', snapshot); + const trace2 = assembleTrace('What is A.1.1?', 'compact', snapshot); expect(trace1.selectedNodeIds).toEqual(trace2.selectedNodeIds); expect(trace1.selectedAnchorIds).toEqual(trace2.selectedAnchorIds); @@ -238,7 +355,8 @@ describe('Query / Projection stability stage', () => { it('projects a non-empty answer with citations for a known pattern query', async 
() => { const snapshot = await getSnapshot(); - const result = await engine(snapshot).query('What is A.1.1?', 'verbose'); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const result = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); expect(result.status).toBe('ok'); expect(result.answer.length).toBeGreaterThan(0); @@ -248,37 +366,61 @@ describe('Query / Projection stability stage', () => { it('projects constraints for verbose mode', async () => { const snapshot = await getSnapshot(); - const result = await engine(snapshot).query('What is A.1.1?', 'verbose'); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const result = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); expect(result.constraints.length).toBeGreaterThanOrEqual(1); }); it('projects a grounding chain in proof mode', async () => { const snapshot = await getSnapshot(); - const result = await engine(snapshot).query('What is A.1.1?', 'proof'); + const trace = assembleTrace('What is A.1.1?', 'proof', snapshot); + const result = buildPatternAnswer('What is A.1.1?', 'proof', trace, snapshot, false); expect(result.groundingChain).toBeDefined(); expect(result.groundingChain!.length).toBeGreaterThan(0); }); - it('returns low-confidence status for completely unresolvable questions', async () => { + it('returns low confidence for completely unresolvable questions', async () => { + const snapshot = await getSnapshot(); + const trace = assembleTrace('__FPFTEST_NONSENSE_999__', 'compact', snapshot); + + expect(['not_found', 'ambiguous']).toContain(trace.status); + expect(confidenceFromTrace(trace)).toBeLessThan(0.7); + }); + + it('computes confidence via confidenceFromTrace without QueryEngine', async () => { + const snapshot = await getSnapshot(); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + + const confidence = confidenceFromTrace(trace); + expect(confidence).toBeGreaterThan(0.5); + 
expect(confidence).toBeLessThanOrEqual(1); + }); + + it('computes gaps via gapsFromTrace without QueryEngine', async () => { const snapshot = await getSnapshot(); - const result = await engine(snapshot).query('__FPFTEST_NONSENSE_999__', 'compact'); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); - // Weak index-description overlap may still produce ambiguous candidates, - // so the engine may return 'ambiguous' or 'not_found'. The contract is - // that confidence stays below the high-confidence threshold. - expect(['not_found', 'ambiguous']).toContain(result.status); - expect(result.confidence).toBeLessThan(0.7); + const gaps = gapsFromTrace(trace); + expect(Array.isArray(gaps)).toBe(true); }); }); // --------------------------------------------------------------------------- -// Stage 6: Synthesis isolation +// Stage 6: Synthesis isolation (synthesizeAnswer) // --------------------------------------------------------------------------- describe('Query / Synthesis isolation stage', () => { it('returns deterministic answer when synthesizer is unavailable', async () => { const snapshot = await getSnapshot(); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const deterministicResult = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); + const nodes = trace.selectedNodeIds + .map((nodeId) => snapshot.compiledNodes[nodeId]) + .filter((node): node is CompiledNode => Boolean(node)) + .slice(0, 8); + const slices = prepareSynthesisSlices(trace, snapshot); + const unavailable: LocalAnswerSynthesizer = { isAvailable: async () => false, synthesize: async () => { @@ -286,7 +428,9 @@ describe('Query / Synthesis isolation stage', () => { }, }; - const result = await engine(snapshot, unavailable).query('What is A.1.1?', 'verbose'); + const result = await synthesizeAnswer( + 'What is A.1.1?', 'verbose', trace, nodes, slices, deterministicResult, unavailable, + ); expect(result.status).toBe('ok'); 
expect(result.ids).toContain('A.1.1'); @@ -295,6 +439,14 @@ describe('Query / Synthesis isolation stage', () => { it('falls back to deterministic answer when synthesizer throws', async () => { const snapshot = await getSnapshot(); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const deterministicResult = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); + const nodes = trace.selectedNodeIds + .map((nodeId) => snapshot.compiledNodes[nodeId]) + .filter((node): node is CompiledNode => Boolean(node)) + .slice(0, 8); + const slices = prepareSynthesisSlices(trace, snapshot); + const failing: LocalAnswerSynthesizer = { isAvailable: async () => true, synthesize: async () => { @@ -302,7 +454,9 @@ describe('Query / Synthesis isolation stage', () => { }, }; - const result = await engine(snapshot, failing).query('What is A.1.1?', 'verbose'); + const result = await synthesizeAnswer( + 'What is A.1.1?', 'verbose', trace, nodes, slices, deterministicResult, failing, + ); expect(result.status).toBe('ok'); expect(result.ids).toContain('A.1.1'); @@ -311,8 +465,13 @@ describe('Query / Synthesis isolation stage', () => { it('does not alter deterministic IDs or citations when synthesis fails', async () => { const snapshot = await getSnapshot(); - const eng = engine(snapshot); - const deterministicResult = await eng.query('What is A.1.1?', 'verbose'); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const deterministicResult = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); + const nodes = trace.selectedNodeIds + .map((nodeId) => snapshot.compiledNodes[nodeId]) + .filter((node): node is CompiledNode => Boolean(node)) + .slice(0, 8); + const slices = prepareSynthesisSlices(trace, snapshot); const failing: LocalAnswerSynthesizer = { isAvailable: async () => true, @@ -320,7 +479,10 @@ describe('Query / Synthesis isolation stage', () => { throw new Error('test failure'); }, }; - const 
failedSynthResult = await engine(snapshot, failing).query('What is A.1.1?', 'verbose'); + + const failedSynthResult = await synthesizeAnswer( + 'What is A.1.1?', 'verbose', trace, nodes, slices, deterministicResult, failing, + ); expect(failedSynthResult.ids).toEqual(deterministicResult.ids); expect(failedSynthResult.citations).toEqual(deterministicResult.citations); @@ -329,6 +491,14 @@ describe('Query / Synthesis isolation stage', () => { it('does not call synthesize when synthesizer reports unavailable', async () => { const snapshot = await getSnapshot(); + const trace = assembleTrace('What is A.1.1?', 'verbose', snapshot); + const deterministicResult = buildPatternAnswer('What is A.1.1?', 'verbose', trace, snapshot, false); + const nodes = trace.selectedNodeIds + .map((nodeId) => snapshot.compiledNodes[nodeId]) + .filter((node): node is CompiledNode => Boolean(node)) + .slice(0, 8); + const slices = prepareSynthesisSlices(trace, snapshot); + let synthesizeCalled = false; const unavailable: LocalAnswerSynthesizer = { isAvailable: async () => false, @@ -338,22 +508,23 @@ describe('Query / Synthesis isolation stage', () => { }, }; - await engine(snapshot, unavailable).query('What is A.1.1?', 'compact'); + await synthesizeAnswer( + 'What is A.1.1?', 'compact', trace, nodes, slices, deterministicResult, unavailable, + ); expect(synthesizeCalled).toBe(false); }); }); // --------------------------------------------------------------------------- -// Trace determinism (cross-cutting) +// Trace determinism (cross-cutting — assembled from stages, not QueryEngine) // --------------------------------------------------------------------------- describe('Query / Trace determinism', () => { - it('same snapshot + same query → identical trace structure', async () => { + it('same snapshot + same question → identical assembled trace', async () => { const snapshot = await getSnapshot(); - const eng = engine(snapshot); - const trace1 = eng.trace('How does bounded context relate to 
role assignment?', 'verbose'); - const trace2 = eng.trace('How does bounded context relate to role assignment?', 'verbose'); + const trace1 = assembleTrace('How does bounded context relate to role assignment?', 'verbose', snapshot); + const trace2 = assembleTrace('How does bounded context relate to role assignment?', 'verbose', snapshot); expect(JSON.stringify(trace1)).toBe(JSON.stringify(trace2)); });