From 123be8d5fffd3ccbcac8c21abc4095e82ff609ec Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:09:26 +0100 Subject: [PATCH 1/3] feat(retrieval): content-aware re-ranker over strategy-selected candidates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds pkg/retrieval/rerank.go: a ReRanker that takes the strategy's selected section IDs plus the first ~2000 chars of each section's body and issues one short LLM call to score relevance on a 0-100 scale. Sorted descending by score; ties preserve the strategy's original order via a stable insertion sort. Failure semantics deliberately tolerant — every input ID surfaces in the output exactly once: - Empty candidate list short-circuits without an LLM call. - Transport failure returns input order + non-nil error. - All retries returning bad JSON degrade to input order + nil error (mirrors runSelectionWithRetry). - Hallucinated IDs are dropped silently. - Missing IDs receive score=0 at the bottom of the output. Prompt scoring rubric is anchored to four bands so the model returns coarse-but-meaningful integers rather than a uniform 50 across the board: 90-100 directly answers, 60-89 contributes evidence, 30-59 tangential, 0-29 not useful. ParseReRank reuses the tolerant code-fence + prose-stripping pattern from ParseSelection / ParsePlan. 16 table-driven tests cover happy path, empty input, transport failure, parse-retry exhaustion, parse-then-succeed, hallucinated IDs, missing IDs, duplicate IDs, negative-score clamp, prompt content+truncation, nil LLM no-op, nil receiver no-op, and the parser contract (code fence, leading prose, empty input). --- pkg/retrieval/rerank.go | 431 +++++++++++++++++++++++++++++ pkg/retrieval/rerank_test.go | 514 +++++++++++++++++++++++++++++++++++ 2 files changed, 945 insertions(+) create mode 100644 pkg/retrieval/rerank.go create mode 100644 pkg/retrieval/rerank_test.go diff --git a/pkg/retrieval/rerank.go b/pkg/retrieval/rerank.go new file mode 100644 index 0000000..c32363b --- /dev/null +++ b/pkg/retrieval/rerank.go @@ -0,0 +1,431 @@ +package retrieval + +import ( + "context" + "encoding/json" + "fmt" + "log" + "strings" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// ScoredSection is one entry in the re-ranker's output: a section ID, +// the model's relevance score (0-100), and an optional short reason. +// +// Scores are returned as float64 so the caller can sort stably and so +// future re-rankers (e.g. combining the model score with a cheap +// lexical prior) don't have to re-encode the field. The 0-100 scale is +// what the prompt asks the model to use; rerank.go preserves whatever +// the model returns (clamped to non-negative) rather than rescaling. +type ScoredSection struct { + ID tree.SectionID `json:"id"` + Score float64 `json:"score"` + Reason string `json:"reason,omitempty"` +} + +// SectionContent is one candidate passed to the re-ranker. The caller is +// responsible for loading Content from storage; the re-ranker does not +// touch the storage layer itself. +// +// Title is included separately from Content so the prompt can present +// both even when the section's body is empty (e.g. a structural-only +// section whose children carry the real text). Models tend to ignore a +// section whose body is blank if the title isn't surfaced explicitly. +type SectionContent struct { + ID tree.SectionID + Title string + Content string +} + +// ReRanker re-orders a strategy's candidate sections by reading the first +// chunk of each section's content and asking the LLM which sections +// actually answer the query. This rescues the case where the retrieval +// strategy reasoned over titles + summaries + HyDE candidate questions +// alone and got fooled by surface-level signals. +// +// One LLM call per query, regardless of candidate count. Cost is bounded +// by MaxContentChars per section × candidate count, plus a small +// prompt overhead. +// +// Re-rank is intentionally tolerant about model output: bad JSON, +// unknown IDs, and missing IDs are all handled gracefully so a single +// model blip never drops a candidate. See ReRank for the exact contract. +type ReRanker struct { + // LLM is the client used for the re-rank call. + LLM llmgate.Client + + // Model is the model name passed to the LLM. Callers should point + // this at a small/fast model — the re-rank call is short and + // running it on the flagship model would defeat the cost story. + Model string + + // MaxContentChars caps how many characters of each section's content + // are sent to the model. Default: defaultReRankMaxContentChars. + // Set higher when sections are long and the query needs more + // context to decide; set lower to tighten the budget. + MaxContentChars int + + // MaxRetries bounds JSON-parse retries on a single re-rank call. + // Zero defaults to defaultReRankRetries. + MaxRetries int +} + +// defaultReRankMaxContentChars is the per-section content budget. ~2000 +// chars × 5 candidates ≈ 10k chars ≈ 2.5k tokens — comfortable inside +// any modern model's context window and cheap on gemini-2.5-flash. +const defaultReRankMaxContentChars = 2000 + +// defaultReRankRetries mirrors the planning + selection retry counts. +// Models occasionally drop out of JSON mode; one retry usually recovers. +const defaultReRankRetries = 2 + +// NewReRanker constructs a ReRanker with sensible defaults. Callers can +// override MaxContentChars/MaxRetries on the returned struct. +func NewReRanker(client llmgate.Client, model string) *ReRanker { + return &ReRanker{ + LLM: client, + Model: model, + MaxContentChars: defaultReRankMaxContentChars, + MaxRetries: defaultReRankRetries, + } +} + +// ReRank scores and reorders candidates by relevance to query. Returns +// a slice the same length as candidates (in descending score order), +// the cumulative LLM Usage, and an optional error. +// +// Failure semantics — the whole point of this method is to be safer +// than a hard re-rank that can drop sections on model flakes: +// +// - Empty candidates → returns (nil, zero Usage, nil) without any +// LLM call. Callers can pass an empty list unconditionally. +// - LLM transport failure → returns the input order (each ID with +// score=0) plus a non-nil error. The caller logs and keeps moving. +// - All retry attempts return un-parseable JSON → returns the input +// order with score=0 and a nil error. This mirrors how +// runSelectionWithRetry degrades: a single JSON glitch must not +// 500 the request. +// - Response references unknown IDs → those entries are dropped; +// only IDs present in candidates surface. +// - Response is missing some input IDs → those IDs get score=0 and +// appear at the bottom of the output in their original relative +// order. +// +// In all cases, every input ID appears in the output exactly once. +// This is the load-bearing invariant: re-rank can reorder, but it +// never drops candidates. +func (r *ReRanker) ReRank(ctx context.Context, query string, candidates []SectionContent) ([]ScoredSection, Usage, error) { + if r == nil || r.LLM == nil { + // A nil re-ranker is treated as a no-op (returns input order) + // so production wiring can pass nil when re-rank is disabled. + return inputOrderScored(candidates), Usage{}, nil + } + if len(candidates) == 0 { + return nil, Usage{}, nil + } + + maxChars := r.MaxContentChars + if maxChars <= 0 { + maxChars = defaultReRankMaxContentChars + } + maxRetries := r.MaxRetries + if maxRetries < 0 { + maxRetries = 0 + } + if maxRetries == 0 { + maxRetries = defaultReRankRetries + } + + prompt := buildReRankPrompt(query, candidates, maxChars) + baseReq := llmgate.Request{ + Model: r.Model, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: reRankSystemPrompt}, + {Role: llmgate.RoleUser, Content: prompt}, + }, + MaxTokens: 1024, + Temperature: 0, + JSONMode: true, + JSONSchema: []byte(reRankJSONSchema), + } + + scored, usage, err := runReRankWithRetry(ctx, r.LLM, baseReq, maxRetries) + if err != nil { + // Transport failure: preserve input order, surface the error + // so the caller can decide whether to log loud or quiet. + return inputOrderScored(candidates), usage, fmt.Errorf("rerank llm call: %w", err) + } + if scored == nil { + // All retries failed to parse. Degrade gracefully — same shape + // as a nil-rerank result so the response stays consistent. + return inputOrderScored(candidates), usage, nil + } + + return mergeScored(candidates, scored), usage, nil +} + +// reRankSystemPrompt frames the task. The 0-100 scale was picked over +// 0-1 because models are noticeably better at returning a coarse +// integer score than a fine-grained float, and the downstream code +// only needs ordering. "The answer is in this section" is deliberately +// phrased as "directly answers OR provides the load-bearing evidence +// for" — the goal is to surface sections that close out the query, not +// sections that merely mention the topic. +const reRankSystemPrompt = `You are a precise relevance scorer. Given a user query and a list of candidate document sections (each shown with its ID, title, and the first portion of its content), score how well each section actually answers the query. + +Rules: +- Score each section on a 0-100 integer scale where: + - 90-100: the section directly answers the query OR provides the load-bearing evidence the answer relies on. + - 60-89: the section is highly relevant — it discusses the topic the query is about and likely contributes to the answer. + - 30-59: the section is tangentially related — it mentions the topic but probably does not answer the query. + - 0-29: the section is not useful for this query. Generic mentions, off-topic content, or content that only matches on a shared keyword without real relevance. +- Score every section in the input list. Do not skip any. +- Use the section IDs exactly as provided. Do not invent IDs. +- For each score include a one-line "reason" (≤120 chars) explaining the score. Keep it concrete — quote a phrase from the section when possible. + +Return only the JSON object described in the schema. No prose, no markdown.` + +const reRankJSONSchema = `{ + "type": "object", + "properties": { + "scores": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": {"type": "string"}, + "score": {"type": "number"}, + "reason": {"type": "string"} + }, + "required": ["id", "score"] + } + } + }, + "required": ["scores"] +}` + +// buildReRankPrompt renders the user message. Sections are presented as +// "[id] Title\nContent excerpt" blocks, separated by blank lines so the +// model sees clear boundaries. Content is truncated at maxChars with an +// ellipsis when cut so the model can tell a section was long without +// having to count. +func buildReRankPrompt(query string, candidates []SectionContent, maxChars int) string { + var b strings.Builder + b.WriteString("User query:\n") + b.WriteString(query) + b.WriteString("\n\nCandidate sections:\n") + + for i, c := range candidates { + if i > 0 { + b.WriteString("\n") + } + fmt.Fprintf(&b, "[%s] %s\n", string(c.ID), c.Title) + excerpt := strings.TrimSpace(c.Content) + if excerpt == "" { + b.WriteString("(section body is empty)\n") + continue + } + if len(excerpt) > maxChars { + excerpt = excerpt[:maxChars] + "…" + } + b.WriteString(excerpt) + b.WriteByte('\n') + } + + b.WriteString("\nReturn a JSON object with a `scores` array. Each entry has `id` (string, exactly as shown above), `score` (number, 0-100), and `reason` (string, ≤120 chars). Score every section in the candidate list.") + return b.String() +} + +// reRankPayload is the expected JSON shape. +type reRankPayload struct { + Scores []reRankItem `json:"scores"` +} + +type reRankItem struct { + ID string `json:"id"` + Score float64 `json:"score"` + Reason string `json:"reason"` +} + +// ParseReRank extracts a re-rank result from an LLM JSON response. +// Tolerates code-fence wrappers and leading/trailing prose, like +// ParseSelection and ParsePlan. Returns the parsed []ScoredSection +// (un-merged with the input candidate list — that's mergeScored's job) +// and any parse error. +func ParseReRank(raw string) ([]ScoredSection, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, fmt.Errorf("empty rerank response") + } + if strings.HasPrefix(raw, "```") { + if i := strings.Index(raw, "\n"); i >= 0 { + raw = raw[i+1:] + } + raw = strings.TrimSuffix(raw, "```") + raw = strings.TrimSpace(raw) + } + if i := strings.Index(raw, "{"); i > 0 { + raw = raw[i:] + } + if j := strings.LastIndex(raw, "}"); j >= 0 && j < len(raw)-1 { + raw = raw[:j+1] + } + + var p reRankPayload + if err := json.Unmarshal([]byte(raw), &p); err != nil { + return nil, fmt.Errorf("unmarshal rerank: %w", err) + } + out := make([]ScoredSection, 0, len(p.Scores)) + for _, it := range p.Scores { + id := strings.TrimSpace(it.ID) + if id == "" { + continue + } + score := it.Score + if score < 0 { + score = 0 + } + out = append(out, ScoredSection{ + ID: tree.SectionID(id), + Score: score, + Reason: strings.TrimSpace(it.Reason), + }) + } + return out, nil +} + +// runReRankWithRetry issues the re-rank LLM call, retrying up to +// maxRetries additional times when the response does not parse. Mirrors +// runSelectionWithRetry / runPlanningWithRetry. Returns (nil, usage, nil) +// when retries are exhausted so the caller falls back to input order. +func runReRankWithRetry(ctx context.Context, client llmgate.Client, baseReq llmgate.Request, maxRetries int) ([]ScoredSection, Usage, error) { + var totalUsage Usage + var lastParseErr error + for attempt := 0; attempt <= maxRetries; attempt++ { + req := baseReq + if attempt > 0 { + msgs := make([]llmgate.Message, len(baseReq.Messages)) + copy(msgs, baseReq.Messages) + tail := len(msgs) - 1 + msgs[tail] = llmgate.Message{ + Role: msgs[tail].Role, + Content: msgs[tail].Content + "\n\nIMPORTANT: respond with ONLY a JSON object matching the schema. Do not include prose, explanation, or markdown fences.", + } + req.Messages = msgs + } + resp, err := client.Complete(ctx, req) + if err != nil { + return nil, totalUsage, err + } + totalUsage.Add(Usage{ + InputTokens: resp.Usage.InputTokens, + OutputTokens: resp.Usage.OutputTokens, + TotalTokens: resp.Usage.TotalTokens, + CostUSD: resp.Usage.CostUSD, + LLMCalls: 1, + }) + scored, parseErr := ParseReRank(resp.Content) + if parseErr == nil { + return scored, totalUsage, nil + } + lastParseErr = parseErr + } + log.Printf("retrieval: rerank parse failed after %d attempts (%v); preserving input order", maxRetries+1, lastParseErr) + return nil, totalUsage, nil +} + +// inputOrderScored returns the candidates as ScoredSection entries with +// score=0, preserving the input order. Used as the safe fallback when +// the re-ranker can't produce a real score for any reason. +func inputOrderScored(candidates []SectionContent) []ScoredSection { + if len(candidates) == 0 { + return nil + } + out := make([]ScoredSection, len(candidates)) + for i, c := range candidates { + out[i] = ScoredSection{ID: c.ID, Score: 0} + } + return out +} + +// mergeScored combines the input candidates with the model's scored +// output. Every input ID appears in the result exactly once: +// +// - Known IDs (present in both input + model output) carry the +// model's score and reason, sorted descending by score. +// - Unknown IDs from the model output are dropped — the model +// hallucinated. +// - Input IDs missing from the model output get score=0 and appear +// at the bottom in input order (so the response is still useful +// to the caller). +// +// Ties on score preserve the original input order. This makes the +// output deterministic when the model returns equal scores and keeps +// reasonable behaviour when the model returns uniform scores (e.g. +// "everything is a 50") — the strategy's original order wins. +func mergeScored(candidates []SectionContent, scored []ScoredSection) []ScoredSection { + if len(candidates) == 0 { + return nil + } + // Position of each input ID, for stable ordering on ties. + pos := make(map[tree.SectionID]int, len(candidates)) + for i, c := range candidates { + pos[c.ID] = i + } + + // Index scored entries by ID. If the model returned the same ID + // twice, the first wins — defensive against duplicate entries. + byID := make(map[tree.SectionID]ScoredSection, len(scored)) + for _, s := range scored { + if _, known := pos[s.ID]; !known { + continue // hallucinated ID + } + if _, seen := byID[s.ID]; seen { + continue + } + byID[s.ID] = s + } + + out := make([]ScoredSection, 0, len(candidates)) + missing := make([]ScoredSection, 0) + for _, c := range candidates { + if s, ok := byID[c.ID]; ok { + out = append(out, s) + } else { + missing = append(missing, ScoredSection{ID: c.ID, Score: 0}) + } + } + + // Stable descending sort by score with original-order tiebreak. + // Hand-rolled because we want strict stability across equal + // scores and Go's sort.Slice is not stable. + insertionSortByScore(out, pos) + + // Append missing IDs (all score=0) at the bottom, in input order. + out = append(out, missing...) + return out +} + +// insertionSortByScore sorts ss descending by Score, with ties broken +// by the input position recorded in pos (lower pos → earlier). O(n²) +// is fine here: re-rank candidates are typically ≤20. +func insertionSortByScore(ss []ScoredSection, pos map[tree.SectionID]int) { + for i := 1; i < len(ss); i++ { + cur := ss[i] + curPos := pos[cur.ID] + j := i - 1 + for j >= 0 { + cmp := ss[j] + cmpPos := pos[cmp.ID] + if cmp.Score > cur.Score || (cmp.Score == cur.Score && cmpPos <= curPos) { + break + } + ss[j+1] = ss[j] + j-- + } + ss[j+1] = cur + } +} diff --git a/pkg/retrieval/rerank_test.go b/pkg/retrieval/rerank_test.go new file mode 100644 index 0000000..d20eadb --- /dev/null +++ b/pkg/retrieval/rerank_test.go @@ -0,0 +1,514 @@ +package retrieval_test + +import ( + "context" + "encoding/json" + "errors" + "strings" + "sync" + "sync/atomic" + "testing" + + "github.com/hallelx2/llmgate" + + "github.com/hallelx2/vectorless-engine/pkg/retrieval" + "github.com/hallelx2/vectorless-engine/pkg/tree" +) + +// contains is a tiny shim so we don't have to litter assertions with +// strings.Contains; the external _test package cannot share span_test.go's +// internal helper. +func contains(s, sub string) bool { return strings.Contains(s, sub) } + +// rerankMock is a minimal llmgate client that returns scripted replies +// in order. Each Complete() call advances the counter; when the +// scripted replies are exhausted the last reply is reused so a +// single-element script behaves like a fixed response. +// +// Kept separate from planner/single_pass mocks because the re-rank +// schema is distinct and the test surface is small enough that a +// dedicated mock keeps the assertions simple. +type rerankMock struct { + mu sync.Mutex + replies []string + err error + + calls int32 + prompts []string +} + +func (m *rerankMock) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + m.mu.Lock() + defer m.mu.Unlock() + atomic.AddInt32(&m.calls, 1) + for _, msg := range req.Messages { + if msg.Role == llmgate.RoleUser { + m.prompts = append(m.prompts, msg.Content) + } + } + if m.err != nil { + return nil, m.err + } + if len(m.replies) == 0 { + return &llmgate.Response{}, nil + } + idx := int(atomic.LoadInt32(&m.calls)) - 1 + if idx >= len(m.replies) { + idx = len(m.replies) - 1 + } + return &llmgate.Response{ + Content: m.replies[idx], + Usage: llmgate.Usage{ + InputTokens: 100, + OutputTokens: 30, + TotalTokens: 130, + CostUSD: 0.0003, + }, + }, nil +} + +func (m *rerankMock) CountTokens(ctx context.Context, s string) (int, error) { + return len(s) / 4, nil +} + +// scoreReply marshals a list of (id, score, reason) tuples into the +// re-rank JSON envelope. +func scoreReply(items ...rerankReplyItem) string { + type payload struct { + Scores []rerankReplyItem `json:"scores"` + } + raw, _ := json.Marshal(payload{Scores: items}) + return string(raw) +} + +type rerankReplyItem struct { + ID string `json:"id"` + Score float64 `json:"score"` + Reason string `json:"reason,omitempty"` +} + +// sampleCandidates builds a small candidate list used by most tests. +// "sec-1" / "sec-2" / "sec-3" — short stable IDs so assertions are +// readable. +func sampleCandidates() []retrieval.SectionContent { + return []retrieval.SectionContent{ + {ID: tree.SectionID("sec-1"), Title: "Long-Term Debt", Content: "The company reported long-term debt of $4.2B as of Q4 2024."}, + {ID: tree.SectionID("sec-2"), Title: "Revenue Breakdown", Content: "Apple's fiscal 2023 revenue was $383.3B, down 2.8% YoY."}, + {ID: tree.SectionID("sec-3"), Title: "Risk Factors", Content: "Foreign currency translation may impact future revenue."}, + } +} + +// TestReRanker_HappyPath: model returns reordered scores, output is +// sorted descending by score. +func TestReRanker_HappyPath(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + // sec-2 is the most relevant (92), sec-3 next (45), sec-1 last (10). + // Strategy returned them in 1/2/3 order — re-rank should flip it. + scoreReply( + rerankReplyItem{ID: "sec-1", Score: 10, Reason: "long-term debt not relevant to revenue query"}, + rerankReplyItem{ID: "sec-2", Score: 92, Reason: "directly states fiscal 2023 revenue"}, + rerankReplyItem{ID: "sec-3", Score: 45, Reason: "tangential mention of revenue"}, + ), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + + got, usage, err := r.ReRank(context.Background(), "What was Apple's fiscal 2023 revenue?", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if len(got) != 3 { + t.Fatalf("len(got) = %d, want 3", len(got)) + } + wantOrder := []tree.SectionID{"sec-2", "sec-3", "sec-1"} + for i, w := range wantOrder { + if got[i].ID != w { + t.Errorf("got[%d].ID = %q, want %q (full order: %+v)", i, got[i].ID, w, got) + } + } + if got[0].Score != 92 { + t.Errorf("top score = %v, want 92", got[0].Score) + } + if got[0].Reason == "" { + t.Error("top entry should carry a reason from the model") + } + if usage.LLMCalls != 1 { + t.Errorf("Usage.LLMCalls = %d, want 1", usage.LLMCalls) + } + if usage.CostUSD <= 0 { + t.Errorf("Usage.CostUSD = %v, want > 0", usage.CostUSD) + } +} + +// TestReRanker_EmptyInput: nil/empty candidate list short-circuits +// without an LLM call. +func TestReRanker_EmptyInput(t *testing.T) { + t.Parallel() + m := &rerankMock{replies: []string{`{"scores":[]}`}} + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + + got, usage, err := r.ReRank(context.Background(), "irrelevant", nil) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if got != nil { + t.Errorf("empty input should return nil slice, got %+v", got) + } + if usage.LLMCalls != 0 { + t.Errorf("empty input should issue no LLM calls, got %d", usage.LLMCalls) + } + if c := atomic.LoadInt32(&m.calls); c != 0 { + t.Errorf("empty input must NOT call the LLM, got %d calls", c) + } +} + +// TestReRanker_LLMFailure: transport error bubbles up, input order is +// preserved with score=0 so the caller never loses a candidate. +func TestReRanker_LLMFailure(t *testing.T) { + t.Parallel() + m := &rerankMock{err: errors.New("provider 500")} + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + + cands := sampleCandidates() + got, _, err := r.ReRank(context.Background(), "any query", cands) + if err == nil { + t.Fatal("expected transport error, got nil") + } + if len(got) != len(cands) { + t.Fatalf("LLM failure must preserve all candidates: got %d, want %d", len(got), len(cands)) + } + for i, c := range cands { + if got[i].ID != c.ID { + t.Errorf("got[%d].ID = %q, want %q (preserved input order)", i, got[i].ID, c.ID) + } + if got[i].Score != 0 { + t.Errorf("got[%d].Score = %v, want 0 on transport failure", i, got[i].Score) + } + } +} + +// TestReRanker_BadJSONExhaustsRetries: when all retry attempts return +// un-parseable JSON, returns input order + nil error (graceful +// degradation, matching runSelectionWithRetry behaviour). +func TestReRanker_BadJSONExhaustsRetries(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + "sorry, here's some prose instead of JSON", + "still talking, not JSON", + "and one more time", + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + r.MaxRetries = 2 // 1 initial + 2 retries = 3 attempts + + cands := sampleCandidates() + got, usage, err := r.ReRank(context.Background(), "any query", cands) + if err != nil { + t.Fatalf("parse-only failure must return nil error, got %v", err) + } + if c := atomic.LoadInt32(&m.calls); c != 3 { + t.Errorf("expected 3 LLM attempts (1 + 2 retries), got %d", c) + } + if usage.LLMCalls != 3 { + t.Errorf("usage.LLMCalls = %d, want 3 (all attempts counted)", usage.LLMCalls) + } + if len(got) != len(cands) { + t.Fatalf("parse failure must preserve all candidates: got %d", len(got)) + } + for i, c := range cands { + if got[i].ID != c.ID { + t.Errorf("got[%d].ID = %q, want %q (input order)", i, got[i].ID, c.ID) + } + if got[i].Score != 0 { + t.Errorf("got[%d].Score = %v, want 0 on parse failure", i, got[i].Score) + } + } +} + +// TestReRanker_BadJSONThenSuccess: a single bad reply followed by a +// good one returns the parsed scores. +func TestReRanker_BadJSONThenSuccess(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + "not json", + scoreReply( + rerankReplyItem{ID: "sec-1", Score: 50}, + rerankReplyItem{ID: "sec-2", Score: 80}, + rerankReplyItem{ID: "sec-3", Score: 20}, + ), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + got, usage, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if usage.LLMCalls != 2 { + t.Errorf("LLMCalls = %d, want 2 (1 failed + 1 ok)", usage.LLMCalls) + } + if got[0].ID != "sec-2" { + t.Errorf("top = %q, want sec-2 (highest score)", got[0].ID) + } +} + +// TestReRanker_UnknownIDDropped: when the model invents an ID, it is +// silently dropped from the output. The known IDs still surface. +func TestReRanker_UnknownIDDropped(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + scoreReply( + rerankReplyItem{ID: "sec-1", Score: 30}, + rerankReplyItem{ID: "sec-2", Score: 70}, + rerankReplyItem{ID: "sec-3", Score: 50}, + rerankReplyItem{ID: "sec-bogus", Score: 99, Reason: "hallucinated"}, + ), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + got, _, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if len(got) != 3 { + t.Fatalf("len(got) = %d, want 3 (bogus ID dropped)", len(got)) + } + for _, s := range got { + if s.ID == "sec-bogus" { + t.Errorf("hallucinated ID leaked into output: %+v", s) + } + } + if got[0].ID != "sec-2" { + t.Errorf("top = %q, want sec-2", got[0].ID) + } +} + +// TestReRanker_MissingIDsScoreZero: input IDs the model didn't score +// appear at the bottom with score=0. +func TestReRanker_MissingIDsScoreZero(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + // Model only scored sec-2; sec-1 and sec-3 are missing. + scoreReply(rerankReplyItem{ID: "sec-2", Score: 88}), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + got, _, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if len(got) != 3 { + t.Fatalf("len(got) = %d, want 3 (missing IDs must still surface)", len(got)) + } + if got[0].ID != "sec-2" || got[0].Score != 88 { + t.Errorf("top = %+v, want sec-2 / 88", got[0]) + } + // Missing IDs come back with score=0 in input order. sec-1 was input + // position 0 and sec-3 was input position 2, so we expect sec-1 then sec-3. + if got[1].ID != "sec-1" || got[1].Score != 0 { + t.Errorf("got[1] = %+v, want sec-1 / 0", got[1]) + } + if got[2].ID != "sec-3" || got[2].Score != 0 { + t.Errorf("got[2] = %+v, want sec-3 / 0", got[2]) + } +} + +// TestReRanker_DuplicateIDsInResponse: when the model returns the same +// ID twice the first wins and the duplicate is dropped. +func TestReRanker_DuplicateIDsInResponse(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + scoreReply( + rerankReplyItem{ID: "sec-1", Score: 60, Reason: "first"}, + rerankReplyItem{ID: "sec-1", Score: 10, Reason: "duplicate"}, + rerankReplyItem{ID: "sec-2", Score: 30}, + rerankReplyItem{ID: "sec-3", Score: 20}, + ), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + got, _, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if len(got) != 3 { + t.Fatalf("len(got) = %d, want 3", len(got)) + } + if got[0].ID != "sec-1" || got[0].Score != 60 || got[0].Reason != "first" { + t.Errorf("expected first occurrence to win, got %+v", got[0]) + } +} + +// TestReRanker_NegativeScoreClamped: a negative score from the model +// is clamped to 0 by ParseReRank. +func TestReRanker_NegativeScoreClamped(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{ + scoreReply( + rerankReplyItem{ID: "sec-1", Score: -5}, + rerankReplyItem{ID: "sec-2", Score: 30}, + rerankReplyItem{ID: "sec-3", Score: 0}, + ), + }, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + got, _, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + // Top should be sec-2 (30). sec-1 with score=-5 is clamped to 0; it + // will tie with sec-3 (0) and the stable-sort tiebreak puts sec-1 + // before sec-3 since that's the input order. + if got[0].ID != "sec-2" { + t.Errorf("top = %q, want sec-2", got[0].ID) + } + for _, s := range got { + if s.Score < 0 { + t.Errorf("negative score leaked through: %+v", s) + } + } +} + +// TestReRanker_PromptIncludesContent: the prompt actually carries the +// candidate content (otherwise re-rank is back to title-only). +func TestReRanker_PromptIncludesContent(t *testing.T) { + t.Parallel() + m := &rerankMock{ + replies: []string{scoreReply( + rerankReplyItem{ID: "sec-1", Score: 50}, + rerankReplyItem{ID: "sec-2", Score: 50}, + rerankReplyItem{ID: "sec-3", Score: 50}, + )}, + } + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + _, _, err := r.ReRank(context.Background(), "What was Apple's fiscal 2023 revenue?", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + if len(m.prompts) != 1 { + t.Fatalf("want 1 captured user prompt, got %d", len(m.prompts)) + } + prompt := m.prompts[0] + // All three IDs surfaced. + for _, id := range []string{"sec-1", "sec-2", "sec-3"} { + if !contains(prompt, "["+id+"]") { + t.Errorf("prompt missing ID marker %q", id) + } + } + // Content excerpt for sec-2 must appear (the prompt is title + body). + if !contains(prompt, "Apple's fiscal 2023 revenue was $383.3B") { + t.Error("prompt missing sec-2 content excerpt") + } + if !contains(prompt, "What was Apple's fiscal 2023 revenue?") { + t.Error("prompt missing user query") + } +} + +// TestReRanker_MaxContentCharsTruncates: when content exceeds the cap +// the prompt carries a truncated excerpt with an ellipsis. +func TestReRanker_MaxContentCharsTruncates(t *testing.T) { + t.Parallel() + m := &rerankMock{replies: []string{scoreReply(rerankReplyItem{ID: "sec-1", Score: 50})}} + r := retrieval.NewReRanker(m, "gemini-2.5-flash") + r.MaxContentChars = 20 + + longContent := "AAAAAAAAAA BBBBBBBBBB CCCCCCCCCC DDDDDDDDDD EEEEEEEEEE" + _, _, err := r.ReRank(context.Background(), "Q", []retrieval.SectionContent{ + {ID: tree.SectionID("sec-1"), Title: "T", Content: longContent}, + }) + if err != nil { + t.Fatalf("ReRank: %v", err) + } + prompt := m.prompts[0] + if !contains(prompt, "AAAAAAAAAA BBBBBBBBB") { + t.Errorf("prompt missing first 20 chars of long content") + } + if !contains(prompt, "…") { + t.Error("prompt missing ellipsis marker for truncation") + } + if contains(prompt, "EEEEEEEEEE") { + t.Error("prompt unexpectedly contained tail of long content") + } +} + +// TestReRanker_NilLLMNoOp: a re-ranker with a nil LLM client returns +// input order without panicking. This lets server wiring pass a stub +// for the disabled case. +func TestReRanker_NilLLMNoOp(t *testing.T) { + t.Parallel() + r := &retrieval.ReRanker{Model: "any"} + got, usage, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("ReRank with nil LLM: %v", err) + } + if len(got) != 3 { + t.Errorf("nil LLM should preserve input order, got %d entries", len(got)) + } + if usage.LLMCalls != 0 { + t.Errorf("nil LLM should issue no calls, got %d", usage.LLMCalls) + } +} + +// TestReRanker_NilReRankerNoOp: a nil *ReRanker is safe — callers can +// pass nil when re-rank is disabled. +func TestReRanker_NilReRankerNoOp(t *testing.T) { + t.Parallel() + var r *retrieval.ReRanker + got, usage, err := r.ReRank(context.Background(), "Q", sampleCandidates()) + if err != nil { + t.Fatalf("nil ReRanker: %v", err) + } + if len(got) != 3 { + t.Errorf("nil ReRanker should preserve input order, got %d entries", len(got)) + } + if usage.LLMCalls != 0 { + t.Errorf("nil ReRanker should issue no calls, got %d", usage.LLMCalls) + } +} + +// TestParseReRank_CodeFence: tolerant parsing matches the other +// JSON-mode parsers in this package. +func TestParseReRank_CodeFence(t *testing.T) { + t.Parallel() + raw := "```json\n" + scoreReply(rerankReplyItem{ID: "sec-1", Score: 75}) + "\n```" + got, err := retrieval.ParseReRank(raw) + if err != nil { + t.Fatalf("ParseReRank: %v", err) + } + if len(got) != 1 || got[0].ID != "sec-1" || got[0].Score != 75 { + t.Errorf("got %+v", got) + } +} + +// TestParseReRank_LeadingProse: leading prose ahead of the JSON object +// is stripped. +func TestParseReRank_LeadingProse(t *testing.T) { + t.Parallel() + raw := "Sure, here are the scores: " + scoreReply(rerankReplyItem{ID: "sec-1", Score: 10}) + got, err := retrieval.ParseReRank(raw) + if err != nil { + t.Fatalf("ParseReRank: %v", err) + } + if len(got) != 1 { + t.Errorf("got %+v", got) + } +} + +// TestParseReRank_Empty: empty / whitespace-only responses error so +// retry can fire. +func TestParseReRank_Empty(t *testing.T) { + t.Parallel() + if _, err := retrieval.ParseReRank(""); err == nil { + t.Error("empty input should parse-error so retry fires") + } + if _, err := retrieval.ParseReRank(" \n\n "); err == nil { + t.Error("whitespace input should parse-error") + } +} From a5c4d8bf48eb23220ab1e4e3c139e936638a63f8 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:11:43 +0100 Subject: [PATCH 2/3] feat(config): retrieval.rerank block + VLE_RETRIEVAL_RERANK_* env overrides Adds ReRankBlock under RetrievalConfig with Enabled / Model / MaxContentChars (default 2000) / TopK (default 0 = keep all) fields, plus matching VLE_RETRIEVAL_RERANK_ENABLED, _MODEL, _MAX_CONTENT_CHARS, and _TOP_K env overrides following the same shape as the existing planning + agentic + answer_span env handling. Re-rank is opt-in: defaults are Enabled=false / TopK=0 so existing deployments behave identically. Per-request `enable_rerank` body field (wired in the next commit) overrides this block when set. Validate rejects negative MaxContentChars and TopK so a stray env or YAML typo can't silently flip the model into "send no content" mode. Zero is valid for both: TopK=0 means keep all candidates, MaxContentChars=0 falls through to the ReRanker's compiled default. Four new tests: default values, env override happy path, env disable of a YAML-set true, env override rejects bad numeric values, and validation of negative inputs. --- pkg/config/config.go | 71 ++++++++++++++++++++++++ pkg/config/config_test.go | 112 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) diff --git a/pkg/config/config.go b/pkg/config/config.go index 28ad5ac..abfb8ce 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -204,6 +204,44 @@ type RetrievalConfig struct { AnswerSpan AnswerSpanBlock `yaml:"answer_span"` Answer AnswerBlock `yaml:"answer"` Planning PlanningBlock `yaml:"planning"` + ReRank ReRankBlock `yaml:"rerank"` +} + +// ReRankBlock configures the Phase 2.3 content-aware re-rank pass. +// +// When enabled, every /v1/query and /v1/answer request that returns +// candidate sections runs one extra LLM call: the candidates' first +// MaxContentChars chars of content are fed to the model with the +// query, and the model returns a per-section relevance score +// (0-100). Sections are reordered by score; if TopK > 0 the response +// is truncated to the top K. +// +// The pass is opt-in. Per-request `enable_rerank` body field +// overrides this block. +// +// Re-rank failures never drop sections — at worst the original +// strategy order is preserved and the request returns unchanged from +// the no-rerank path. See pkg/retrieval/rerank.go for the exact +// degradation contract. +type ReRankBlock struct { + // Enabled toggles re-rank at the server level. Default: false. + Enabled bool `yaml:"enabled"` + + // Model overrides the re-rank LLM model. Empty means use the + // request's model (which itself falls back to the engine default). + // Point this at a small/fast model — the re-rank prompt is short + // and shouldn't burn the flagship model's budget. + Model string `yaml:"model"` + + // MaxContentChars caps how many characters of each candidate's + // content are sent to the model. Default: 2000. + MaxContentChars int `yaml:"max_content_chars"` + + // TopK caps the number of sections kept after re-ranking. 0 means + // keep all candidates (re-rank only reorders). Useful when the + // strategy is configured to return a wide candidate list and the + // re-rank pass picks the focused top-k for synthesis. + TopK int `yaml:"top_k"` } // PlanningBlock configures Phase 2.1 query planning + Phase 2.2 multi-hop @@ -368,6 +406,11 @@ func Default() Config { CacheSize: 128, Decompose: true, }, + ReRank: ReRankBlock{ + Enabled: false, + MaxContentChars: 2000, + TopK: 0, + }, }, Ingest: IngestConfig{ GlobalLLMConcurrency: 12, @@ -556,6 +599,27 @@ func applyEnvOverrides(c *Config) { c.Retrieval.Planning.Decompose = false } } + if v := os.Getenv("VLE_RETRIEVAL_RERANK_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Retrieval.ReRank.Enabled = true + case "0", "false", "no", "off": + c.Retrieval.ReRank.Enabled = false + } + } + if v := os.Getenv("VLE_RETRIEVAL_RERANK_MODEL"); v != "" { + c.Retrieval.ReRank.Model = v + } + if v := os.Getenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Retrieval.ReRank.MaxContentChars = n + } + } + if v := os.Getenv("VLE_RETRIEVAL_RERANK_TOP_K"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Retrieval.ReRank.TopK = n + } + } } // Validate checks that required fields for the selected drivers are set. @@ -642,5 +706,12 @@ func (c Config) Validate() error { return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize) } + if c.Retrieval.ReRank.MaxContentChars < 0 { + return fmt.Errorf("retrieval.rerank.max_content_chars must be >= 0, got %d", c.Retrieval.ReRank.MaxContentChars) + } + if c.Retrieval.ReRank.TopK < 0 { + return fmt.Errorf("retrieval.rerank.top_k must be >= 0, got %d", c.Retrieval.ReRank.TopK) + } + return nil } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index fb60973..64ba7d3 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -46,11 +46,123 @@ func TestDefaultValues(t *testing.T) { if !cfg.Retrieval.Planning.Decompose { t.Error("retrieval.planning.decompose should default to true (when planning is enabled)") } + if cfg.Retrieval.ReRank.Enabled { + t.Error("retrieval.rerank.enabled should default to false (opt-in)") + } + if cfg.Retrieval.ReRank.MaxContentChars != 2000 { + t.Errorf("retrieval.rerank.max_content_chars = %d, want 2000", cfg.Retrieval.ReRank.MaxContentChars) + } + if cfg.Retrieval.ReRank.TopK != 0 { + t.Errorf("retrieval.rerank.top_k = %d, want 0 (keep all)", cfg.Retrieval.ReRank.TopK) + } if cfg.Log.Level != "info" { t.Errorf("log.level = %q, want info", cfg.Log.Level) } } +func TestReRankEnvOverride(t *testing.T) { + // Not parallel — mutates env. Restore on exit. + prevEnabled := os.Getenv("VLE_RETRIEVAL_RERANK_ENABLED") + prevModel := os.Getenv("VLE_RETRIEVAL_RERANK_MODEL") + prevMax := os.Getenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS") + prevTopK := os.Getenv("VLE_RETRIEVAL_RERANK_TOP_K") + defer func() { + os.Setenv("VLE_RETRIEVAL_RERANK_ENABLED", prevEnabled) + os.Setenv("VLE_RETRIEVAL_RERANK_MODEL", prevModel) + os.Setenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS", prevMax) + os.Setenv("VLE_RETRIEVAL_RERANK_TOP_K", prevTopK) + }() + + os.Setenv("VLE_RETRIEVAL_RERANK_ENABLED", "true") + os.Setenv("VLE_RETRIEVAL_RERANK_MODEL", "gemini-2.0-flash") + os.Setenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS", "1500") + os.Setenv("VLE_RETRIEVAL_RERANK_TOP_K", "3") + + cfg := Default() + applyEnvOverrides(&cfg) + + if !cfg.Retrieval.ReRank.Enabled { + t.Error("VLE_RETRIEVAL_RERANK_ENABLED=true should enable rerank") + } + if cfg.Retrieval.ReRank.Model != "gemini-2.0-flash" { + t.Errorf("rerank model = %q, want gemini-2.0-flash", cfg.Retrieval.ReRank.Model) + } + if cfg.Retrieval.ReRank.MaxContentChars != 1500 { + t.Errorf("rerank max_content_chars = %d, want 1500", cfg.Retrieval.ReRank.MaxContentChars) + } + if cfg.Retrieval.ReRank.TopK != 3 { + t.Errorf("rerank top_k = %d, want 3", cfg.Retrieval.ReRank.TopK) + } +} + +func TestReRankEnvOverrideDisable(t *testing.T) { + // Toggle off via env: start from a config that defaults to false, + // then set =false explicitly; verify the path executes (not just + // that the default value is preserved). + prev := os.Getenv("VLE_RETRIEVAL_RERANK_ENABLED") + defer os.Setenv("VLE_RETRIEVAL_RERANK_ENABLED", prev) + + cfg := Default() + cfg.Retrieval.ReRank.Enabled = true // simulate a YAML-set true + os.Setenv("VLE_RETRIEVAL_RERANK_ENABLED", "false") + applyEnvOverrides(&cfg) + if cfg.Retrieval.ReRank.Enabled { + t.Error("VLE_RETRIEVAL_RERANK_ENABLED=false should disable rerank even when YAML set it true") + } +} + +func TestReRankEnvOverrideRejectsBad(t *testing.T) { + // Garbage env values should be rejected, not silently zero the field. + prevMax := os.Getenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS") + prevTopK := os.Getenv("VLE_RETRIEVAL_RERANK_TOP_K") + defer func() { + os.Setenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS", prevMax) + os.Setenv("VLE_RETRIEVAL_RERANK_TOP_K", prevTopK) + }() + + os.Setenv("VLE_RETRIEVAL_RERANK_MAX_CONTENT_CHARS", "not-a-number") + os.Setenv("VLE_RETRIEVAL_RERANK_TOP_K", "abc") + + cfg := Default() + applyEnvOverrides(&cfg) + if cfg.Retrieval.ReRank.MaxContentChars != 2000 { + t.Errorf("bad max_content_chars env should preserve default, got %d", cfg.Retrieval.ReRank.MaxContentChars) + } + if cfg.Retrieval.ReRank.TopK != 0 { + t.Errorf("bad top_k env should preserve default, got %d", cfg.Retrieval.ReRank.TopK) + } +} + +func TestValidateReRankNegatives(t *testing.T) { + t.Parallel() + + // Negative max_content_chars rejected. + cfg := Default() + cfg.Database.URL = "postgres://localhost/test" + cfg.Retrieval.ReRank.MaxContentChars = -1 + if err := cfg.Validate(); err == nil { + t.Error("negative max_content_chars should fail validation") + } + + // Negative top_k rejected. + cfg2 := Default() + cfg2.Database.URL = "postgres://localhost/test" + cfg2.Retrieval.ReRank.TopK = -1 + if err := cfg2.Validate(); err == nil { + t.Error("negative top_k should fail validation") + } + + // Zero on both is valid (TopK=0 means "keep all", MaxContentChars=0 + // means "use default"). + cfg3 := Default() + cfg3.Database.URL = "postgres://localhost/test" + cfg3.Retrieval.ReRank.MaxContentChars = 0 + cfg3.Retrieval.ReRank.TopK = 0 + if err := cfg3.Validate(); err != nil { + t.Errorf("zero rerank values should pass validation: %v", err) + } +} + func TestPlanningEnvOverride(t *testing.T) { // Not parallel — mutates env. Restore on exit. prevEnabled := os.Getenv("VLE_RETRIEVAL_PLANNING_ENABLED") From 478e5788d7d0073bffcd5d6528c45851f0afecc3 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:17:31 +0100 Subject: [PATCH 3/3] feat(api): wire content-aware re-rank into /v1/query + /v1/answer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the per-request `enable_rerank` body field (pointer, so absent vs. explicit false are distinguishable) on both QueryRequest and AnswerRequest. When true, or when retrieval.rerank.enabled is on at config level, the engine runs one extra LLM call after candidate content is loaded but before answer-span extraction / synthesis. The pass scores each candidate on 0-100 and reorders the slice descending by score. retrieval.rerank.top_k (when non-zero) truncates the survivors. Sections that ran through re-rank surface a `score` field in /v1/query.sections[] and /v1/answer.citations[]. Failure semantics carried through from pkg/retrieval/rerank.go: - No LLM client or no ReRanker wired → re-rank silently skipped. - Empty candidate slice → no LLM call, no scores. - Transport / parse failure → warn-log, strategy order preserved. - Hallucinated IDs dropped; missing IDs surface unscored at the bottom. No candidate is ever lost. For /v1/answer the ReRanker's Usage is accumulated into totalUsage so the response's usage block reflects the real cost; the synthesis prompt then sees the post-rerank top-k. cmd/engine/main.go instantiates a ReRanker against the configured LLM whenever a client is wired (mirrors the planner pattern) — opt-in behaviour is purely flag-driven from there. Boot logs the model + content cap + top-k when re-rank is enabled at config level. openapi.yaml: enable_rerank field on QueryRequest + AnswerRequest; score field on QuerySection + AnswerCitation. Schema descriptions document the 0-100 scale and the "never drops sections" envelope. config.example.yaml: documented retrieval.rerank block with cost + opt-in commentary so operators understand why this is off by default. --- cmd/engine/main.go | 24 ++++++ config.example.yaml | 30 ++++++++ internal/api/server.go | 169 ++++++++++++++++++++++++++++++++++++++++- openapi.yaml | 38 +++++++++ 4 files changed, 260 insertions(+), 1 deletion(-) diff --git a/cmd/engine/main.go b/cmd/engine/main.go index e3dd112..6be4ee1 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -129,6 +129,28 @@ func run() error { } } + // ReRanker: opt-in Phase 2.3. Instantiated whenever an LLM client + // is wired — the per-request `enable_rerank` body field overrides + // the config, mirroring the planner pattern. + var reRanker *retrieval.ReRanker + if llmClient != nil { + reRankModel := cfg.Retrieval.ReRank.Model + if reRankModel == "" { + reRankModel = modelFor(cfg.LLM) + } + reRanker = retrieval.NewReRanker(llmClient, reRankModel) + if cfg.Retrieval.ReRank.MaxContentChars > 0 { + reRanker.MaxContentChars = cfg.Retrieval.ReRank.MaxContentChars + } + if cfg.Retrieval.ReRank.Enabled { + logger.Info("retrieval: rerank enabled", + "model", reRankModel, + "max_content_chars", reRanker.MaxContentChars, + "top_k", cfg.Retrieval.ReRank.TopK, + ) + } + } + pipeline := ingest.NewPipeline(ingest.Pipeline{ DB: pool, Storage: store, @@ -157,6 +179,8 @@ func run() error { Answer: cfg.Retrieval.Answer, Planner: planner, Planning: cfg.Retrieval.Planning, + ReRanker: reRanker, + ReRank: cfg.Retrieval.ReRank, } srv := &http.Server{ diff --git a/config.example.yaml b/config.example.yaml index ffb43a1..53b5a79 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -152,6 +152,36 @@ retrieval: # isolation (plan returned, but retrieval uses the original query). decompose: true + # rerank: Phase 2.3 content-aware re-rank pass. After the retrieval + # strategy returns candidate sections and their content is loaded, + # one extra LLM call scores each section (0-100) against the query + # and the engine reorders descending by score. + # + # This is the safety net for the case where the strategy reasoned + # over title + summary + HyDE candidate questions and got fooled + # by surface-level matches. Reading the actual content closes that + # gap. ~3-5k input tokens per query on gemini-2.5-flash; ~$0.0003 + # per call at typical rates. + # + # OPT-IN. Default disabled. Per-request `enable_rerank` body field + # overrides this block. Failures never drop sections — at worst the + # strategy's order is preserved. + rerank: + enabled: false + # Override the re-rank model; empty inherits the request's model + # (or the engine default). Keep this on a small/fast model — the + # re-rank prompt is short and shouldn't burn the flagship model. + model: "" + # Per-candidate content budget. Higher = more context for the + # model to judge with, lower = tighter cost. 2000 chars ≈ 500 + # tokens, comfortable for typical section sizes. + max_content_chars: 2000 + # Truncate the post-rerank candidate list to the top K. 0 means + # keep all candidates (re-rank only reorders). Useful when the + # strategy returns a wide candidate list and you want the + # re-rank pass to do the final selection. + top_k: 0 + ingest: # The summarize and HyDE stages run concurrently. This caps the total # number of LLM calls in flight across both stages combined, so the diff --git a/internal/api/server.go b/internal/api/server.go index 38dadce..2a4f557 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -73,6 +73,16 @@ type Deps struct { // `enable_planning` field on /v1/query and /v1/answer overrides // Planning.Enabled. Planning config.PlanningBlock + + // ReRanker runs Phase 2.3 content-aware re-rank on the strategy's + // candidate sections (one extra LLM call per query). Nil disables + // re-rank even when a request opts in via `enable_rerank`. + ReRanker *retrieval.ReRanker + + // ReRank carries the server-side re-rank config. The body-level + // `enable_rerank` field on /v1/query and /v1/answer overrides + // ReRank.Enabled. TopK truncates the post-rerank candidate list. + ReRank config.ReRankBlock } // Router builds and returns the chi router wired with v1 routes. @@ -400,6 +410,10 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { // planner. A pointer so we can distinguish "absent" from // "explicit false" — absent falls back to the server config. EnablePlanning *bool `json:"enable_planning"` + // EnableReRank opts this request into the Phase 2.3 + // content-aware re-rank pass. Pointer for the same reason as + // EnablePlanning. Overrides retrieval.rerank.enabled. + EnableReRank *bool `json:"enable_rerank"` } if err := json.NewDecoder(r.Body).Decode(&body); err != nil { writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) @@ -471,6 +485,15 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { enriched = append(enriched, sectionWithContent{sec: sec, content: content}) } + // Optional: content-aware re-rank pass. One LLM call that scores + // each loaded section against the query and re-orders the slice + // descending by score. TopK truncates the survivors. Failures + // never drop sections — at worst the strategy's order is + // preserved (see retrieval.ReRanker.ReRank). + if d.reRankEnabled(body.EnableReRank) { + enriched, _ = d.runReRank(r.Context(), enriched, body.Query, body.Model) + } + // Optional: per-section answer-span extraction. Opt-in via config — // one LLM call per returned section. Failures are non-fatal; the // section is returned without a span. @@ -499,11 +522,18 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { } // sectionWithContent bundles a tree section with its loaded content -// and an optional answer-span. Used by /v1/query and /v1/answer. +// and optional re-rank score / answer-span. Used by /v1/query and +// /v1/answer. type sectionWithContent struct { sec *tree.Section content string span *retrieval.AnswerSpan + + // hasScore reports whether score was populated by a re-rank pass. + // Distinct from score == 0 since 0 is a legitimate score the + // model can return. + hasScore bool + score float64 } // sectionWithContentToMap renders the section as the API map shape. @@ -528,6 +558,9 @@ func sectionWithContentToMap(e sectionWithContent) map[string]any { if e.span != nil { s["answer_span"] = e.span } + if e.hasScore { + s["score"] = e.score + } return s } @@ -620,6 +653,10 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { // EnablePlanning opts this request into the Phase 2.1 query // planner. See handleQuery for the same field's semantics. EnablePlanning *bool `json:"enable_planning"` + // EnableReRank opts this request into the Phase 2.3 re-rank + // pass. Synthesis then sees the re-ranked top-k. Overrides + // retrieval.rerank.enabled. + EnableReRank *bool `json:"enable_rerank"` } if err := json.NewDecoder(r.Body).Decode(&body); err != nil { writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) @@ -699,6 +736,16 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { enriched = append(enriched, sectionWithContent{sec: sec, content: content}) } + // Optional: content-aware re-rank before synthesis sees the + // evidence. When TopK is set the synthesis prompt only ever sees + // the post-rerank top-k, keeping the answer focused on the + // best-evidence sections. + if d.reRankEnabled(body.EnableReRank) { + var reRankUsage retrieval.Usage + enriched, reRankUsage = d.runReRank(r.Context(), enriched, body.Query, body.Model) + totalUsage.Add(reRankUsage) + } + // Always extract spans for /v1/answer — they ground each citation. spanExtractor := d.spanExtractor(body.Model) runSpansConcurrent(r.Context(), spanExtractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger) @@ -747,6 +794,9 @@ func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { c["quote_end"] = e.span.End } } + if e.hasScore { + c["score"] = e.score + } citations = append(citations, c) } @@ -1132,6 +1182,123 @@ func (d Deps) shouldDecompose(plan *retrieval.Plan) bool { return d.Planning.Decompose } +// --- re-rank helpers --- + +// reRankEnabled reports whether the request should go through the +// re-rank pass. The per-request body field (when present) wins over +// the server-side config; a nil body field falls back to the config. +// +// Returns false when no LLM client is wired or when no ReRanker is +// configured, regardless of intent — re-rank without an LLM is +// physically impossible. +func (d Deps) reRankEnabled(bodyOverride *bool) bool { + if d.ReRanker == nil || d.LLM == nil { + return false + } + if bodyOverride != nil { + return *bodyOverride + } + return d.ReRank.Enabled +} + +// runReRank executes the re-rank pass over the loaded section slice +// and returns the reordered slice plus the LLM Usage spent. On any +// failure the original slice is returned (with the same hasScore +// values it had on input — i.e. unchanged) so the caller never has +// to think about partial state. The error is LOGGED, not returned — +// re-rank is best-effort and a failure must never abort the request. +// +// requestModel is the model the request asked for. When the +// ReRanker has its own Model set (the config-level override), that +// wins; the request model is the fall-through. +func (d Deps) runReRank(ctx context.Context, enriched []sectionWithContent, query, requestModel string) ([]sectionWithContent, retrieval.Usage) { + if d.ReRanker == nil || d.LLM == nil || len(enriched) == 0 { + return enriched, retrieval.Usage{} + } + + // Apply the model fall-through: config override → request model → + // engine default. We don't mutate d.ReRanker since Deps is shared + // across requests; instead build a shallow copy with the chosen + // model. This is the same pattern spanExtractor() uses. + ranker := *d.ReRanker + if ranker.Model == "" { + if requestModel != "" { + ranker.Model = requestModel + } else { + ranker.Model = d.LLMModel + } + } + + candidates := make([]retrieval.SectionContent, len(enriched)) + for i, e := range enriched { + candidates[i] = retrieval.SectionContent{ + ID: e.sec.ID, + Title: e.sec.Title, + Content: e.content, + } + } + + scored, usage, err := ranker.ReRank(ctx, query, candidates) + if err != nil { + if d.Logger != nil { + d.Logger.Warn("rerank: failed; preserving strategy order", "err", err) + } + // ReRank returns input order on error so we *could* apply it + // (it'd just stamp score=0 on everything). Skip — the caller + // shouldn't see score=0 on every section when re-rank + // physically failed. + return enriched, usage + } + if len(scored) == 0 { + return enriched, usage + } + + reordered := reorderByScore(enriched, scored) + if d.ReRank.TopK > 0 && len(reordered) > d.ReRank.TopK { + reordered = reordered[:d.ReRank.TopK] + } + return reordered, usage +} + +// reorderByScore takes the loaded section slice and the model's +// scored output (already sorted descending by score by the +// ReRanker), and returns a new slice in the same order as scored +// with each entry carrying the per-section score. +// +// Defensive: every input enriched section appears in the output +// exactly once, in the order dictated by scored. If scored is +// missing an input ID (shouldn't happen — ReRank's contract is to +// surface every input ID), that section is appended at the end with +// hasScore=false so the response stays complete. +func reorderByScore(enriched []sectionWithContent, scored []retrieval.ScoredSection) []sectionWithContent { + byID := make(map[tree.SectionID]int, len(enriched)) + for i, e := range enriched { + byID[e.sec.ID] = i + } + + out := make([]sectionWithContent, 0, len(enriched)) + taken := make([]bool, len(enriched)) + for _, s := range scored { + idx, ok := byID[s.ID] + if !ok || taken[idx] { + continue + } + taken[idx] = true + e := enriched[idx] + e.hasScore = true + e.score = s.Score + out = append(out, e) + } + // Append anything ReRank didn't surface — invariant says this + // should be empty, but a defence-in-depth check costs nothing. + for i, e := range enriched { + if !taken[i] { + out = append(out, e) + } + } + return out +} + // writePlanHints appends a short, model-readable "Planner notes" block // describing the structured plan. Synthesis uses this to orient itself // before reading the evidence. diff --git a/openapi.yaml b/openapi.yaml index f215556..8b35548 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -462,6 +462,19 @@ components: retrieval fans out one selection call per sub-question and unions the results. Overrides the server's `retrieval.planning.enabled` setting for this request only. + enable_rerank: + type: boolean + description: | + Opt this request into the Phase 2.3 content-aware re-rank + pass. After the retrieval strategy returns candidate + section IDs and the engine loads their content, one extra + LLM call scores each section (0-100) against the query and + sections are reordered descending by score. When + `retrieval.rerank.top_k` is set on the server, only the + top-k sections survive. Failures preserve the strategy's + original order — re-rank never drops sections. Overrides + the server's `retrieval.rerank.enabled` setting for this + request only. QueryResponse: type: object @@ -513,6 +526,14 @@ components: description: Full section content from storage. answer_span: $ref: "#/components/schemas/AnswerSpan" + score: + type: number + description: | + Re-rank relevance score on a 0-100 scale, populated only + when the request opted into the Phase 2.3 re-rank pass + (`enable_rerank`) or the server has + `retrieval.rerank.enabled=true`. Sections are returned + sorted descending by score. Omitted when no re-rank ran. AnswerSpan: type: object @@ -560,6 +581,14 @@ components: full semantics. When enabled, the synthesis prompt also sees the planner's structured intent and entity hints, and the response carries a top-level `plan` field. + enable_rerank: + type: boolean + description: | + Opt this request into the Phase 2.3 content-aware re-rank + pass. See QueryRequest.enable_rerank for full semantics. + When the pass runs, the synthesis prompt sees the + re-ranked top-k (capped by `retrieval.rerank.top_k`), and + each citation in the response carries a `score` field. AnswerResponse: type: object @@ -647,6 +676,9 @@ components: found one). `quote_start`/`quote_end` give byte offsets into the source section's content. `page_start`/`page_end` are the section's page range — omitted for non-paginated formats. + `score` carries the re-rank relevance score on a 0-100 scale, + present only when the request opted into the Phase 2.3 + re-rank pass. properties: section_id: type: string @@ -662,3 +694,9 @@ components: type: integer page_end: type: integer + score: + type: number + description: | + Re-rank relevance score on a 0-100 scale. Omitted when no + re-rank ran. Higher means the section is more directly + relevant to the query.