From 39e677fb14976935f31a4058032dbe7ebae17599 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 01:41:33 +0100 Subject: [PATCH] feat: answer-span extraction + /v1/answer endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1.1 + 3.3 of the engine plan: turns the whitepaper's "answer has a verbatim quote with a page number" claim into two demonstrable, regulator-defensible features. pkg/retrieval/span.go - SpanExtractor.Extract(ctx, content, query) runs one LLM call, asks for the SHORTEST verbatim quote from the section that answers the query, returns AnswerSpan{Start, End, Text} with byte offsets back into content. - Two-pass locator: exact substring first, then whitespace- normalised match for content with weird linebreaks/spacing. - When the model paraphrases despite the verbatim rule, offsets are sentinel (-1, -1) but Text is preserved so callers can flag. - Schema-validated JSON output ({found, quote}) with the same tolerant parser pattern as ParseSelection (code fences, leading prose). internal/api/server.go - Deps gains LLM (llmgate.Client), LLMModel (default fallback), AnswerSpan + Answer config blocks. cmd/engine/main.go wires them. - /v1/query: when retrieval.answer_span.enabled, every returned section gets an `answer_span` field. Bounded concurrency (config) via a semaphore; failures are logged and dropped. - POST /v1/answer (NEW): single round-trip runs retrieval + per-section span extraction + a synthesis LLM call, returns {answer, citations:[{section_id, title, page_start, page_end, quote, quote_start, quote_end}], strategy, model, usage, elapsed_ms}. The model is instructed to cite by section_id. pkg/config/config.go - RetrievalConfig.AnswerSpan + .Answer blocks; defaults match behaviour: AnswerSpan disabled, Answer.MaxSections=5, Answer.MaxAnswerTokens=1024. 
- VLE_RETRIEVAL_ANSWER_SPAN_ENABLED / _MODEL / _MAX_CONCURRENCY and VLE_RETRIEVAL_ANSWER_MODEL / _MAX_SECTIONS env overrides follow the existing pattern. openapi.yaml - /v1/answer endpoint + AnswerRequest / AnswerResponse / AnswerCitation / AnswerSpan schemas. answer_span added as an optional field on QuerySection (omitempty in JSON). Tests: - pkg/retrieval/span_test.go covers the verbatim path, the not-found path, whitespace-normalised location, the hallucinated-quote sentinel-offset path, empty inputs, code-fence stripping, leading-prose stripping, and locateQuote edge cases. All pass. - go build ./..., go vet ./..., go test ./... — all green. --- cmd/engine/main.go | 33 +++- config.example.yaml | 22 +++ internal/api/server.go | 383 +++++++++++++++++++++++++++++++++++-- openapi.yaml | 128 +++++++++++++ pkg/config/config.go | 71 +++++++ pkg/retrieval/span.go | 214 +++++++++++++++++++++ pkg/retrieval/span_test.go | 172 +++++++++++++++++ 7 files changed, 998 insertions(+), 25 deletions(-) create mode 100644 pkg/retrieval/span.go create mode 100644 pkg/retrieval/span_test.go diff --git a/cmd/engine/main.go b/cmd/engine/main.go index c63f2b6..ccaab06 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -124,13 +124,17 @@ func run() error { q.Register(queue.KindIngestDocument, pipeline.Handler()) deps := api.Deps{ - Logger: logger, - DB: pool, - Storage: store, - Queue: q, - Strategy: strategy, - Version: version, - MultiDoc: multiDoc, + Logger: logger, + DB: pool, + Storage: store, + Queue: q, + Strategy: strategy, + Version: version, + MultiDoc: multiDoc, + LLM: llmClient, + LLMModel: modelFor(cfg.LLM), + AnswerSpan: cfg.Retrieval.AnswerSpan, + Answer: cfg.Retrieval.Answer, } srv := &http.Server{ @@ -228,6 +232,21 @@ func buildQueue(c config.QueueConfig, dbURL string) (queue.Queue, error) { } } +// modelFor returns the configured chat/general-purpose model name for +// the selected LLM driver. Used as a fallback when an API request +// omits an explicit model. 
+func modelFor(c config.LLMConfig) string { + switch c.Driver { + case "anthropic": + return c.Anthropic.Model + case "openai": + return c.OpenAI.Model + case "gemini": + return c.Gemini.Model + } + return "" +} + func buildLLM(c config.LLMConfig) (llmgate.Client, error) { switch c.Driver { case "anthropic": diff --git a/config.example.yaml b/config.example.yaml index 2325315..6faf264 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -107,6 +107,28 @@ retrieval: # doesn't own, so the model knows what else exists in the document. include_sibling_breadcrumbs: true + # answer_span: when enabled, every section returned by /v1/query gets an + # extra `answer_span` field carrying the verbatim quote the model judged + # most relevant to the query, plus byte offsets back into the section's + # content. Costs one LLM call per returned section. Opt-in by default. + answer_span: + enabled: false + # Override the model used for span extraction; empty inherits the + # request's model. Keep this on a cheap/fast model — the call is + # short and runs once per returned section. + model: "" + max_concurrency: 4 + max_quote_len: 400 + + # answer: /v1/answer endpoint configuration. The endpoint runs + # retrieval + per-section span extraction + a synthesis LLM call, + # returning {answer, citations:[{section_id, page_start, page_end, quote}]}. + answer: + # Override the synthesis-call model; empty inherits the request's model. + model: "" + max_sections: 5 + max_answer_tokens: 1024 + ingest: # The summarize and HyDE stages run concurrently. 
This caps the total # number of LLM calls in flight across both stages combined, so the diff --git a/internal/api/server.go b/internal/api/server.go index bae923c..86134ee 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -5,18 +5,23 @@ package api import ( + "context" "encoding/json" "errors" + "fmt" "io" "log/slog" "net/http" "strconv" "strings" + "sync" "time" "github.com/go-chi/chi/v5" "github.com/go-chi/chi/v5/middleware" + "github.com/hallelx2/llmgate" + "github.com/hallelx2/vectorless-engine/pkg/config" "github.com/hallelx2/vectorless-engine/pkg/db" "github.com/hallelx2/vectorless-engine/pkg/ingest" "github.com/hallelx2/vectorless-engine/pkg/queue" @@ -44,6 +49,20 @@ type Deps struct { // MultiDoc is the multi-document query dispatcher. If nil, the // /v1/query/multi endpoint returns 501. MultiDoc *retrieval.MultiDoc + + // LLM is the shared llmgate client used by handlers that issue + // LLM calls outside the retrieval strategy (answer-span extraction, + // /v1/answer synthesis). Nil disables those handlers (the endpoints + // return 501). + LLM llmgate.Client + + // LLMModel is the default model name. Per-request overrides win. + LLMModel string + + // AnswerSpan / Answer hold the relevant config blocks. Default + // values (AnswerSpan disabled, Answer.MaxSections=5) are safe. + AnswerSpan config.AnswerSpanBlock + Answer config.AnswerBlock } // Router builds and returns the chi router wired with v1 routes. 
@@ -69,6 +88,7 @@ func Router(d Deps) http.Handler { r.Get("/sections/{id}", d.handleGetSection) r.Post("/query", d.handleQuery) r.Post("/query/multi", d.handleQueryMulti) + r.Post("/answer", d.handleAnswer) }) r.Post("/internal/jobs/{kind}", d.handleQueueWebhook) @@ -410,7 +430,7 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { ids = ids[:body.MaxSections] } - sections := make([]map[string]any, 0, len(ids)) + enriched := make([]sectionWithContent, 0, len(ids)) for _, id := range ids { sec := t.FindByID(id) if sec == nil { @@ -425,36 +445,363 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) { content = string(raw) } } - s := map[string]any{ - "id": sec.ID, - "parent_id": sec.ParentID, - "title": sec.Title, - "summary": sec.Summary, - "token_count": sec.TokenCount, - "content": content, + enriched = append(enriched, sectionWithContent{sec: sec, content: content}) + } + + // Optional: per-section answer-span extraction. Opt-in via config — + // one LLM call per returned section. Failures are non-fatal; the + // section is returned without a span. + if d.AnswerSpan.Enabled && d.LLM != nil { + extractor := d.spanExtractor(body.Model) + runSpansConcurrent(r.Context(), extractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger) + } + + sections := make([]map[string]any, 0, len(enriched)) + for _, e := range enriched { + sections = append(sections, sectionWithContentToMap(e)) + } + + writeJSON(w, http.StatusOK, map[string]any{ + "document_id": body.DocumentID, + "query": body.Query, + "strategy": d.Strategy.Name(), + "model": body.Model, + "sections": sections, + "elapsed_ms": time.Since(started).Milliseconds(), + }) +} + +// sectionWithContent bundles a tree section with its loaded content +// and an optional answer-span. Used by /v1/query and /v1/answer. 
+type sectionWithContent struct { + sec *tree.Section + content string + span *retrieval.AnswerSpan +} + +// sectionWithContentToMap renders the section as the API map shape. +func sectionWithContentToMap(e sectionWithContent) map[string]any { + s := map[string]any{ + "id": e.sec.ID, + "parent_id": e.sec.ParentID, + "title": e.sec.Title, + "summary": e.sec.Summary, + "token_count": e.sec.TokenCount, + "content": e.content, + } + if e.sec.PageStart > 0 { + s["page_start"] = e.sec.PageStart + } + if e.sec.PageEnd > 0 { + s["page_end"] = e.sec.PageEnd + } + if len(e.sec.CandidateQuestions) > 0 { + s["candidate_questions"] = e.sec.CandidateQuestions + } + if e.span != nil { + s["answer_span"] = e.span + } + return s +} + +// spanExtractor builds a SpanExtractor honouring the configured model +// override, with a fall-through to the request's model then Deps default. +func (d Deps) spanExtractor(requestModel string) *retrieval.SpanExtractor { + model := d.AnswerSpan.Model + if model == "" { + model = requestModel + } + if model == "" { + model = d.LLMModel + } + ext := retrieval.NewSpanExtractor(d.LLM, model) + if d.AnswerSpan.MaxQuoteLen > 0 { + ext.MaxQuoteLen = d.AnswerSpan.MaxQuoteLen + } + return ext +} + +// runSpansConcurrent fans out span extraction across secs with a +// max-concurrency semaphore. Each extraction's outcome is written back +// into the matching slot's `span` field. Errors are logged and dropped +// — span extraction is best-effort. 
+func runSpansConcurrent(ctx context.Context, extractor *retrieval.SpanExtractor, query string, secs []sectionWithContent, maxConcurrency int, logger *slog.Logger) { + if maxConcurrency <= 0 { + maxConcurrency = 4 + } + sem := make(chan struct{}, maxConcurrency) + var wg sync.WaitGroup + for i := range secs { + i := i + if strings.TrimSpace(secs[i].content) == "" { + continue } - if sec.PageStart > 0 { - s["page_start"] = sec.PageStart + wg.Add(1) + go func() { + defer wg.Done() + select { + case sem <- struct{}{}: + defer func() { <-sem }() + case <-ctx.Done(): + return + } + span, _, err := extractor.Extract(ctx, secs[i].content, query) + if err != nil { + if logger != nil { + logger.Warn("answer-span: extract failed", "section_id", secs[i].sec.ID, "err", err) + } + return + } + secs[i].span = span + }() + } + wg.Wait() +} + +// handleAnswer runs retrieval + per-section answer-span extraction + +// a synthesis LLM call, returning a quote-grounded answer plus +// citations in a single round-trip. This is the most regulator- +// defensible thing the engine can produce — every citation carries a +// section ID, page range (when known), and the verbatim quote the +// answer relies on. +// +// Body: { document_id, query, model?, max_tokens?, reserved_for_prompt?, +// max_parallel_calls?, max_sections?, max_answer_tokens? }. +// Response: { document_id, query, answer, citations: +// +// [{section_id, title, page_start, page_end, quote}], strategy, +// model, usage, elapsed_ms }. 
+func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) { + if d.LLM == nil { + writeErr(w, http.StatusNotImplemented, "answer endpoint requires an LLM client") + return + } + if d.Strategy == nil { + writeErr(w, http.StatusServiceUnavailable, "no retrieval strategy configured") + return + } + + var body struct { + DocumentID tree.DocumentID `json:"document_id"` + Query string `json:"query"` + Model string `json:"model"` + MaxTokens int `json:"max_tokens"` + ReservedForPrompt int `json:"reserved_for_prompt"` + MaxParallelCalls int `json:"max_parallel_calls"` + MaxSections int `json:"max_sections"` + MaxAnswerTokens int `json:"max_answer_tokens"` + } + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error()) + return + } + if body.DocumentID == "" || body.Query == "" { + writeErr(w, http.StatusBadRequest, "document_id and query are required") + return + } + + t, err := d.DB.LoadTree(r.Context(), body.DocumentID, standaloneOrgID, "") + if err != nil { + if errors.Is(err, db.ErrNotFound) { + writeErr(w, http.StatusNotFound, "document not found") + return } - if sec.PageEnd > 0 { - s["page_end"] = sec.PageEnd + writeErr(w, http.StatusInternalServerError, err.Error()) + return + } + + budget := retrieval.ContextBudget{ + ModelName: body.Model, + MaxTokens: body.MaxTokens, + ReservedForPrompt: body.ReservedForPrompt, + MaxParallelCalls: body.MaxParallelCalls, + } + if budget.MaxTokens == 0 { + budget.MaxTokens = 100000 + } + if budget.ReservedForPrompt == 0 { + budget.ReservedForPrompt = 4000 + } + if budget.MaxParallelCalls == 0 { + budget.MaxParallelCalls = 8 + } + + started := time.Now() + totalUsage := retrieval.Usage{} + + var ids []tree.SectionID + var retrievalUsage retrieval.Usage + if cs, ok := d.Strategy.(retrieval.CostStrategy); ok { + res, err := cs.SelectWithCost(r.Context(), t, body.Query, budget) + if err != nil { + writeErr(w, http.StatusInternalServerError, "retrieval 
failed: "+err.Error()) + return + } + ids, retrievalUsage = res.SelectedIDs, res.Usage + } else { + picks, err := d.Strategy.Select(r.Context(), t, body.Query, budget) + if err != nil { + writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error()) + return + } + ids = picks + } + totalUsage.Add(retrievalUsage) + + maxSections := body.MaxSections + if maxSections <= 0 { + maxSections = d.Answer.MaxSections + } + if maxSections <= 0 { + maxSections = 5 + } + if len(ids) > maxSections { + ids = ids[:maxSections] + } + + // Load each section's content. + enriched := make([]sectionWithContent, 0, len(ids)) + for _, id := range ids { + sec := t.FindByID(id) + if sec == nil { + continue + } + var content string + if sec.ContentRef != "" { + rc, _, err := d.Storage.Get(r.Context(), sec.ContentRef) + if err == nil { + raw, _ := io.ReadAll(rc) + rc.Close() + content = string(raw) + } + } + enriched = append(enriched, sectionWithContent{sec: sec, content: content}) + } + + // Always extract spans for /v1/answer — they ground each citation. + spanExtractor := d.spanExtractor(body.Model) + runSpansConcurrent(r.Context(), spanExtractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger) + + // Synthesise. Feed only the spans (when available) + section + // titles into the prompt so the model stays grounded in the + // retrieved evidence. 
+ synthModel := d.Answer.Model + if synthModel == "" { + synthModel = body.Model + } + if synthModel == "" { + synthModel = d.LLMModel + } + maxAnswerTokens := body.MaxAnswerTokens + if maxAnswerTokens <= 0 { + maxAnswerTokens = d.Answer.MaxAnswerTokens + } + if maxAnswerTokens <= 0 { + maxAnswerTokens = 1024 + } + + answerText, synthUsage, err := synthesiseAnswer(r.Context(), d.LLM, synthModel, body.Query, enriched, maxAnswerTokens) + if err != nil { + writeErr(w, http.StatusInternalServerError, "synthesis failed: "+err.Error()) + return + } + totalUsage.Add(synthUsage) + + citations := make([]map[string]any, 0, len(enriched)) + for _, e := range enriched { + c := map[string]any{ + "section_id": e.sec.ID, + "title": e.sec.Title, + } + if e.sec.PageStart > 0 { + c["page_start"] = e.sec.PageStart } - if len(sec.CandidateQuestions) > 0 { - s["candidate_questions"] = sec.CandidateQuestions + if e.sec.PageEnd > 0 { + c["page_end"] = e.sec.PageEnd } - sections = append(sections, s) + if e.span != nil && e.span.Text != "" { + c["quote"] = e.span.Text + if e.span.Start >= 0 && e.span.End > e.span.Start { + c["quote_start"] = e.span.Start + c["quote_end"] = e.span.End + } + } + citations = append(citations, c) } writeJSON(w, http.StatusOK, map[string]any{ "document_id": body.DocumentID, "query": body.Query, + "answer": answerText, + "citations": citations, "strategy": d.Strategy.Name(), - "model": body.Model, - "sections": sections, - "elapsed_ms": time.Since(started).Milliseconds(), + "model": synthModel, + "usage": map[string]any{ + "input_tokens": totalUsage.InputTokens, + "output_tokens": totalUsage.OutputTokens, + "total_tokens": totalUsage.TotalTokens, + "cost_usd": totalUsage.CostUSD, + "llm_calls": totalUsage.LLMCalls, + }, + "elapsed_ms": time.Since(started).Milliseconds(), }) } +// synthesiseAnswer runs one LLM call producing the final answer from +// retrieved sections + their extracted spans. The model is told to +// cite by section ID. 
+func synthesiseAnswer(ctx context.Context, client llmgate.Client, model, query string, secs []sectionWithContent, maxAnswerTokens int) (string, retrieval.Usage, error) { + var b strings.Builder + b.WriteString("You are answering a user's question using ONLY the evidence below.\n\n") + b.WriteString("User query:\n") + b.WriteString(query) + b.WriteString("\n\nRetrieved evidence (each block is a section of the document):\n") + for i, e := range secs { + fmt.Fprintf(&b, "\n[%d] section_id=%s, title=%q", i+1, e.sec.ID, e.sec.Title) + if e.sec.PageStart > 0 { + fmt.Fprintf(&b, ", pages=%d-%d", e.sec.PageStart, e.sec.PageEnd) + } + b.WriteString("\n") + if e.span != nil && e.span.Text != "" { + fmt.Fprintf(&b, "Most relevant quote: %q\n", e.span.Text) + } + // Always include some content so the model isn't blind when the + // span extractor returned nothing. + if e.content != "" { + snippet := e.content + if len(snippet) > 4000 { + snippet = snippet[:4000] + } + fmt.Fprintf(&b, "Section content:\n%s\n", snippet) + } + } + b.WriteString("\nWrite a concise answer to the user's query. ") + b.WriteString("If the evidence does not contain an answer, say so. ") + b.WriteString("Inline citations should reference the section_id values shown above. ") + b.WriteString("Output plain prose; no JSON.") + + req := llmgate.Request{ + Model: model, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: "You synthesise grounded answers from retrieved document sections. 
Never invent facts; only cite what the evidence shows."}, + {Role: llmgate.RoleUser, Content: b.String()}, + }, + MaxTokens: maxAnswerTokens, + Temperature: 0, + } + resp, err := client.Complete(ctx, req) + if err != nil { + return "", retrieval.Usage{}, err + } + return strings.TrimSpace(resp.Content), retrieval.Usage{ + InputTokens: resp.Usage.InputTokens, + OutputTokens: resp.Usage.OutputTokens, + TotalTokens: resp.Usage.TotalTokens, + CostUSD: resp.Usage.CostUSD, + LLMCalls: 1, + }, nil +} + // handleQueryMulti accepts { document_ids, query, model?, max_tokens?, // reserved_for_prompt?, max_parallel_calls?, max_sections? } and runs the // retrieval strategy against every document in parallel, returning diff --git a/openapi.yaml b/openapi.yaml index 81bef87..c561693 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -246,6 +246,39 @@ paths: "404": $ref: "#/components/responses/NotFound" + /v1/answer: + post: + tags: [Query] + summary: Quote-grounded answer (retrieval + span + synthesis) + operationId: answer + description: | + Single round-trip endpoint that runs retrieval, per-section + answer-span extraction, and a synthesis LLM call to return a + natural-language answer plus citations. Every citation + includes the cited section's ID, page range (when known), and + the verbatim quote the answer relies on. + + This is the most regulator-defensible endpoint: the answer is + grounded in retrieved sections, and each claim carries a + replayable provenance trail. 
+ requestBody: + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/AnswerRequest" + responses: + "200": + description: Synthesised answer with citations + content: + application/json: + schema: + $ref: "#/components/schemas/AnswerResponse" + "404": + $ref: "#/components/responses/NotFound" + "501": + description: Endpoint not available — no LLM client configured + /v1/query/stream: post: tags: [Query] @@ -465,3 +498,98 @@ components: content: type: string description: Full section content from storage. + answer_span: + $ref: "#/components/schemas/AnswerSpan" + + AnswerSpan: + type: object + description: | + Verbatim quote from the section judged most relevant to the + query, plus byte offsets back into the section's content. + Returned only when `retrieval.answer_span.enabled` is true. + When `start` and `end` are both -1 the model paraphrased + despite the verbatim-quote rule — the text is preserved but + the offsets are sentinel. + properties: + start: + type: integer + end: + type: integer + text: + type: string + + AnswerRequest: + type: object + required: [document_id, query] + properties: + document_id: + type: string + query: + type: string + model: + type: string + max_tokens: + type: integer + reserved_for_prompt: + type: integer + max_parallel_calls: + type: integer + max_sections: + type: integer + description: Cap on sections fed into synthesis. Defaults to retrieval.answer.max_sections (5). + max_answer_tokens: + type: integer + description: Bound the synthesised answer length. Defaults to retrieval.answer.max_answer_tokens (1024). + + AnswerResponse: + type: object + properties: + document_id: + type: string + query: + type: string + answer: + type: string + description: Natural-language answer grounded in the cited sections. 
+ citations: + type: array + items: + $ref: "#/components/schemas/AnswerCitation" + strategy: + type: string + model: + type: string + usage: + type: object + properties: + input_tokens: {type: integer} + output_tokens: {type: integer} + total_tokens: {type: integer} + cost_usd: {type: number} + llm_calls: {type: integer} + elapsed_ms: + type: integer + + AnswerCitation: + type: object + description: | + One citation behind the synthesised answer. `quote` is the + verbatim span the answer relies on (when the span extractor + found one). `quote_start`/`quote_end` give byte offsets into + the source section's content. `page_start`/`page_end` are the + section's page range — omitted for non-paginated formats. + properties: + section_id: + type: string + title: + type: string + quote: + type: string + quote_start: + type: integer + quote_end: + type: integer + page_start: + type: integer + page_end: + type: integer diff --git a/pkg/config/config.go b/pkg/config/config.go index 3845944..525f83d 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -201,6 +201,44 @@ type RetrievalConfig struct { ChunkedTree ChunkedTreeBlock `yaml:"chunked_tree"` Agentic AgenticBlock `yaml:"agentic"` Cache CacheBlock `yaml:"cache"` + AnswerSpan AnswerSpanBlock `yaml:"answer_span"` + Answer AnswerBlock `yaml:"answer"` +} + +// AnswerSpanBlock configures the answer-span extractor. +// +// When enabled, every section returned by /v1/query gets an extra +// `answer_span` field carrying the verbatim quote the model judged +// most relevant to the query, plus byte offsets back into the +// section's content. Costs one LLM call per returned section. +type AnswerSpanBlock struct { + // Enabled toggles per-section span extraction on /v1/query. Default: false. + Enabled bool `yaml:"enabled"` + // Model overrides the budget's model for the span extraction call. + // Empty means use the request's model. 
Keep this on a cheap/fast + // model (the call is short and runs once per returned section). + Model string `yaml:"model"` + // MaxConcurrency caps parallel span-extraction calls per request. + // Default: 4. + MaxConcurrency int `yaml:"max_concurrency"` + // MaxQuoteLen caps the per-section quote length (characters). + // Default: 400. + MaxQuoteLen int `yaml:"max_quote_len"` +} + +// AnswerBlock configures the /v1/answer endpoint, which runs retrieval +// + span extraction + a synthesis LLM call to return a quote-grounded +// answer in a single round-trip. +type AnswerBlock struct { + // Model overrides the budget's model for the synthesis call. + // Empty means use the request's model. + Model string `yaml:"model"` + // MaxSections caps how many sections are fed into synthesis. + // Default: 5. + MaxSections int `yaml:"max_sections"` + // MaxAnswerTokens bounds the synthesised answer length. + // Default: 1024. + MaxAnswerTokens int `yaml:"max_answer_tokens"` } // CacheBlock configures the retrieval-result cache. 
@@ -281,6 +319,15 @@ func Default() Config { MaxEntries: 1024, TTLSeconds: 600, }, + AnswerSpan: AnswerSpanBlock{ + Enabled: false, + MaxConcurrency: 4, + MaxQuoteLen: 400, + }, + Answer: AnswerBlock{ + MaxSections: 5, + MaxAnswerTokens: 1024, + }, }, Ingest: IngestConfig{ GlobalLLMConcurrency: 12, @@ -421,6 +468,30 @@ func applyEnvOverrides(c *Config) { c.Ingest.GlobalLLMConcurrency = n } } + if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Retrieval.AnswerSpan.Enabled = true + case "0", "false", "no", "off": + c.Retrieval.AnswerSpan.Enabled = false + } + } + if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_MODEL"); v != "" { + c.Retrieval.AnswerSpan.Model = v + } + if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_MAX_CONCURRENCY"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Retrieval.AnswerSpan.MaxConcurrency = n + } + } + if v := os.Getenv("VLE_RETRIEVAL_ANSWER_MODEL"); v != "" { + c.Retrieval.Answer.Model = v + } + if v := os.Getenv("VLE_RETRIEVAL_ANSWER_MAX_SECTIONS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + c.Retrieval.Answer.MaxSections = n + } + } } // Validate checks that required fields for the selected drivers are set. diff --git a/pkg/retrieval/span.go b/pkg/retrieval/span.go new file mode 100644 index 0000000..2fdad13 --- /dev/null +++ b/pkg/retrieval/span.go @@ -0,0 +1,214 @@ +package retrieval + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/hallelx2/llmgate" +) + +// AnswerSpan is the most relevant substring of a section's content for +// a given query, with byte offsets back into the original content. +// +// Start and End are byte offsets such that content[Start:End] == Text +// after the locator step. When the span text does not appear verbatim +// in the content (the model paraphrased), Start and End are -1 and +// Text holds the model's quote. 
+type AnswerSpan struct { + Start int `json:"start"` + End int `json:"end"` + Text string `json:"text"` +} + +// SpanExtractor pulls the most query-relevant verbatim span out of a +// section's content with one LLM call. +type SpanExtractor struct { + LLM llmgate.Client + Model string + // MaxQuoteLen caps how many characters the model is allowed to quote. + // Keeps the response tight and forces the model to pick a focused + // span instead of returning the whole section. Default: 400. + MaxQuoteLen int +} + +// NewSpanExtractor constructs a SpanExtractor with sensible defaults. +func NewSpanExtractor(client llmgate.Client, model string) *SpanExtractor { + return &SpanExtractor{LLM: client, Model: model, MaxQuoteLen: 400} +} + +const spanSystemPrompt = `You are a precise quotation engine. Given a section of a document and a user query, extract the SHORTEST verbatim quote from the section that directly answers (or is the most relevant evidence for) the query. + +Rules: +- Quote verbatim from the section. Do not paraphrase, summarize, or invent text. +- Pick the smallest contiguous span that contains the answer. One sentence is usually enough; a phrase is better. +- If the section contains nothing useful for the query, set "found" to false and return an empty quote.` + +const spanJSONSchema = `{ + "type": "object", + "properties": { + "found": {"type": "boolean"}, + "quote": {"type": "string"} + }, + "required": ["found", "quote"] +}` + +// Extract runs one LLM call to pull the most relevant verbatim span +// from sectionContent for query. Returns nil (no error) when the +// section does not contain an answer; that is the no-evidence path, +// not a failure. A non-nil error is returned only on transport / LLM +// failure. 
+func (e *SpanExtractor) Extract(ctx context.Context, sectionContent, query string) (*AnswerSpan, Usage, error) { + if strings.TrimSpace(sectionContent) == "" || strings.TrimSpace(query) == "" { + return nil, Usage{}, nil + } + maxQuote := e.MaxQuoteLen + if maxQuote <= 0 { + maxQuote = 400 + } + + user := fmt.Sprintf( + "Section content:\n---\n%s\n---\n\nUser query:\n%s\n\nReturn a JSON object with `found` (boolean) and `quote` (string, verbatim from the section, ≤ %d characters).", + sectionContent, query, maxQuote, + ) + req := llmgate.Request{ + Model: e.Model, + Messages: []llmgate.Message{ + {Role: llmgate.RoleSystem, Content: spanSystemPrompt}, + {Role: llmgate.RoleUser, Content: user}, + }, + MaxTokens: 512, + Temperature: 0, + JSONMode: true, + JSONSchema: []byte(spanJSONSchema), + } + + resp, err := e.LLM.Complete(ctx, req) + if err != nil { + return nil, Usage{}, fmt.Errorf("span-extract llm call: %w", err) + } + usage := Usage{ + InputTokens: resp.Usage.InputTokens, + OutputTokens: resp.Usage.OutputTokens, + TotalTokens: resp.Usage.TotalTokens, + CostUSD: resp.Usage.CostUSD, + LLMCalls: 1, + } + + quote, found, parseErr := parseSpanResponse(resp.Content) + if parseErr != nil { + return nil, usage, fmt.Errorf("parse span response: %w", parseErr) + } + if !found || strings.TrimSpace(quote) == "" { + return nil, usage, nil + } + if len(quote) > maxQuote { + quote = quote[:maxQuote] + } + + start, end := locateQuote(sectionContent, quote) + return &AnswerSpan{Start: start, End: end, Text: quote}, usage, nil +} + +// locateQuote finds quote in content. Returns -1, -1 when the quote +// does not appear verbatim (the model paraphrased despite the +// instructions). First tries exact substring, then normalised +// whitespace. +func locateQuote(content, quote string) (int, int) { + if i := strings.Index(content, quote); i >= 0 { + return i, i + len(quote) + } + // Whitespace-normalised match: collapse runs of whitespace in both. 
+ normContent := collapseWS(content) + normQuote := collapseWS(quote) + if j := strings.Index(normContent, normQuote); j >= 0 { + // Walk the original content counting normalised characters until + // we reach j; that's our start. Then add normQuote length back + // through the same walk for the end. + start := mapNormToOriginal(content, j) + end := mapNormToOriginal(content, j+len(normQuote)) + if start >= 0 && end > start { + return start, end + } + } + return -1, -1 +} + +func collapseWS(s string) string { + var b strings.Builder + b.Grow(len(s)) + prevWS := false + for i := 0; i < len(s); i++ { + c := s[i] + ws := c == ' ' || c == '\t' || c == '\n' || c == '\r' + if ws { + if !prevWS { + b.WriteByte(' ') + } + prevWS = true + continue + } + b.WriteByte(c) + prevWS = false + } + return b.String() +} + +// mapNormToOriginal returns the index in s that corresponds to the +// normalised-character index n (where the normalised string is +// collapseWS(s)). Returns -1 if n is out of range. 
func mapNormToOriginal(s string, n int) int {
	idx := 0 // normalised index of the byte currently being examined
	prevWS := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		ws := c == ' ' || c == '\t' || c == '\n' || c == '\r'
		if ws && prevWS {
			// Second or later byte of a whitespace run: the run was
			// collapsed to one space, so this byte has no normalised
			// index of its own. Testing idx == n here would return an
			// offset inside the run instead of the character after it.
			continue
		}
		if idx == n {
			return i
		}
		idx++
		prevWS = ws
	}
	// n may point one past the last normalised character (an end offset).
	if idx == n {
		return len(s)
	}
	return -1
}

// spanPayload is the JSON object the model is asked to return.
type spanPayload struct {
	Found bool   `json:"found"`
	Quote string `json:"quote"`
}

// parseSpanResponse decodes the model's reply into (quote, found).
//
// It tolerates a surrounding Markdown code fence, prose before the
// first '{' and text after the last '}'. An empty or all-whitespace
// reply yields ("", false, nil). An error is returned only when what
// remains is not valid JSON for spanPayload.
func parseSpanResponse(raw string) (quote string, found bool, err error) {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return "", false, nil
	}
	if strings.HasPrefix(raw, "```") {
		// Drop the opening fence line (which may carry a language tag)
		// and the closing fence.
		if i := strings.Index(raw, "\n"); i >= 0 {
			raw = raw[i+1:]
		}
		raw = strings.TrimSuffix(raw, "```")
		raw = strings.TrimSpace(raw)
	}
	// Strip anything outside the outermost braces.
	if i := strings.Index(raw, "{"); i > 0 {
		raw = raw[i:]
	}
	if j := strings.LastIndex(raw, "}"); j >= 0 && j < len(raw)-1 {
		raw = raw[:j+1]
	}
	var p spanPayload
	if err := json.Unmarshal([]byte(raw), &p); err != nil {
		return "", false, fmt.Errorf("unmarshal span: %w", err)
	}
	return p.Quote, p.Found, nil
}
+type spanMockLLM struct { + reply string + calls int32 +} + +func (m *spanMockLLM) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) { + atomic.AddInt32(&m.calls, 1) + return &llmgate.Response{Content: m.reply}, nil +} + +func (m *spanMockLLM) CountTokens(ctx context.Context, s string) (int, error) { + return len(s) / 4, nil +} + +func TestSpanExtractor_VerbatimMatch(t *testing.T) { + content := "Apple Inc. reported revenue of $383.3 billion for fiscal 2023, up 2.8% year over year. " + + "The iPhone segment generated $200.6 billion of that total." + query := "What was Apple's fiscal 2023 revenue?" + + m := &spanMockLLM{reply: `{"found":true,"quote":"revenue of $383.3 billion for fiscal 2023"}`} + e := NewSpanExtractor(m, "gemini-2.5-flash") + + span, usage, err := e.Extract(context.Background(), content, query) + if err != nil { + t.Fatalf("Extract: %v", err) + } + if span == nil { + t.Fatalf("expected non-nil span") + } + if span.Text != "revenue of $383.3 billion for fiscal 2023" { + t.Errorf("text = %q", span.Text) + } + if span.Start <= 0 || span.End <= span.Start { + t.Errorf("offsets = (%d, %d)", span.Start, span.End) + } + if got := content[span.Start:span.End]; got != span.Text { + t.Errorf("content[Start:End] = %q, want %q", got, span.Text) + } + if usage.LLMCalls != 1 { + t.Errorf("usage.LLMCalls = %d, want 1", usage.LLMCalls) + } +} + +func TestSpanExtractor_NotFound(t *testing.T) { + content := "This section is about unrelated topics." 
+ m := &spanMockLLM{reply: `{"found":false,"quote":""}`} + e := NewSpanExtractor(m, "gemini-2.5-flash") + + span, usage, err := e.Extract(context.Background(), content, "Q") + if err != nil { + t.Fatalf("Extract: %v", err) + } + if span != nil { + t.Errorf("expected nil span, got %+v", span) + } + if usage.LLMCalls != 1 { + t.Errorf("usage.LLMCalls = %d, want 1", usage.LLMCalls) + } +} + +func TestSpanExtractor_ParaphraseFallsBackToWhitespace(t *testing.T) { + // The original has weird whitespace (a newline mid-sentence + extra + // spaces) but the model returns a normalised version. We should + // still locate it. + content := "Apple Inc. reported revenue of\n $383.3 billion for fiscal 2023, up 2.8% year over year." + m := &spanMockLLM{reply: `{"found":true,"quote":"revenue of $383.3 billion for fiscal 2023"}`} + e := NewSpanExtractor(m, "gemini-2.5-flash") + + span, _, err := e.Extract(context.Background(), content, "revenue?") + if err != nil { + t.Fatalf("Extract: %v", err) + } + if span == nil { + t.Fatalf("expected non-nil span via whitespace match") + } + if span.Start < 0 || span.End < 0 { + t.Errorf("expected resolved offsets via WS-normalised match, got (%d, %d)", span.Start, span.End) + } +} + +func TestSpanExtractor_QuoteNotInContent(t *testing.T) { + // Model invents text not present anywhere — sentinel offsets, but + // span.Text still surfaces what the model said so callers can flag. + content := "Plain content with no apple references at all." 
+ m := &spanMockLLM{reply: `{"found":true,"quote":"hallucinated quote that does not appear"}`} + e := NewSpanExtractor(m, "gemini-2.5-flash") + + span, _, err := e.Extract(context.Background(), content, "Q") + if err != nil { + t.Fatalf("Extract: %v", err) + } + if span == nil { + t.Fatalf("expected non-nil span even with bad quote") + } + if span.Start != -1 || span.End != -1 { + t.Errorf("expected sentinel offsets (-1,-1) for hallucinated quote, got (%d,%d)", span.Start, span.End) + } + if span.Text == "" { + t.Errorf("expected text preserved") + } +} + +func TestSpanExtractor_EmptyInput(t *testing.T) { + m := &spanMockLLM{reply: `{"found":true,"quote":"x"}`} + e := NewSpanExtractor(m, "gemini-2.5-flash") + if span, _, _ := e.Extract(context.Background(), "", "Q"); span != nil { + t.Errorf("empty content should yield nil span without an LLM call") + } + if span, _, _ := e.Extract(context.Background(), "content", ""); span != nil { + t.Errorf("empty query should yield nil span without an LLM call") + } +} + +func TestParseSpanResponse_CodeFence(t *testing.T) { + raw := "```json\n{\"found\":true,\"quote\":\"hello\"}\n```" + q, found, err := parseSpanResponse(raw) + if err != nil { + t.Fatalf("parse: %v", err) + } + if !found || q != "hello" { + t.Errorf("got (%q, %v)", q, found) + } +} + +func TestParseSpanResponse_LeadingProse(t *testing.T) { + raw := "Sure, here is the span: {\"found\":true,\"quote\":\"x\"}" + q, found, _ := parseSpanResponse(raw) + if !found || q != "x" { + t.Errorf("leading-prose parse failed: %q, %v", q, found) + } +} + +func TestLocateQuote_Exact(t *testing.T) { + c := "alpha beta gamma" + s, e := locateQuote(c, "beta") + if s != 6 || e != 10 { + t.Errorf("got (%d,%d), want (6,10)", s, e) + } +} + +func TestLocateQuote_WhitespaceNormalised(t *testing.T) { + c := "alpha\n\n beta gamma" + s, e := locateQuote(c, "beta gamma") + if s < 0 || e <= s { + t.Fatalf("got (%d,%d) — expected resolved offsets", s, e) + } + if !contains(c[s:e], "beta") || 
!contains(c[s:e], "gamma") { + t.Errorf("located span %q does not contain target words", c[s:e]) + } +} + +func contains(s, sub string) bool { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false +}