From 39e677fb14976935f31a4058032dbe7ebae17599 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 01:41:33 +0100
Subject: [PATCH] feat: answer-span extraction + /v1/answer endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1.1 + 3.3 of the engine plan: turns the whitepaper's
"answer has a verbatim quote with a page number" claim into two
demonstrable, regulator-defensible features.

pkg/retrieval/span.go
  - SpanExtractor.Extract(ctx, content, query) runs one LLM call,
    asks for the SHORTEST verbatim quote from the section that
    answers the query, returns AnswerSpan{Start, End, Text} with
    byte offsets back into content.
  - Two-pass locator: exact substring first, then whitespace-
    normalised match for content with weird linebreaks/spacing.
  - When the model paraphrases despite the verbatim rule, offsets
    are the sentinel (-1, -1) but Text is preserved so callers can
    flag the quote as unverified.
  - Schema-validated JSON output ({found, quote}) with the same
    tolerant parser pattern as ParseSelection (code fences, leading
    prose).

internal/api/server.go
  - Deps gains LLM (llmgate.Client), LLMModel (default fallback),
    AnswerSpan + Answer config blocks. cmd/engine/main.go wires them.
  - /v1/query: when retrieval.answer_span.enabled, every returned
    section gets an `answer_span` field. Bounded concurrency
    (config) via a semaphore; failures are logged and dropped.
  - POST /v1/answer (NEW): single round-trip runs retrieval +
    per-section span extraction + a synthesis LLM call, returns
    {answer, citations:[{section_id, title, page_start, page_end,
    quote, quote_start, quote_end}], strategy, model, usage,
    elapsed_ms}. The model is instructed to cite by section_id.

pkg/config/config.go
  - RetrievalConfig.AnswerSpan + .Answer blocks; defaults preserve
    existing behaviour: AnswerSpan disabled (MaxConcurrency=4,
    MaxQuoteLen=400), Answer.MaxSections=5,
    Answer.MaxAnswerTokens=1024.
  - VLE_RETRIEVAL_ANSWER_SPAN_ENABLED / _MODEL / _MAX_CONCURRENCY
    and VLE_RETRIEVAL_ANSWER_MODEL / _MAX_SECTIONS env overrides
    follow the existing pattern. max_quote_len and
    max_answer_tokens are config-file only.

openapi.yaml
  - /v1/answer endpoint + AnswerRequest / AnswerResponse /
    AnswerCitation / AnswerSpan schemas. answer_span added as an
    optional field on QuerySection (omitempty in JSON).

Tests:
  - pkg/retrieval/span_test.go covers the verbatim path, the
    not-found path, whitespace-normalised location, the
    hallucinated-quote sentinel-offset path, empty inputs,
    code-fence stripping, leading-prose stripping, and locateQuote
    edge cases. All pass.
  - go build ./..., go vet ./..., go test ./... — all green.
---
 cmd/engine/main.go         |  33 +++-
 config.example.yaml        |  22 +++
 internal/api/server.go     | 383 +++++++++++++++++++++++++++++++++++--
 openapi.yaml               | 128 +++++++++++++
 pkg/config/config.go       |  71 +++++++
 pkg/retrieval/span.go      | 214 +++++++++++++++++++++
 pkg/retrieval/span_test.go | 172 +++++++++++++++++
 7 files changed, 998 insertions(+), 25 deletions(-)
 create mode 100644 pkg/retrieval/span.go
 create mode 100644 pkg/retrieval/span_test.go

diff --git a/cmd/engine/main.go b/cmd/engine/main.go
index c63f2b6..ccaab06 100644
--- a/cmd/engine/main.go
+++ b/cmd/engine/main.go
@@ -124,13 +124,17 @@ func run() error {
 	q.Register(queue.KindIngestDocument, pipeline.Handler())
 
 	deps := api.Deps{
-		Logger:   logger,
-		DB:       pool,
-		Storage:  store,
-		Queue:    q,
-		Strategy: strategy,
-		Version:  version,
-		MultiDoc: multiDoc,
+		Logger:     logger,
+		DB:         pool,
+		Storage:    store,
+		Queue:      q,
+		Strategy:   strategy,
+		Version:    version,
+		MultiDoc:   multiDoc,
+		LLM:        llmClient,
+		LLMModel:   modelFor(cfg.LLM),
+		AnswerSpan: cfg.Retrieval.AnswerSpan,
+		Answer:     cfg.Retrieval.Answer,
 	}
 
 	srv := &http.Server{
@@ -228,6 +232,21 @@ func buildQueue(c config.QueueConfig, dbURL string) (queue.Queue, error) {
 	}
 }
 
+// modelFor returns the model name configured for the selected LLM
+// driver ("anthropic", "openai" or "gemini") and "" for any other
+// driver. It seeds the API's fallback model for requests that omit one.
+func modelFor(c config.LLMConfig) string {
+	switch c.Driver {
+	case "anthropic":
+		return c.Anthropic.Model
+	case "openai":
+		return c.OpenAI.Model
+	case "gemini":
+		return c.Gemini.Model
+	}
+	return ""
+}
+
 func buildLLM(c config.LLMConfig) (llmgate.Client, error) {
 	switch c.Driver {
 	case "anthropic":
diff --git a/config.example.yaml b/config.example.yaml
index 2325315..6faf264 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -107,6 +107,28 @@ retrieval:
     # doesn't own, so the model knows what else exists in the document.
     include_sibling_breadcrumbs: true
 
+  # answer_span: when enabled, every section returned by /v1/query gets an
+  # extra `answer_span` field carrying the verbatim quote the model judged
+  # most relevant to the query, plus byte offsets back into the section's
+  # content. Costs one LLM call per returned section. Disabled by default.
+  answer_span:
+    enabled: false
+    # Override the model used for span extraction; empty falls back to the
+    # request's model, then the LLM driver's configured model. Keep this on
+    # a cheap/fast model — the call is short and runs once per section.
+    model: ""
+    max_concurrency: 4
+    max_quote_len: 400
+
+  # answer: /v1/answer endpoint configuration. The endpoint runs
+  # retrieval + per-section span extraction + a synthesis LLM call,
+  # returning {answer, citations:[{section_id, page_start, page_end, quote}]}.
+  answer:
+    # Override the synthesis-call model; empty falls back to the request's model, then the driver's model.
+    model: ""
+    max_sections: 5
+    max_answer_tokens: 1024
+
 ingest:
   # The summarize and HyDE stages run concurrently. This caps the total
   # number of LLM calls in flight across both stages combined, so the
diff --git a/internal/api/server.go b/internal/api/server.go
index bae923c..86134ee 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -5,18 +5,23 @@
 package api
 
 import (
+	"context"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"log/slog"
 	"net/http"
 	"strconv"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/go-chi/chi/v5"
 	"github.com/go-chi/chi/v5/middleware"
+	"github.com/hallelx2/llmgate"
 
+	"github.com/hallelx2/vectorless-engine/pkg/config"
 	"github.com/hallelx2/vectorless-engine/pkg/db"
 	"github.com/hallelx2/vectorless-engine/pkg/ingest"
 	"github.com/hallelx2/vectorless-engine/pkg/queue"
@@ -44,6 +49,20 @@ type Deps struct {
 	// MultiDoc is the multi-document query dispatcher. If nil, the
 	// /v1/query/multi endpoint returns 501.
 	MultiDoc *retrieval.MultiDoc
+
+	// LLM is the shared llmgate client used by handlers that issue
+	// LLM calls outside the retrieval strategy (answer-span extraction,
+	// /v1/answer synthesis). When nil, /v1/answer returns 501 and
+	// /v1/query skips answer-span extraction.
+	LLM llmgate.Client
+
+	// LLMModel is the default model name. Per-request overrides win.
+	LLMModel string
+
+	// AnswerSpan / Answer hold the relevant config blocks. Default
+	// values (AnswerSpan disabled, Answer.MaxSections=5) are safe.
+	AnswerSpan config.AnswerSpanBlock
+	Answer     config.AnswerBlock
 }
 
 // Router builds and returns the chi router wired with v1 routes.
@@ -69,6 +88,7 @@ func Router(d Deps) http.Handler {
 		r.Get("/sections/{id}", d.handleGetSection)
 		r.Post("/query", d.handleQuery)
 		r.Post("/query/multi", d.handleQueryMulti)
+		r.Post("/answer", d.handleAnswer)
 	})
 
 	r.Post("/internal/jobs/{kind}", d.handleQueueWebhook)
@@ -410,7 +430,7 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 		ids = ids[:body.MaxSections]
 	}
 
-	sections := make([]map[string]any, 0, len(ids))
+	enriched := make([]sectionWithContent, 0, len(ids))
 	for _, id := range ids {
 		sec := t.FindByID(id)
 		if sec == nil {
@@ -425,36 +445,363 @@ func (d Deps) handleQuery(w http.ResponseWriter, r *http.Request) {
 				content = string(raw)
 			}
 		}
-		s := map[string]any{
-			"id":          sec.ID,
-			"parent_id":   sec.ParentID,
-			"title":       sec.Title,
-			"summary":     sec.Summary,
-			"token_count": sec.TokenCount,
-			"content":     content,
+		enriched = append(enriched, sectionWithContent{sec: sec, content: content})
+	}
+
+	// Optional: per-section answer-span extraction. Opt-in via config —
+	// one LLM call per returned section. Failures are non-fatal; the
+	// section is returned without a span.
+	if d.AnswerSpan.Enabled && d.LLM != nil {
+		extractor := d.spanExtractor(body.Model)
+		runSpansConcurrent(r.Context(), extractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger)
+	}
+
+	sections := make([]map[string]any, 0, len(enriched))
+	for _, e := range enriched {
+		sections = append(sections, sectionWithContentToMap(e))
+	}
+
+	writeJSON(w, http.StatusOK, map[string]any{
+		"document_id": body.DocumentID,
+		"query":       body.Query,
+		"strategy":    d.Strategy.Name(),
+		"model":       body.Model,
+		"sections":    sections,
+		"elapsed_ms":  time.Since(started).Milliseconds(),
+	})
+}
+
+// sectionWithContent bundles a tree section with its loaded content
+// and an optional answer-span. Used by /v1/query and /v1/answer.
+type sectionWithContent struct {
+	sec     *tree.Section         // section node found in the loaded document tree
+	content string                // body read from storage; "" when it could not be loaded
+	span    *retrieval.AnswerSpan // set by runSpansConcurrent; nil when skipped or failed
+}
+
+// sectionWithContentToMap renders e as the per-section API map, omitting optional keys that are unset.
+func sectionWithContentToMap(e sectionWithContent) map[string]any {
+	s := map[string]any{
+		"id":          e.sec.ID,
+		"parent_id":   e.sec.ParentID,
+		"title":       e.sec.Title,
+		"summary":     e.sec.Summary,
+		"token_count": e.sec.TokenCount,
+		"content":     e.content,
+	}
+	if e.sec.PageStart > 0 { // page keys are emitted only for positive values
+		s["page_start"] = e.sec.PageStart
+	}
+	if e.sec.PageEnd > 0 {
+		s["page_end"] = e.sec.PageEnd
+	}
+	if len(e.sec.CandidateQuestions) > 0 {
+		s["candidate_questions"] = e.sec.CandidateQuestions
+	}
+	if e.span != nil { // nil when no span was extracted for this section
+		s["answer_span"] = e.span
+	}
+	return s
+}
+
+// spanExtractor builds a SpanExtractor on d.LLM using the first non-empty model of: the
+// AnswerSpan config override, requestModel, d.LLMModel. A positive MaxQuoteLen replaces the default.
+func (d Deps) spanExtractor(requestModel string) *retrieval.SpanExtractor {
+	model := d.AnswerSpan.Model
+	if model == "" {
+		model = requestModel
+	}
+	if model == "" {
+		model = d.LLMModel
+	}
+	ext := retrieval.NewSpanExtractor(d.LLM, model)
+	if d.AnswerSpan.MaxQuoteLen > 0 {
+		ext.MaxQuoteLen = d.AnswerSpan.MaxQuoteLen
+	}
+	return ext
+}
+
+// runSpansConcurrent fans out span extraction across secs with a
+// max-concurrency semaphore. Each extraction's outcome is written back
+// into the matching slot's `span` field. Errors are logged and dropped
+// — span extraction is best-effort.
+func runSpansConcurrent(ctx context.Context, extractor *retrieval.SpanExtractor, query string, secs []sectionWithContent, maxConcurrency int, logger *slog.Logger) {
+	if maxConcurrency <= 0 {
+		maxConcurrency = 4
+	}
+	sem := make(chan struct{}, maxConcurrency)
+	var wg sync.WaitGroup
+	for i := range secs {
+		i := i
+		if strings.TrimSpace(secs[i].content) == "" {
+			continue
 		}
-		if sec.PageStart > 0 {
-			s["page_start"] = sec.PageStart
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			select {
+			case sem <- struct{}{}:
+				defer func() { <-sem }()
+			case <-ctx.Done():
+				return
+			}
+			span, _, err := extractor.Extract(ctx, secs[i].content, query)
+			if err != nil {
+				if logger != nil {
+					logger.Warn("answer-span: extract failed", "section_id", secs[i].sec.ID, "err", err)
+				}
+				return
+			}
+			secs[i].span = span
+		}()
+	}
+	wg.Wait()
+}
+
+// handleAnswer runs retrieval + per-section answer-span extraction +
+// a synthesis LLM call, returning a quote-grounded answer plus
+// citations in a single round-trip. This is the most regulator-
+// defensible thing the engine can produce — every citation carries a
+// section ID, page range (when known), and the verbatim quote the
+// answer relies on.
+//
+// Body: { document_id, query, model?, max_tokens?, reserved_for_prompt?,
+// max_parallel_calls?, max_sections?, max_answer_tokens? }.
+// Response: { document_id, query, answer, citations:
+//
+//	[{section_id, title, page_start, page_end, quote, quote_start, quote_end}], strategy,
+//	model, usage, elapsed_ms }.
+func (d Deps) handleAnswer(w http.ResponseWriter, r *http.Request) {
+	if d.LLM == nil {
+		writeErr(w, http.StatusNotImplemented, "answer endpoint requires an LLM client")
+		return
+	}
+	if d.Strategy == nil {
+		writeErr(w, http.StatusServiceUnavailable, "no retrieval strategy configured")
+		return
+	}
+
+	var body struct {
+		DocumentID        tree.DocumentID `json:"document_id"`
+		Query             string          `json:"query"`
+		Model             string          `json:"model"`
+		MaxTokens         int             `json:"max_tokens"`
+		ReservedForPrompt int             `json:"reserved_for_prompt"`
+		MaxParallelCalls  int             `json:"max_parallel_calls"`
+		MaxSections       int             `json:"max_sections"`
+		MaxAnswerTokens   int             `json:"max_answer_tokens"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		writeErr(w, http.StatusBadRequest, "invalid json: "+err.Error())
+		return
+	}
+	if body.DocumentID == "" || body.Query == "" {
+		writeErr(w, http.StatusBadRequest, "document_id and query are required")
+		return
+	}
+
+	t, err := d.DB.LoadTree(r.Context(), body.DocumentID, standaloneOrgID, "")
+	if err != nil {
+		if errors.Is(err, db.ErrNotFound) {
+			writeErr(w, http.StatusNotFound, "document not found")
+			return
 		}
-		if sec.PageEnd > 0 {
-			s["page_end"] = sec.PageEnd
+		writeErr(w, http.StatusInternalServerError, err.Error())
+		return
+	}
+
+	budget := retrieval.ContextBudget{
+		ModelName:         body.Model,
+		MaxTokens:         body.MaxTokens,
+		ReservedForPrompt: body.ReservedForPrompt,
+		MaxParallelCalls:  body.MaxParallelCalls,
+	}
+	if budget.MaxTokens == 0 {
+		budget.MaxTokens = 100000
+	}
+	if budget.ReservedForPrompt == 0 {
+		budget.ReservedForPrompt = 4000
+	}
+	if budget.MaxParallelCalls == 0 {
+		budget.MaxParallelCalls = 8
+	}
+
+	started := time.Now()
+	totalUsage := retrieval.Usage{}
+
+	var ids []tree.SectionID
+	var retrievalUsage retrieval.Usage
+	if cs, ok := d.Strategy.(retrieval.CostStrategy); ok {
+		res, err := cs.SelectWithCost(r.Context(), t, body.Query, budget)
+		if err != nil {
+			writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error())
+			return
+		}
+		ids, retrievalUsage = res.SelectedIDs, res.Usage
+	} else {
+		picks, err := d.Strategy.Select(r.Context(), t, body.Query, budget)
+		if err != nil {
+			writeErr(w, http.StatusInternalServerError, "retrieval failed: "+err.Error())
+			return
+		}
+		ids = picks
+	}
+	totalUsage.Add(retrievalUsage)
+
+	maxSections := body.MaxSections
+	if maxSections <= 0 {
+		maxSections = d.Answer.MaxSections
+	}
+	if maxSections <= 0 {
+		maxSections = 5
+	}
+	if len(ids) > maxSections {
+		ids = ids[:maxSections]
+	}
+
+	// Load each section's content.
+	enriched := make([]sectionWithContent, 0, len(ids))
+	for _, id := range ids {
+		sec := t.FindByID(id)
+		if sec == nil {
+			continue
+		}
+		var content string
+		if sec.ContentRef != "" {
+			rc, _, err := d.Storage.Get(r.Context(), sec.ContentRef)
+			if err == nil {
+				raw, _ := io.ReadAll(rc)
+				rc.Close()
+				content = string(raw)
+			}
+		}
+		enriched = append(enriched, sectionWithContent{sec: sec, content: content})
+	}
+
+	// Always extract spans for /v1/answer — they ground each citation.
+	spanExtractor := d.spanExtractor(body.Model)
+	runSpansConcurrent(r.Context(), spanExtractor, body.Query, enriched, d.AnswerSpan.MaxConcurrency, d.Logger)
+
+	// Synthesise. Feed each section's title, extracted span (when
+	// available) and a bounded slice of its content into the prompt
+	// so the model stays grounded in the retrieved evidence.
+	synthModel := d.Answer.Model
+	if synthModel == "" {
+		synthModel = body.Model
+	}
+	if synthModel == "" {
+		synthModel = d.LLMModel
+	}
+	maxAnswerTokens := body.MaxAnswerTokens
+	if maxAnswerTokens <= 0 {
+		maxAnswerTokens = d.Answer.MaxAnswerTokens
+	}
+	if maxAnswerTokens <= 0 {
+		maxAnswerTokens = 1024
+	}
+
+	answerText, synthUsage, err := synthesiseAnswer(r.Context(), d.LLM, synthModel, body.Query, enriched, maxAnswerTokens)
+	if err != nil {
+		writeErr(w, http.StatusInternalServerError, "synthesis failed: "+err.Error())
+		return
+	}
+	totalUsage.Add(synthUsage)
+
+	citations := make([]map[string]any, 0, len(enriched))
+	for _, e := range enriched {
+		c := map[string]any{
+			"section_id": e.sec.ID,
+			"title":      e.sec.Title,
+		}
+		if e.sec.PageStart > 0 {
+			c["page_start"] = e.sec.PageStart
 		}
-		if len(sec.CandidateQuestions) > 0 {
-			s["candidate_questions"] = sec.CandidateQuestions
+		if e.sec.PageEnd > 0 {
+			c["page_end"] = e.sec.PageEnd
 		}
-		sections = append(sections, s)
+		if e.span != nil && e.span.Text != "" {
+			c["quote"] = e.span.Text
+			if e.span.Start >= 0 && e.span.End > e.span.Start {
+				c["quote_start"] = e.span.Start
+				c["quote_end"] = e.span.End
+			}
+		}
+		citations = append(citations, c)
 	}
 
 	writeJSON(w, http.StatusOK, map[string]any{
 		"document_id": body.DocumentID,
 		"query":       body.Query,
+		"answer":      answerText,
+		"citations":   citations,
 		"strategy":    d.Strategy.Name(),
-		"model":       body.Model,
-		"sections":    sections,
-		"elapsed_ms":  time.Since(started).Milliseconds(),
+		"model":       synthModel,
+		"usage": map[string]any{
+			"input_tokens":  totalUsage.InputTokens,
+			"output_tokens": totalUsage.OutputTokens,
+			"total_tokens":  totalUsage.TotalTokens,
+			"cost_usd":      totalUsage.CostUSD,
+			"llm_calls":     totalUsage.LLMCalls,
+		},
+		"elapsed_ms": time.Since(started).Milliseconds(),
 	})
 }
 
+// synthesiseAnswer builds one evidence prompt from secs (section ID,
+// title, page range, extracted quote and up to 4000 bytes of content
+// each) and runs a single LLM call; it returns the trimmed answer and usage.
+func synthesiseAnswer(ctx context.Context, client llmgate.Client, model, query string, secs []sectionWithContent, maxAnswerTokens int) (string, retrieval.Usage, error) {
+	var b strings.Builder
+	b.WriteString("You are answering a user's question using ONLY the evidence below.\n\n")
+	b.WriteString("User query:\n")
+	b.WriteString(query)
+	b.WriteString("\n\nRetrieved evidence (each block is a section of the document):\n")
+	for i, e := range secs {
+		fmt.Fprintf(&b, "\n[%d] section_id=%s, title=%q", i+1, e.sec.ID, e.sec.Title)
+		if e.sec.PageStart > 0 {
+			fmt.Fprintf(&b, ", pages=%d-%d", e.sec.PageStart, e.sec.PageEnd)
+		}
+		b.WriteString("\n")
+		if e.span != nil && e.span.Text != "" {
+			fmt.Fprintf(&b, "Most relevant quote: %q\n", e.span.Text)
+		}
+		// Always include some content so the model isn't blind when the
+		// span extractor returned nothing.
+		if e.content != "" {
+			snippet := e.content
+			if len(snippet) > 4000 { // the cut is a byte index and may land inside a multi-byte rune
+				snippet = strings.ToValidUTF8(snippet[:4000], "") // drops invalid bytes, incl. a split trailing rune
+			}
+			fmt.Fprintf(&b, "Section content:\n%s\n", snippet)
+		}
+	}
+	b.WriteString("\nWrite a concise answer to the user's query. ")
+	b.WriteString("If the evidence does not contain an answer, say so. ")
+	b.WriteString("Inline citations should reference the section_id values shown above. ")
+	b.WriteString("Output plain prose; no JSON.")
+
+	req := llmgate.Request{
+		Model: model,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: "You synthesise grounded answers from retrieved document sections. Never invent facts; only cite what the evidence shows."},
+			{Role: llmgate.RoleUser, Content: b.String()},
+		},
+		MaxTokens:   maxAnswerTokens,
+		Temperature: 0,
+	}
+	resp, err := client.Complete(ctx, req)
+	if err != nil {
+		return "", retrieval.Usage{}, err
+	}
+	return strings.TrimSpace(resp.Content), retrieval.Usage{
+		InputTokens:  resp.Usage.InputTokens,
+		OutputTokens: resp.Usage.OutputTokens,
+		TotalTokens:  resp.Usage.TotalTokens,
+		CostUSD:      resp.Usage.CostUSD,
+		LLMCalls:     1,
+	}, nil
+}
+
 // handleQueryMulti accepts { document_ids, query, model?, max_tokens?,
 // reserved_for_prompt?, max_parallel_calls?, max_sections? } and runs the
 // retrieval strategy against every document in parallel, returning
diff --git a/openapi.yaml b/openapi.yaml
index 81bef87..c561693 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -246,6 +246,39 @@ paths:
         "404":
           $ref: "#/components/responses/NotFound"
 
+  /v1/answer:
+    post:
+      tags: [Query]
+      summary: Quote-grounded answer (retrieval + span + synthesis)
+      operationId: answer
+      description: |
+        Single round-trip endpoint that runs retrieval, per-section
+        answer-span extraction, and a synthesis LLM call to return a
+        natural-language answer plus citations. Every citation
+        includes the cited section's ID, page range (when known), and
+        the verbatim quote the answer relies on.
+
+        This is the most regulator-defensible endpoint: the answer is
+        grounded in retrieved sections, and each claim carries a
+        replayable provenance trail.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/AnswerRequest"
+      responses:
+        "200":
+          description: Synthesised answer with citations
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AnswerResponse"
+        "404":
+          $ref: "#/components/responses/NotFound"
+        "501":
+          description: Endpoint not available — no LLM client configured
+
   /v1/query/stream:
     post:
       tags: [Query]
@@ -465,3 +498,98 @@ components:
         content:
           type: string
           description: Full section content from storage.
+        answer_span:
+          $ref: "#/components/schemas/AnswerSpan"
+
+    AnswerSpan:
+      type: object
+      description: |
+        Verbatim quote from the section judged most relevant to the
+        query, plus byte offsets back into the section's content.
+        Returned only when `retrieval.answer_span.enabled` is true.
+        When `start` and `end` are both -1 the model paraphrased
+        despite the verbatim-quote rule — the text is preserved but
+        the offsets are sentinel.
+      properties:
+        start:
+          type: integer
+        end:
+          type: integer
+        text:
+          type: string
+
+    AnswerRequest:
+      type: object
+      required: [document_id, query]
+      properties:
+        document_id:
+          type: string
+        query:
+          type: string
+        model:
+          type: string
+        max_tokens:
+          type: integer
+        reserved_for_prompt:
+          type: integer
+        max_parallel_calls:
+          type: integer
+        max_sections:
+          type: integer
+          description: Cap on sections fed into synthesis. Defaults to retrieval.answer.max_sections (5).
+        max_answer_tokens:
+          type: integer
+          description: Bound the synthesised answer length. Defaults to retrieval.answer.max_answer_tokens (1024).
+
+    AnswerResponse:
+      type: object
+      properties:
+        document_id:
+          type: string
+        query:
+          type: string
+        answer:
+          type: string
+          description: Natural-language answer grounded in the cited sections.
+        citations:
+          type: array
+          items:
+            $ref: "#/components/schemas/AnswerCitation"
+        strategy:
+          type: string
+        model:
+          type: string
+        usage:
+          type: object
+          properties:
+            input_tokens: {type: integer}
+            output_tokens: {type: integer}
+            total_tokens: {type: integer}
+            cost_usd: {type: number}
+            llm_calls: {type: integer}
+        elapsed_ms:
+          type: integer
+
+    AnswerCitation:
+      type: object
+      description: |
+        One citation behind the synthesised answer. `quote` is the span
+        the extractor returned, when it found one. `quote_start`/`quote_end`
+        are byte offsets into the section's content, omitted when the quote
+        could not be located verbatim. `page_start`/`page_end` are the
+        section's page range — omitted for non-paginated formats.
+      properties:
+        section_id:
+          type: string
+        title:
+          type: string
+        quote:
+          type: string
+        quote_start:
+          type: integer
+        quote_end:
+          type: integer
+        page_start:
+          type: integer
+        page_end:
+          type: integer
diff --git a/pkg/config/config.go b/pkg/config/config.go
index 3845944..525f83d 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -201,6 +201,44 @@ type RetrievalConfig struct {
 	ChunkedTree ChunkedTreeBlock `yaml:"chunked_tree"`
 	Agentic     AgenticBlock     `yaml:"agentic"`
 	Cache       CacheBlock       `yaml:"cache"`
+	AnswerSpan  AnswerSpanBlock  `yaml:"answer_span"`
+	Answer      AnswerBlock      `yaml:"answer"`
+}
+
+// AnswerSpanBlock configures the answer-span extractor.
+//
+// When enabled, sections returned by /v1/query carry an `answer_span`
+// field: the quote the model judged most relevant plus byte offsets
+// into the section's content, at one LLM call per section. /v1/answer
+// always extracts spans and reuses Model, MaxConcurrency and MaxQuoteLen.
+type AnswerSpanBlock struct {
+	// Enabled toggles per-section span extraction on /v1/query. Default: false.
+	Enabled bool `yaml:"enabled"`
+	// Model overrides the model used for the span extraction call.
+	// Empty falls back to the request's model, then the server default.
+	// Keep it cheap/fast: the call is short and runs once per section.
+	Model string `yaml:"model"`
+	// MaxConcurrency caps parallel span-extraction calls per request.
+	// Default: 4.
+	MaxConcurrency int `yaml:"max_concurrency"`
+	// MaxQuoteLen caps the per-section quote length (characters).
+	// Default: 400.
+	MaxQuoteLen int `yaml:"max_quote_len"`
+}
+
+// AnswerBlock configures the /v1/answer endpoint, which runs retrieval
+// + span extraction + a synthesis LLM call to return a quote-grounded
+// answer in a single round-trip.
+type AnswerBlock struct {
+	// Model overrides the model used for the synthesis call. Empty falls
+	// back to the request's model, then the server's default model.
+	Model string `yaml:"model"`
+	// MaxSections caps how many sections are fed into synthesis; a
+	// positive max_sections in the request wins. Default: 5.
+	MaxSections int `yaml:"max_sections"`
+	// MaxAnswerTokens bounds the synthesised answer length; a positive
+	// max_answer_tokens in the request wins. Default: 1024.
+	MaxAnswerTokens int `yaml:"max_answer_tokens"`
 }
 
 // CacheBlock configures the retrieval-result cache.
@@ -281,6 +319,15 @@ func Default() Config {
 				MaxEntries: 1024,
 				TTLSeconds: 600,
 			},
+			AnswerSpan: AnswerSpanBlock{
+				Enabled:        false,
+				MaxConcurrency: 4,
+				MaxQuoteLen:    400,
+			},
+			Answer: AnswerBlock{
+				MaxSections:     5,
+				MaxAnswerTokens: 1024,
+			},
 		},
 		Ingest: IngestConfig{
 			GlobalLLMConcurrency: 12,
@@ -421,6 +468,30 @@ func applyEnvOverrides(c *Config) {
 			c.Ingest.GlobalLLMConcurrency = n
 		}
 	}
+	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" {
+		switch strings.ToLower(strings.TrimSpace(v)) {
+		case "1", "true", "yes", "on":
+			c.Retrieval.AnswerSpan.Enabled = true
+		case "0", "false", "no", "off":
+			c.Retrieval.AnswerSpan.Enabled = false
+		}
+	}
+	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_MODEL"); v != "" {
+		c.Retrieval.AnswerSpan.Model = v
+	}
+	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_MAX_CONCURRENCY"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			c.Retrieval.AnswerSpan.MaxConcurrency = n
+		}
+	}
+	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_MODEL"); v != "" {
+		c.Retrieval.Answer.Model = v
+	}
+	if v := os.Getenv("VLE_RETRIEVAL_ANSWER_MAX_SECTIONS"); v != "" {
+		if n, err := strconv.Atoi(v); err == nil && n > 0 {
+			c.Retrieval.Answer.MaxSections = n
+		}
+	}
 }
 
 // Validate checks that required fields for the selected drivers are set.
diff --git a/pkg/retrieval/span.go b/pkg/retrieval/span.go
new file mode 100644
index 0000000..2fdad13
--- /dev/null
+++ b/pkg/retrieval/span.go
@@ -0,0 +1,214 @@
+package retrieval
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/hallelx2/llmgate"
+)
+
+// AnswerSpan is the most relevant substring of a section's content for
+// a given query, with byte offsets back into the original content.
+//
+// After an exact match content[Start:End] == Text. After a whitespace-
+// normalised match the slice equals Text only once whitespace runs are
+// collapsed. When the quote cannot be located (the model paraphrased),
+// Start and End are both -1 and Text still holds the model's quote.
+type AnswerSpan struct {
+	Start int    `json:"start"`
+	End   int    `json:"end"`
+	Text  string `json:"text"`
+}
+
+// SpanExtractor pulls the most query-relevant verbatim span out of a
+// section's content with one LLM call.
+type SpanExtractor struct {
+	LLM   llmgate.Client // client that receives the single Complete call
+	Model string         // sent as Request.Model on that call
+	// MaxQuoteLen is the quote limit stated to the model in the prompt
+	// and the byte length beyond which Extract truncates the returned
+	// quote. Values <= 0 fall back to 400.
+	MaxQuoteLen int
+}
+
+// NewSpanExtractor returns a SpanExtractor bound to client and model, with MaxQuoteLen preset to 400.
+func NewSpanExtractor(client llmgate.Client, model string) *SpanExtractor {
+	return &SpanExtractor{MaxQuoteLen: 400, Model: model, LLM: client}
+}
+
+const spanSystemPrompt = `You are a precise quotation engine. Given a section of a document and a user query, extract the SHORTEST verbatim quote from the section that directly answers (or is the most relevant evidence for) the query.
+
+Rules:
+- Quote verbatim from the section. Do not paraphrase, summarize, or invent text.
+- Pick the smallest contiguous span that contains the answer. One sentence is usually enough; a phrase is better.
+- If the section contains nothing useful for the query, set "found" to false and return an empty quote.`
+
+const spanJSONSchema = `{
+  "type": "object",
+  "properties": {
+    "found": {"type": "boolean"},
+    "quote": {"type": "string"}
+  },
+  "required": ["found", "quote"]
+}`
+
+// Extract runs one LLM call to pull the most relevant verbatim span
+// from sectionContent for query. It returns a nil span with a nil error
+// when either input is blank (no call is made) or the model reports no
+// evidence. A non-nil error means the LLM call failed or its reply did
+// not parse; Usage is zero only when no call completed.
+func (e *SpanExtractor) Extract(ctx context.Context, sectionContent, query string) (*AnswerSpan, Usage, error) {
+	if strings.TrimSpace(sectionContent) == "" || strings.TrimSpace(query) == "" {
+		return nil, Usage{}, nil
+	}
+	maxQuote := e.MaxQuoteLen
+	if maxQuote <= 0 {
+		maxQuote = 400
+	}
+
+	user := fmt.Sprintf(
+		"Section content:\n---\n%s\n---\n\nUser query:\n%s\n\nReturn a JSON object with `found` (boolean) and `quote` (string, verbatim from the section, ≤ %d characters).",
+		sectionContent, query, maxQuote,
+	)
+	req := llmgate.Request{
+		Model: e.Model,
+		Messages: []llmgate.Message{
+			{Role: llmgate.RoleSystem, Content: spanSystemPrompt},
+			{Role: llmgate.RoleUser, Content: user},
+		},
+		MaxTokens:   512,
+		Temperature: 0,
+		JSONMode:    true,
+		JSONSchema:  []byte(spanJSONSchema),
+	}
+
+	resp, err := e.LLM.Complete(ctx, req)
+	if err != nil {
+		return nil, Usage{}, fmt.Errorf("span-extract llm call: %w", err)
+	}
+	usage := Usage{
+		InputTokens:  resp.Usage.InputTokens,
+		OutputTokens: resp.Usage.OutputTokens,
+		TotalTokens:  resp.Usage.TotalTokens,
+		CostUSD:      resp.Usage.CostUSD,
+		LLMCalls:     1,
+	}
+
+	quote, found, parseErr := parseSpanResponse(resp.Content)
+	if parseErr != nil {
+		return nil, usage, fmt.Errorf("parse span response: %w", parseErr)
+	}
+	if !found || strings.TrimSpace(quote) == "" {
+		return nil, usage, nil
+	}
+	if len(quote) > maxQuote {
+		// The cut is a byte index; drop any rune it split so Text stays valid UTF-8.
+		quote = strings.ToValidUTF8(quote[:maxQuote], "")
+	}
+	start, end := locateQuote(sectionContent, quote)
+	return &AnswerSpan{Start: start, End: end, Text: quote}, usage, nil
+}
+
+// locateQuote returns the byte range [start, end) of quote inside
+// content, or (-1, -1) when it cannot be found. An exact substring
+// search runs first; if that misses, both strings have their
+// whitespace runs collapsed and the search is retried.
+func locateQuote(content, quote string) (int, int) {
+	exact := strings.Index(content, quote)
+	if exact >= 0 {
+		return exact, exact + len(quote)
+	}
+	// Fallback: compare with every whitespace run squeezed to one space,
+	// then translate the normalised hit back to offsets in content.
+	squeezedQuote := collapseWS(quote)
+	hit := strings.Index(collapseWS(content), squeezedQuote)
+	if hit < 0 {
+		return -1, -1
+	}
+	start := mapNormToOriginal(content, hit)
+	end := mapNormToOriginal(content, hit+len(squeezedQuote))
+	if start < 0 || end <= start {
+		return -1, -1
+	}
+	return start, end
+}
+
+// collapseWS replaces each run of ASCII space, tab, LF or CR in s with one space.
+func collapseWS(s string) string {
+	var out strings.Builder
+	out.Grow(len(s))
+	inRun := false
+	for _, c := range []byte(s) {
+		switch c {
+		case ' ', '\t', '\n', '\r':
+			if !inRun {
+				out.WriteByte(' ')
+			}
+			inRun = true
+		default:
+			out.WriteByte(c)
+			inRun = false
+		}
+	}
+	return out.String()
+}
+
+// mapNormToOriginal returns the byte index in s that corresponds to
+// byte index n of collapseWS(s), or -1 if n is out of range. An index
+// that follows a collapsed whitespace run maps to the first byte after
+// the whole run, never to a byte inside it, so a span start does not
+// pick up leading whitespace. n == len(collapseWS(s)) maps to len(s).
+func mapNormToOriginal(s string, n int) int {
+	idx := 0 // bytes of collapseWS(s) produced by s[:i]
+	prevWS := false
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		ws := c == ' ' || c == '\t' || c == '\n' || c == '\r'
+		if ws && prevWS {
+			// Still inside a run that already produced its single space;
+			// this byte has no counterpart in the normalised string.
+			continue
+		}
+		if idx == n {
+			return i
+		}
+		idx++
+		prevWS = ws
+	}
+	if idx == n {
+		return len(s)
+	}
+	return -1
+}
+
+type spanPayload struct {
+	Found bool   `json:"found"` // decoded from the reply's "found" key
+	Quote string `json:"quote"` // decoded from the reply's "quote" key
+}
+
+// parseSpanResponse decodes the model's {found, quote} JSON reply, tolerating a ``` fence and text around the object.
+func parseSpanResponse(raw string) (quote string, found bool, err error) {
+	body := strings.TrimSpace(raw)
+	if body == "" {
+		return "", false, nil
+	}
+	if strings.HasPrefix(body, "```") {
+		if nl := strings.Index(body, "\n"); nl >= 0 {
+			body = body[nl+1:]
+		}
+		body = strings.TrimSpace(strings.TrimSuffix(body, "```"))
+	}
+	if open := strings.Index(body, "{"); open > 0 {
+		body = body[open:]
+	}
+	if last := strings.LastIndex(body, "}"); last >= 0 && last < len(body)-1 {
+		body = body[:last+1]
+	}
+	var payload spanPayload
+	if decodeErr := json.Unmarshal([]byte(body), &payload); decodeErr != nil {
+		return "", false, fmt.Errorf("unmarshal span: %w", decodeErr)
+	}
+	return payload.Quote, payload.Found, nil
+}
diff --git a/pkg/retrieval/span_test.go b/pkg/retrieval/span_test.go
new file mode 100644
index 0000000..0fc3807
--- /dev/null
+++ b/pkg/retrieval/span_test.go
@@ -0,0 +1,172 @@
+package retrieval
+
+import (
+	"context"
+	"sync/atomic"
+	"testing"
+
+	"github.com/hallelx2/llmgate"
+)
+
+// spanMockLLM is a minimal LLM stub for span-extractor tests. The
+// retrieval_test.go file uses an external-package mock; we need an
+// internal one to exercise locateQuote / parseSpanResponse directly.
+type spanMockLLM struct {
+	reply string // returned verbatim as Response.Content by Complete
+	calls int32  // number of Complete invocations; updated atomically
+}
+
+func (m *spanMockLLM) Complete(ctx context.Context, req llmgate.Request) (*llmgate.Response, error) {
+	atomic.AddInt32(&m.calls, 1) // count the call so tests can check how often the LLM was hit
+	return &llmgate.Response{Content: m.reply}, nil // canned reply; ctx and req are ignored
+}
+
+func (m *spanMockLLM) CountTokens(ctx context.Context, s string) (int, error) {
+	return len(s) / 4, nil // fixed 4-bytes-per-token estimate; never errors
+}
+
+// TestSpanExtractor_VerbatimMatch checks the exact-substring path: a quote
+// copied byte-for-byte from the content resolves to real offsets.
+func TestSpanExtractor_VerbatimMatch(t *testing.T) {
+	content := "Apple Inc. reported revenue of $383.3 billion for fiscal 2023, up 2.8% year over year. " +
+		"The iPhone segment generated $200.6 billion of that total."
+	stub := &spanMockLLM{reply: `{"found":true,"quote":"revenue of $383.3 billion for fiscal 2023"}`}
+	extractor := NewSpanExtractor(stub, "gemini-2.5-flash")
+
+	span, usage, err := extractor.Extract(context.Background(), content, "What was Apple's fiscal 2023 revenue?")
+	switch {
+	case err != nil:
+		t.Fatalf("Extract: %v", err)
+	case span == nil:
+		t.Fatalf("expected non-nil span")
+	}
+	if span.Text != "revenue of $383.3 billion for fiscal 2023" {
+		t.Errorf("text = %q", span.Text)
+	}
+	if span.Start <= 0 || span.End <= span.Start {
+		t.Errorf("offsets = (%d, %d)", span.Start, span.End)
+	}
+	if located := content[span.Start:span.End]; located != span.Text {
+		t.Errorf("content[Start:End] = %q, want %q", located, span.Text)
+	}
+	if usage.LLMCalls != 1 {
+		t.Errorf("usage.LLMCalls = %d, want 1", usage.LLMCalls)
+	}
+}
+
+// TestSpanExtractor_NotFound: a found=false reply yields a nil span, no error, and one counted call.
+func TestSpanExtractor_NotFound(t *testing.T) {
+	stub := &spanMockLLM{reply: `{"found":false,"quote":""}`}
+	extractor := NewSpanExtractor(stub, "gemini-2.5-flash")
+
+	span, usage, err := extractor.Extract(context.Background(), "This section is about unrelated topics.", "Q")
+	if err != nil {
+		t.Fatalf("Extract: %v", err)
+	}
+	if span != nil {
+		t.Errorf("expected nil span, got %+v", span)
+	}
+	if calls := usage.LLMCalls; calls != 1 {
+		t.Errorf("usage.LLMCalls = %d, want 1", calls)
+	}
+}
+
+func TestSpanExtractor_ParaphraseFallsBackToWhitespace(t *testing.T) {
+	// The original has weird whitespace (a newline mid-sentence + extra
+	// spaces) but the model returns a normalised version. The located
+	// range must cover exactly the quoted words, whitespace aside.
+	content := "Apple Inc. reported revenue of\n  $383.3   billion for fiscal 2023, up 2.8% year over year."
+	m := &spanMockLLM{reply: `{"found":true,"quote":"revenue of $383.3 billion for fiscal 2023"}`}
+	e := NewSpanExtractor(m, "gemini-2.5-flash")
+
+	span, _, err := e.Extract(context.Background(), content, "revenue?")
+	if err != nil {
+		t.Fatalf("Extract: %v", err)
+	}
+	if span == nil || span.Start < 0 || span.End < span.Start {
+		t.Fatalf("expected resolved offsets via WS-normalised match, got %+v", span)
+	}
+	if got := collapseWS(content[span.Start:span.End]); got != span.Text {
+		t.Errorf("content[Start:End] normalises to %q, want %q", got, span.Text)
+	}
+}
+
+// TestSpanExtractor_QuoteNotInContent: when the model invents text that is
+// absent from the content, the span keeps that text but carries sentinel
+// offsets so callers can flag it.
+func TestSpanExtractor_QuoteNotInContent(t *testing.T) {
+	stub := &spanMockLLM{reply: `{"found":true,"quote":"hallucinated quote that does not appear"}`}
+	extractor := NewSpanExtractor(stub, "gemini-2.5-flash")
+
+	span, _, err := extractor.Extract(context.Background(), "Plain content with no apple references at all.", "Q")
+	if err != nil {
+		t.Fatalf("Extract: %v", err)
+	}
+	if span == nil {
+		t.Fatalf("expected non-nil span even with bad quote")
+	}
+	if start, end := span.Start, span.End; start != -1 || end != -1 {
+		t.Errorf("expected sentinel offsets (-1,-1) for hallucinated quote, got (%d,%d)", start, end)
+	}
+	if span.Text == "" {
+		t.Errorf("expected text preserved")
+	}
+}
+
+// TestSpanExtractor_EmptyInput: blank content or query must short-circuit to a nil span, nil error and no LLM call.
+func TestSpanExtractor_EmptyInput(t *testing.T) {
+	m := &spanMockLLM{reply: `{"found":true,"quote":"x"}`}
+	for _, in := range [][2]string{{"", "Q"}, {"content", ""}, {" \n", "Q"}} {
+		span, _, err := NewSpanExtractor(m, "gemini-2.5-flash").Extract(context.Background(), in[0], in[1])
+		if calls := atomic.LoadInt32(&m.calls); err != nil || span != nil || calls != 0 {
+			t.Errorf("Extract(%q, %q) = (%+v, %v) with %d LLM calls; want nil span, nil error, 0 calls", in[0], in[1], span, err, calls)
+		}
+	}
+}
+
+// TestParseSpanResponse_CodeFence: a ```json fenced reply parses like bare JSON.
+func TestParseSpanResponse_CodeFence(t *testing.T) {
+	quote, ok, err := parseSpanResponse("```json\n{\"found\":true,\"quote\":\"hello\"}\n```")
+	if err != nil {
+		t.Fatalf("parse: %v", err)
+	}
+	if !(ok && quote == "hello") {
+		t.Errorf("got (%q, %v)", quote, ok)
+	}
+}
+
+func TestParseSpanResponse_LeadingProse(t *testing.T) {
+	raw := "Sure, here is the span: {\"found\":true,\"quote\":\"x\"}"
+	q, found, err := parseSpanResponse(raw) // the error is checked too, not discarded
+	if err != nil || !found || q != "x" {
+		t.Errorf("leading-prose parse failed: %q, %v, err=%v", q, found, err)
+	}
+}
+
+// TestLocateQuote_Exact: an exact substring resolves to its byte range.
+func TestLocateQuote_Exact(t *testing.T) {
+	start, end := locateQuote("alpha beta gamma", "beta")
+	if !(start == 6 && end == 10) {
+		t.Errorf("got (%d,%d), want (6,10)", start, end)
+	}
+}
+
+func TestLocateQuote_WhitespaceNormalised(t *testing.T) {
+	const doc = "alpha\n\n  beta   gamma"
+	start, end := locateQuote(doc, "beta gamma")
+	if start < 0 || end <= start {
+		t.Fatalf("got (%d,%d) — expected resolved offsets", start, end)
+	}
+	if got := doc[start:end]; !(contains(got, "beta") && contains(got, "gamma")) {
+		t.Errorf("located span %q does not contain target words", got)
+	}
+}
+
+// contains reports whether sub occurs anywhere in s.
+func contains(s, sub string) bool {
+	at := 0
+	for at+len(sub) <= len(s) && s[at:at+len(sub)] != sub {
+		at++
+	}
+	return at+len(sub) <= len(s)
+}