hallelx2 · hallelx2 · May 27, 2026 · May 27, 2026
diff --git a/cmd/engine/main.go b/cmd/engine/main.go
@@ -124,13 +124,17 @@ func run() error {
 	q.Register(queue.KindIngestDocument, pipeline.Handler())
 
 	deps := api.Deps{
-		Logger:   logger,
-		DB:       pool,
-		Storage:  store,
-		Queue:    q,
-		Strategy: strategy,
-		Version:  version,
-		MultiDoc: multiDoc,
+		Logger:     logger,
+		DB:         pool,
+		Storage:    store,
+		Queue:      q,
+		Strategy:   strategy,
+		Version:    version,
+		MultiDoc:   multiDoc,
+		LLM:        llmClient,
+		LLMModel:   modelFor(cfg.LLM),
+		AnswerSpan: cfg.Retrieval.AnswerSpan,
+		Answer:     cfg.Retrieval.Answer,
 	}
 
 	srv := &http.Server{
@@ -228,6 +232,21 @@ func buildQueue(c config.QueueConfig, dbURL string) (queue.Queue, error) {
 	}
 }
 
+// modelFor returns the chat/general-purpose model configured for the
+// driver in c.Driver, or "" when the driver is not anthropic, openai, or
+// gemini. Used as a fallback when an API request omits an explicit model.
+func modelFor(c config.LLMConfig) string {
+	switch c.Driver {
+	case "anthropic":
+		return c.Anthropic.Model
+	case "openai":
+		return c.OpenAI.Model
+	case "gemini":
+		return c.Gemini.Model
+	}
+	return ""
+}
+
 func buildLLM(c config.LLMConfig) (llmgate.Client, error) {
 	switch c.Driver {
 	case "anthropic":

diff --git a/config.example.yaml b/config.example.yaml
@@ -107,6 +107,28 @@ retrieval:
     # doesn't own, so the model knows what else exists in the document.
     include_sibling_breadcrumbs: true
 
+  # answer_span: when enabled, every section returned by /v1/query gets an
+  # extra `answer_span` field carrying the verbatim quote the model judged
+  # most relevant to the query, plus byte offsets back into the section's
+  # content. Costs one LLM call per returned section. Disabled by default.
+  answer_span:
+    enabled: false
+    # Override the model used for span extraction; empty inherits the
+    # request's model. Keep this on a cheap/fast model — the call is
+    # short and runs once per returned section.
+    model: ""
+    max_concurrency: 4
+    max_quote_len: 400
+
+  # answer: /v1/answer endpoint configuration. The endpoint runs
+  # retrieval + per-section span extraction + a synthesis LLM call,
+  # returning {answer, citations:[{section_id, page_start, page_end, quote}]}.
+  answer:
+    # Override the synthesis-call model; empty inherits the request's model.
+    model: ""
+    max_sections: 5
+    max_answer_tokens: 1024
+
 ingest:
   # The summarize and HyDE stages run concurrently. This caps the total
   # number of LLM calls in flight across both stages combined, so the