Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 26 additions & 7 deletions cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,17 @@ func run() error {
q.Register(queue.KindIngestDocument, pipeline.Handler())

deps := api.Deps{
Logger: logger,
DB: pool,
Storage: store,
Queue: q,
Strategy: strategy,
Version: version,
MultiDoc: multiDoc,
Logger: logger,
DB: pool,
Storage: store,
Queue: q,
Strategy: strategy,
Version: version,
MultiDoc: multiDoc,
LLM: llmClient,
LLMModel: modelFor(cfg.LLM),
AnswerSpan: cfg.Retrieval.AnswerSpan,
Answer: cfg.Retrieval.Answer,
}

srv := &http.Server{
Expand Down Expand Up @@ -228,6 +232,21 @@ func buildQueue(c config.QueueConfig, dbURL string) (queue.Queue, error) {
}
}

// modelFor looks up the chat/general-purpose model name configured for
// whichever LLM driver c selects. The result serves as the default model
// when an API request does not name one explicitly. A driver that is not
// one of "anthropic", "openai" or "gemini" yields the empty string.
func modelFor(c config.LLMConfig) string {
	driver := c.Driver
	if driver == "anthropic" {
		return c.Anthropic.Model
	}
	if driver == "openai" {
		return c.OpenAI.Model
	}
	if driver == "gemini" {
		return c.Gemini.Model
	}
	// Unrecognized driver: there is no configured model to fall back to.
	return ""
}

func buildLLM(c config.LLMConfig) (llmgate.Client, error) {
switch c.Driver {
case "anthropic":
Expand Down
22 changes: 22 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ retrieval:
# doesn't own, so the model knows what else exists in the document.
include_sibling_breadcrumbs: true

# answer_span: when enabled, every section returned by /v1/query gets an
# extra `answer_span` field carrying the verbatim quote the model judged
# most relevant to the query, plus byte offsets back into the section's
# content. Costs one LLM call per returned section. Disabled by default;
# set `enabled: true` to opt in.
answer_span:
enabled: false
# Override the model used for span extraction; empty inherits the
# request's model. Keep this on a cheap/fast model — the call is
# short and runs once per returned section.
model: ""
max_concurrency: 4
max_quote_len: 400

# answer: /v1/answer endpoint configuration. The endpoint runs
# retrieval + per-section span extraction + a synthesis LLM call,
# returning {answer, citations:[{section_id, page_start, page_end, quote}]}.
answer:
# Override the synthesis-call model; empty inherits the request's model.
model: ""
max_sections: 5
max_answer_tokens: 1024

ingest:
# The summarize and HyDE stages run concurrently. This caps the total
# number of LLM calls in flight across both stages combined, so the
Expand Down
Loading
Loading