# Source: github.com/hallelx2/vectorless-engine — config.server.example.yaml (main branch)
#
# Vectorless Server — example configuration.
#
# Every value below has a sensible default. Override with env vars
# (VLS_* for server, VLE_* for engine) or edit this file.
#
# Precedence: env vars > YAML file > built-in defaults.

# ── Server ─────────────────────────────────────────────────────────
server:
  # Listen address in host:port form. The host part is empty here;
  # presumably that binds every interface on port 8080 — confirm.
  addr: ":8080"
  # Timeouts are written as a number plus a unit suffix. YAML reads these
  # unquoted values as plain strings; the unit is presumably interpreted by
  # the config loader.
  # NOTE(review): drain_timeout looks like the graceful-shutdown window —
  # confirm against the server code.
  read_timeout: 30s
  write_timeout: 120s
  drain_timeout: 15s

  # Direct TLS (optional). Leave empty to terminate TLS at your proxy.
  # Both file paths are empty in this example, i.e. direct TLS is not
  # configured.
  tls:
    cert_file: ""
    key_file: ""
    # Quoted so YAML keeps the value as the string "1.2" rather than
    # resolving it to a float.
    min_version: "1.2"   # "1.2" or "1.3"

# ── Authentication ─────────────────────────────────────────────────
auth:
  # Accepted values for `mode`:
  # "none" — all requests are anonymous (default, for local dev).
  # "api_key" — require Authorization: Bearer <key>.
  # This example ships with "none", so switch it before exposing the
  # server beyond local development.
  mode: "none"
  # Left empty on purpose: supply the real key through the environment
  # variable named below instead of committing it to this file.
  # Presumably only consulted when mode is "api_key" — confirm.
  api_key: ""   # set via VLS_AUTH_API_KEY in production

# ── Prometheus Metrics ─────────────────────────────────────────────
metrics:
  # Switch for the Prometheus scrape endpoint; on in this example.
  # NOTE(review): whether /metrics is covered by the `auth` settings is
  # not stated in this file — confirm before exposing it publicly.
  enabled: true   # serves /metrics endpoint

# ── OpenTelemetry Tracing ──────────────────────────────────────────
tracing:
  # Off in this example. The remaining keys presumably take effect only
  # once this is set to true — confirm.
  enabled: false
  # host:port of the collector, with no URL scheme. 4317 is the standard
  # OTLP/gRPC port.
  endpoint: "localhost:4317"   # OTLP gRPC collector
  # true is meant for a local collector only (see inline note).
  insecure: true               # disable TLS for local dev
  service_name: "vectorless-server"
  # Fraction of traces to keep; lower it to reduce trace volume.
  sample_rate: 1.0             # 0.0–1.0; 1.0 = sample everything

# ── Rate Limiting ──────────────────────────────────────────────────
rate_limit:
  # Off in this example.
  enabled: false
  # 600 per minute averages out to 10 requests per second.
  # NOTE(review): the scope of the limit (per client, per API key, or
  # global) is not stated in this file — confirm against the server code.
  requests_per_minute: 600

# ── Engine Configuration ───────────────────────────────────────────
# Everything below is passed through to the vectorless engine.
engine:
  # Postgres connection. The example URL points at localhost with
  # placeholder credentials, and sslmode=disable turns off TLS on the
  # database connection — change both for any non-local deployment.
  # max_conns is presumably the connection-pool ceiling — confirm.
  database:
    url: "postgres://vectorless:vectorless@localhost:5432/vectorless?sslmode=disable"
    max_conns: 10

  # Storage backend selection. Only the sub-section for the active driver
  # is filled in; the s3 block below is a commented-out template.
  # NOTE(review): the s3 sample values (port 9000, "minioadmin") look like
  # a local MinIO setup — confirm, and never reuse them as real credentials.
  storage:
    driver: "local"   # "local" or "s3"
    local:
      root: "./data/documents"
    # s3:
    #   endpoint: "http://localhost:9000"
    #   region: "us-east-1"
    #   bucket: "vectorless"
    #   access_key: "minioadmin"
    #   secret_key: "minioadmin"
    #   use_path_style: true

  # Queue backend selection. Only the river sub-section is active; the
  # qstash and asynq blocks are commented-out templates to enable when
  # `driver` is switched.
  queue:
    driver: "river"   # "river", "qstash", or "asynq"
    river:
      num_workers: 10
    # qstash:
    #   token: ""
    #   webhook_base_url: "https://your-server.com"
    #   current_signing_key: ""
    #   next_signing_key: ""
    # asynq:
    #   addr: "localhost:6379"
    #   password: ""
    #   db: 0
    #   concurrency: 20

  # LLM provider selection. Only the anthropic sub-section is active; the
  # openai and gemini blocks are commented-out templates.
  # NOTE(review): the file header assigns the VLE_* prefix to engine
  # settings, but the api_key comment below names a VLS_* variable —
  # confirm which variable the loader actually reads.
  # NOTE(review): what an empty reasoning_model falls back to is not stated
  # here — presumably `model`; confirm.
  llm:
    driver: "anthropic"   # "anthropic", "openai", or "gemini"
    anthropic:
      api_key: ""   # set via VLS_ANTHROPIC_API_KEY
      model: "claude-sonnet-4-20250514"
      reasoning_model: ""
    # openai:
    #   api_key: ""
    #   model: "gpt-4o"
    #   reasoning_model: ""
    # gemini:
    #   api_key: ""
    #   model: "gemini-2.0-flash"
    #   reasoning_model: ""

  # Retrieval strategy. Mind the spelling: the strategy value uses a hyphen
  # ("chunked-tree") while its settings key uses an underscore
  # (chunked_tree). Those settings presumably apply only when that strategy
  # is selected — confirm.
  retrieval:
    strategy: "chunked-tree"   # "single-pass" or "chunked-tree"
    chunked_tree:
      max_tokens_per_call: 60000
      max_parallel_calls: 8
      include_sibling_breadcrumbs: true

  ingest:
    # The summarize and HyDE stages run concurrently. This caps the total
    # number of LLM calls in flight across both stages combined.
    # 0 disables the global cap; default is 12.
    global_llm_concurrency: 12

    # HyDE candidate-question generation per leaf section. Folded into
    # the retrieval prompt at query time to widen recall on queries that
    # don't echo the section's exact wording.
    # The stage-level concurrency here (4) sits below the global cap of 12
    # set above.
    hyde:
      enabled: true
      model: ""             # empty => same model as summarization
      num_questions: 5
      concurrency: 4

    # Multi-axis structured summaries (Phase 2.5). JSON-mode summarizer
    # returns {topics, entities, numbers, one_line}. The retrieval
    # prompt surfaces entities + numbers on the section line; the
    # one_line continues to populate the flat `summary` field for
    # backward compatibility.
    # The max_* keys are presumably per-section caps on the matching
    # lists in that JSON — confirm.
    summary_axes:
      enabled: true
      max_topics: 4
      max_entities: 8
      max_numbers: 6

  # Log verbosity and output encoding; accepted values are listed inline.
  log:
    level: "info"     # "debug", "info", "warn", "error"
    format: "json"    # "json" or "console"