Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ Or via environment variables: `VLE_TLS_CERT_FILE`, `VLE_TLS_KEY_FILE`.
| Markdown | `goldmark` | ATX + Setext headings become section boundaries |
| HTML | `golang.org/x/net/html` | Prefers `<main>`/`<article>`; skips nav/footer/script |
| DOCX | stdlib `archive/zip` + `encoding/xml` | `Heading 1…9` styles become section boundaries |
| PDF | `ledongthuc/pdf` | Font-size heuristic recovers headings from unstructured PDFs |
| PDF | `hallelx2/pdftable` + `ledongthuc/pdf` | pdftable extracts positioned words + ruled / borderless tables (Markdown-rendered, `Metadata["table"]="true"`); font-size heuristic recovers headings; ledongthuc supplies `/Outlines` when present |
| Text | stdlib | Single-section fallback |

New parsers drop in behind a one-method `Parser` interface — see [`pkg/parser/`](pkg/parser/).
Expand Down
29 changes: 28 additions & 1 deletion cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/hallelx2/vectorless-engine/pkg/config"
"github.com/hallelx2/vectorless-engine/pkg/db"
"github.com/hallelx2/vectorless-engine/pkg/ingest"
"github.com/hallelx2/vectorless-engine/pkg/parser"
"github.com/hallelx2/vectorless-engine/pkg/queue"
"github.com/hallelx2/vectorless-engine/pkg/retrieval"
"github.com/hallelx2/vectorless-engine/pkg/storage"
Expand Down Expand Up @@ -171,14 +172,24 @@ func run() error {
DB: pool,
Storage: store,
LLM: llmClient,
Parsers: ingest.DefaultRegistry(),
Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)),
Logger: logger,
HyDEEnabled: cfg.Ingest.HyDE.Enabled,
HyDEModel: cfg.Ingest.HyDE.Model,
HyDENumQuestions: cfg.Ingest.HyDE.NumQuestions,
HyDEConcurrency: cfg.Ingest.HyDE.Concurrency,
GlobalLLMConcurrency: cfg.Ingest.GlobalLLMConcurrency,
})
if cfg.Ingest.Tables.Enabled {
logger.Info("ingest: pdf table extraction enabled",
"vertical_strategy", cfg.Ingest.Tables.VerticalStrategy,
"horizontal_strategy", cfg.Ingest.Tables.HorizontalStrategy,
"min_rows", cfg.Ingest.Tables.MinTableRows,
"min_cols", cfg.Ingest.Tables.MinTableCols,
)
} else {
logger.Info("ingest: pdf table extraction disabled")
}
q.Register(queue.KindIngestDocument, pipeline.Handler())

deps := api.Deps{
Expand Down Expand Up @@ -405,3 +416,19 @@ func newLogger(c config.LogConfig) *slog.Logger {
}
return slog.New(h)
}

// tableOptsFromConfig converts the engine's Tables config block into the
// *parser.TableOpts value handed to ingest.RegistryFromTableOpts.
//
// A nil result is the "tables off" signal: it is returned whenever
// c.Enabled is false. Otherwise the returned options have Enabled set and
// carry the strategy and minimum-size knobs copied verbatim from c.
func tableOptsFromConfig(c config.TablesConfig) *parser.TableOpts {
	// Disabled: hand back nil rather than a zero-valued options struct.
	if !c.Enabled {
		return nil
	}

	opts := new(parser.TableOpts)
	opts.Enabled = true
	opts.VerticalStrategy = c.VerticalStrategy
	opts.HorizontalStrategy = c.HorizontalStrategy
	opts.MinTableRows = c.MinTableRows
	opts.MinTableCols = c.MinTableCols
	return opts
}
30 changes: 29 additions & 1 deletion cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (

"github.com/hallelx2/vectorless-engine/pkg/db"
"github.com/hallelx2/vectorless-engine/pkg/ingest"
"github.com/hallelx2/vectorless-engine/pkg/parser"
"github.com/hallelx2/vectorless-engine/pkg/queue"
"github.com/hallelx2/vectorless-engine/pkg/retrieval"
"github.com/hallelx2/vectorless-engine/pkg/storage"
Expand Down Expand Up @@ -158,14 +159,24 @@ func run() error {
DB: pool,
Storage: store,
LLM: llmClient,
Parsers: ingest.DefaultRegistry(),
Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)),
Logger: logger,
HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled,
HyDEModel: cfg.Engine.Ingest.HyDE.Model,
HyDENumQuestions: cfg.Engine.Ingest.HyDE.NumQuestions,
HyDEConcurrency: cfg.Engine.Ingest.HyDE.Concurrency,
GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency,
})
if cfg.Engine.Ingest.Tables.Enabled {
logger.Info("ingest: pdf table extraction enabled",
"vertical_strategy", cfg.Engine.Ingest.Tables.VerticalStrategy,
"horizontal_strategy", cfg.Engine.Ingest.Tables.HorizontalStrategy,
"min_rows", cfg.Engine.Ingest.Tables.MinTableRows,
"min_cols", cfg.Engine.Ingest.Tables.MinTableCols,
)
} else {
logger.Info("ingest: pdf table extraction disabled")
}
q.Register(queue.KindIngestDocument, pipeline.Handler())

// ── Start subsystems ──────────────────────────────────────────
Expand Down Expand Up @@ -395,3 +406,20 @@ func newLogger(c enginecfg.LogConfig) *slog.Logger {
}
return slog.New(h)
}

// tableOptsFromConfig maps the TablesConfig found in the embedded engine
// config block onto the parser-level *parser.TableOpts that is passed to
// ingest.RegistryFromTableOpts.
//
// It returns nil when c.Enabled is false, which is how callers express
// "table extraction disabled". When enabled, the result has Enabled set
// and mirrors c's strategy and minimum row/column settings unchanged.
func tableOptsFromConfig(c enginecfg.TablesConfig) *parser.TableOpts {
	// Disabled: hand back nil rather than a zero-valued options struct.
	if !c.Enabled {
		return nil
	}

	opts := new(parser.TableOpts)
	opts.Enabled = true
	opts.VerticalStrategy = c.VerticalStrategy
	opts.HorizontalStrategy = c.HorizontalStrategy
	opts.MinTableRows = c.MinTableRows
	opts.MinTableCols = c.MinTableCols
	return opts
}
27 changes: 27 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,33 @@ ingest:
num_questions: 5
concurrency: 4

# Tables: pdftable-driven extraction. Every detected table on a PDF
# page becomes its own Section with `Metadata["table"]="true"`, content
# rendered as GitHub-flavoured Markdown. This is the single biggest
# retrieval-quality lever on documents where numeric answers live in
# balance sheets — text-only extraction collapses tables into a
# space-joined run that's effectively unsearchable.
#
# ENABLED BY DEFAULT. Flip to false if a pathological PDF surfaces a
# regression — table-extraction errors never break ingest (text-only
# output still ships), but the flag is the kill switch.
tables:
enabled: true
# Vertical / horizontal edge-detection strategy. One of:
#   lines (default) — edges from drawn lines/rects/curves
#   lines_strict    — edges from drawn lines only
#   text            — edges inferred from word alignment
#                     (best for borderless / narrative tables)
#   explicit        — caller-supplied coordinates (reserved)
# The two axes mix independently, so "lines" vertical + "text"
# horizontal works for half-ruled tables.
vertical_strategy: "lines"
horizontal_strategy: "lines"
# Drop candidate tables smaller than this. 2x2 is the floor — a
# single row or column is a list or a header, not a table.
min_table_rows: 2
min_table_cols: 2

log:
level: "info" # debug | info | warn | error
format: "json" # json | console
11 changes: 9 additions & 2 deletions docs/ENGINE.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,15 @@ internals.
pooling.
- **Embedded SQL migrations** via `//go:embed`. No Atlas, no goose, no
Flyway. Migration is ten lines of Go; external tools are overkill.
- **ledongthuc/pdf** for PDF — pure Go, no cgo, cross-compiles cleanly.
Trade-off: no OCR, no encrypted PDFs. Deferred to Phase 2+.
- **hallelx2/pdftable** (primary) + **ledongthuc/pdf** (fallback for
`/Outlines` only) for PDF. pdftable is a pure-Go port of pdfplumber:
positioned-word extraction + pdfplumber-parity table-finding pipeline
(`lines` / `lines_strict` / `text` / `explicit` strategies). Detected
tables become Sections flagged with `Metadata["table"]="true"` and
Markdown-rendered content. Encrypted PDFs are auto-decrypted via
pdfcpu's empty-password path. Trade-off: no OCR (scanned PDFs still
unsupported); single-bookmark / outline access still requires
ledongthuc until pdftable exposes the dictionary.
- **goldmark** for Markdown — the Go community's standard, actively
maintained.
- **`golang.org/x/net/html`** for HTML — stdlib-ish, no third-party dep.
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
github.com/go-chi/chi/v5 v5.2.5
github.com/google/uuid v1.6.0
github.com/hallelx2/llmgate v0.2.0
github.com/hallelx2/pdftable v0.3.0
github.com/hibiken/asynq v0.26.0
github.com/jackc/pgx/v5 v5.9.2
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+u
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90=
github.com/hallelx2/llmgate v0.2.0 h1:x/LNCeHUPZpafn2IXi+LqpnZa7TtEQdLVlpkkJTlzBI=
github.com/hallelx2/llmgate v0.2.0/go.mod h1:MK2Ol/5CIweTQ2/9eSiTJ5g/KSSuobNZL9TD3s57JxY=
github.com/hallelx2/pdftable v0.3.0 h1:SwZPu2z4cIR4R30gP+7bpunGh931StjO1vrsxoldiDw=
github.com/hallelx2/pdftable v0.3.0/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI=
github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0=
github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo=
github.com/hhrutter/pkcs7 v0.2.2 h1:xMoifoVWah1LNym3C0pomEiLmyJyVIBXt/8oTPyPz+8=
Expand Down
99 changes: 99 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ type Config struct {
type IngestConfig struct {
HyDE HyDEConfig `yaml:"hyde"`

// Tables configures pdftable's table-finding pass over PDF inputs.
// Enabled by default — tables are the single biggest retrieval-quality
// boost on FinanceBench-style documents because every numeric question
// hides in a balance sheet that text-only extraction collapses.
Tables TablesConfig `yaml:"tables"`

// GlobalLLMConcurrency caps the total number of LLM calls in flight
// across the summarize and HyDE stages combined, which now run
// concurrently. Each stage still respects its own per-stage cap
Expand All @@ -47,6 +53,48 @@ type IngestConfig struct {
GlobalLLMConcurrency int `yaml:"global_llm_concurrency"`
}

// TablesConfig configures the table-extraction stage of the PDF parser.
// The stage runs pdftable's geometry-based finder over every page and
// emits each detected table as its own Section with
// Metadata["table"]="true", so downstream retrieval and the agentic
// navigator can branch on whether a candidate is a numeric table or
// prose.
//
// All knobs are forwarded to pdftable's TableSettings; defaults match
// pdfplumber. See pdftable's docs for the full strategy surface.
//
// Every field can be set from YAML (under ingest.tables) and overridden
// by a VLE_INGEST_TABLES_* environment variable in applyEnvOverrides;
// Config.Validate checks the strategy names and rejects negative minimums.
type TablesConfig struct {
// Enabled toggles the stage. Default: true. Flip to false to
// restore pre-integration text-only output; one config change is
// enough to roll back if a real-world PDF triggers a regression.
//
// Env override: VLE_INGEST_TABLES_ENABLED. Accepted values
// (case-insensitive, surrounding whitespace trimmed) are
// 1/true/yes/on and 0/false/no/off; any other value leaves the
// setting unchanged.
Enabled bool `yaml:"enabled"`

// VerticalStrategy picks the source of vertical column boundaries.
// Default: "lines". Allowed values:
//   - "lines"        edges from drawn lines/rects/curves
//   - "lines_strict" edges from drawn lines only
//   - "text"         edges inferred from word alignment (borderless
//     tables — bank statements, narrative 10-Ks)
//   - "explicit"     caller-supplied coordinates (not yet wired
//     through the engine config; reserved)
//
// Validate also accepts the empty string.
// NOTE(review): presumably an empty value defers to pdftable's own
// default strategy — confirm against the parser.
//
// Env override: VLE_INGEST_TABLES_VERTICAL_STRATEGY (used verbatim,
// not trimmed or lower-cased).
VerticalStrategy string `yaml:"vertical_strategy"`

// HorizontalStrategy picks the source of horizontal row boundaries.
// Same value set, default, and validation as VerticalStrategy; the two
// axes can mix independently (e.g. "lines" vertical + "text"
// horizontal).
//
// Env override: VLE_INGEST_TABLES_HORIZONTAL_STRATEGY (used verbatim).
HorizontalStrategy string `yaml:"horizontal_strategy"`

// MinTableRows drops candidate tables with fewer than this many
// rows. Default: 2. Trivial single-row matches are almost always
// false positives from layout artefacts (form-field grids, ruling
// hairlines on a single line of text).
//
// Validate rejects negative values. Env override:
// VLE_INGEST_TABLES_MIN_ROWS; a non-integer or negative value is
// ignored.
MinTableRows int `yaml:"min_table_rows"`

// MinTableCols drops candidate tables with fewer than this many
// columns. Default: 2. Same rationale as MinTableRows — a single
// column is a vertical list, not a table.
//
// Validate rejects negative values. Env override:
// VLE_INGEST_TABLES_MIN_COLS; a non-integer or negative value is
// ignored.
MinTableCols int `yaml:"min_table_cols"`
}

// HyDEConfig configures the HyDE candidate-question stage. For each
// leaf section the pipeline asks the LLM to enumerate questions the
// section's content can answer; those are later folded into the
Expand Down Expand Up @@ -449,6 +497,13 @@ func Default() Config {
NumQuestions: 5,
Concurrency: 4,
},
Tables: TablesConfig{
Enabled: true,
VerticalStrategy: "lines",
HorizontalStrategy: "lines",
MinTableRows: 2,
MinTableCols: 2,
},
},
Log: LogConfig{Level: "info", Format: "json"},
}
Expand Down Expand Up @@ -581,6 +636,31 @@ func applyEnvOverrides(c *Config) {
c.Ingest.GlobalLLMConcurrency = n
}
}
// pdftable-driven table extraction.
if v := os.Getenv("VLE_INGEST_TABLES_ENABLED"); v != "" {
switch strings.ToLower(strings.TrimSpace(v)) {
case "1", "true", "yes", "on":
c.Ingest.Tables.Enabled = true
case "0", "false", "no", "off":
c.Ingest.Tables.Enabled = false
}
}
if v := os.Getenv("VLE_INGEST_TABLES_VERTICAL_STRATEGY"); v != "" {
c.Ingest.Tables.VerticalStrategy = v
}
if v := os.Getenv("VLE_INGEST_TABLES_HORIZONTAL_STRATEGY"); v != "" {
c.Ingest.Tables.HorizontalStrategy = v
}
if v := os.Getenv("VLE_INGEST_TABLES_MIN_ROWS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n >= 0 {
c.Ingest.Tables.MinTableRows = n
}
}
if v := os.Getenv("VLE_INGEST_TABLES_MIN_COLS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n >= 0 {
c.Ingest.Tables.MinTableCols = n
}
}
if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" {
switch strings.ToLower(strings.TrimSpace(v)) {
case "1", "true", "yes", "on":
Expand Down Expand Up @@ -750,6 +830,25 @@ func (c Config) Validate() error {
return fmt.Errorf("ingest.global_llm_concurrency must be >= 0, got %d", c.Ingest.GlobalLLMConcurrency)
}

switch c.Ingest.Tables.VerticalStrategy {
case "", "lines", "lines_strict", "text", "explicit":
default:
return fmt.Errorf("ingest.tables.vertical_strategy must be one of lines|lines_strict|text|explicit, got %q",
c.Ingest.Tables.VerticalStrategy)
}
switch c.Ingest.Tables.HorizontalStrategy {
case "", "lines", "lines_strict", "text", "explicit":
default:
return fmt.Errorf("ingest.tables.horizontal_strategy must be one of lines|lines_strict|text|explicit, got %q",
c.Ingest.Tables.HorizontalStrategy)
}
if c.Ingest.Tables.MinTableRows < 0 {
return fmt.Errorf("ingest.tables.min_table_rows must be >= 0, got %d", c.Ingest.Tables.MinTableRows)
}
if c.Ingest.Tables.MinTableCols < 0 {
return fmt.Errorf("ingest.tables.min_table_cols must be >= 0, got %d", c.Ingest.Tables.MinTableCols)
}

if c.Retrieval.Planning.CacheSize < 0 {
return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize)
}
Expand Down
Loading
Loading