diff --git a/README.md b/README.md index 648c5c8..d1fc30e 100644 --- a/README.md +++ b/README.md @@ -222,7 +222,7 @@ Or via environment variables: `VLE_TLS_CERT_FILE`, `VLE_TLS_KEY_FILE`. | Markdown | `goldmark` | ATX + Setext headings become section boundaries | | HTML | `golang.org/x/net/html` | Prefers `
`/`
`; skips nav/footer/script | | DOCX | stdlib `archive/zip` + `encoding/xml` | `Heading 1…9` styles become section boundaries | -| PDF | `ledongthuc/pdf` | Font-size heuristic recovers headings from unstructured PDFs | +| PDF | `hallelx2/pdftable` + `ledongthuc/pdf` | pdftable extracts positioned words + ruled / borderless tables (Markdown-rendered, `Metadata["table"]="true"`); font-size heuristic recovers headings; ledongthuc supplies `/Outlines` when present | | Text | stdlib | Single-section fallback | New parsers drop in behind a one-method `Parser` interface — see [`pkg/parser/`](pkg/parser/). diff --git a/cmd/engine/main.go b/cmd/engine/main.go index 04f21bd..5398a31 100644 --- a/cmd/engine/main.go +++ b/cmd/engine/main.go @@ -27,6 +27,7 @@ import ( "github.com/hallelx2/vectorless-engine/pkg/config" "github.com/hallelx2/vectorless-engine/pkg/db" "github.com/hallelx2/vectorless-engine/pkg/ingest" + "github.com/hallelx2/vectorless-engine/pkg/parser" "github.com/hallelx2/vectorless-engine/pkg/queue" "github.com/hallelx2/vectorless-engine/pkg/retrieval" "github.com/hallelx2/vectorless-engine/pkg/storage" @@ -171,7 +172,7 @@ func run() error { DB: pool, Storage: store, LLM: llmClient, - Parsers: ingest.DefaultRegistry(), + Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)), Logger: logger, HyDEEnabled: cfg.Ingest.HyDE.Enabled, HyDEModel: cfg.Ingest.HyDE.Model, @@ -179,6 +180,16 @@ func run() error { HyDEConcurrency: cfg.Ingest.HyDE.Concurrency, GlobalLLMConcurrency: cfg.Ingest.GlobalLLMConcurrency, }) + if cfg.Ingest.Tables.Enabled { + logger.Info("ingest: pdf table extraction enabled", + "vertical_strategy", cfg.Ingest.Tables.VerticalStrategy, + "horizontal_strategy", cfg.Ingest.Tables.HorizontalStrategy, + "min_rows", cfg.Ingest.Tables.MinTableRows, + "min_cols", cfg.Ingest.Tables.MinTableCols, + ) + } else { + logger.Info("ingest: pdf table extraction disabled") + } q.Register(queue.KindIngestDocument, pipeline.Handler()) deps := api.Deps{ @@ -405,3 +416,19 @@ func newLogger(c config.LogConfig) *slog.Logger { } return slog.New(h) } + +// tableOptsFromConfig translates the YAML/env Tables block into the +// parser-level TableOpts struct. Returns nil when tables are disabled so +// the PDF parser short-circuits without instantiating pdftable settings. +func tableOptsFromConfig(c config.TablesConfig) *parser.TableOpts { + if !c.Enabled { + return nil + } + return &parser.TableOpts{ + Enabled: true, + VerticalStrategy: c.VerticalStrategy, + HorizontalStrategy: c.HorizontalStrategy, + MinTableRows: c.MinTableRows, + MinTableCols: c.MinTableCols, + } +} diff --git a/cmd/server/main.go b/cmd/server/main.go index c9b8014..2ac2f61 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -33,6 +33,7 @@ import ( "github.com/hallelx2/vectorless-engine/pkg/db" "github.com/hallelx2/vectorless-engine/pkg/ingest" + "github.com/hallelx2/vectorless-engine/pkg/parser" "github.com/hallelx2/vectorless-engine/pkg/queue" "github.com/hallelx2/vectorless-engine/pkg/retrieval" "github.com/hallelx2/vectorless-engine/pkg/storage" @@ -158,7 +159,7 @@ func run() error { DB: pool, Storage: store, LLM: llmClient, - Parsers: ingest.DefaultRegistry(), + Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)), Logger: logger, HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled, HyDEModel: cfg.Engine.Ingest.HyDE.Model, @@ -166,6 +167,16 @@ func run() error { HyDEConcurrency: cfg.Engine.Ingest.HyDE.Concurrency, GlobalLLMConcurrency: cfg.Engine.Ingest.GlobalLLMConcurrency, }) + if cfg.Engine.Ingest.Tables.Enabled { + logger.Info("ingest: pdf table extraction enabled", + "vertical_strategy", cfg.Engine.Ingest.Tables.VerticalStrategy, + "horizontal_strategy", cfg.Engine.Ingest.Tables.HorizontalStrategy, + "min_rows", cfg.Engine.Ingest.Tables.MinTableRows, + "min_cols", cfg.Engine.Ingest.Tables.MinTableCols, + ) + } else { + logger.Info("ingest: pdf table extraction disabled") + } q.Register(queue.KindIngestDocument, pipeline.Handler()) // ── Start subsystems ────────────────────────────────────────── @@ -395,3 +406,20 @@ func newLogger(c enginecfg.LogConfig) *slog.Logger { } return slog.New(h) } + +// tableOptsFromConfig translates the engine's TablesConfig (from the +// embedded engine config block) into the parser-level TableOpts. Returns +// nil when tables are disabled so the PDF parser short-circuits without +// instantiating pdftable settings. +func tableOptsFromConfig(c enginecfg.TablesConfig) *parser.TableOpts { + if !c.Enabled { + return nil + } + return &parser.TableOpts{ + Enabled: true, + VerticalStrategy: c.VerticalStrategy, + HorizontalStrategy: c.HorizontalStrategy, + MinTableRows: c.MinTableRows, + MinTableCols: c.MinTableCols, + } +} diff --git a/config.example.yaml b/config.example.yaml index 235bb63..ee1b9c4 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -222,6 +222,33 @@ ingest: num_questions: 5 concurrency: 4 + # Tables: pdftable-driven extraction. Every detected table on a PDF + # page becomes its own Section with `Metadata["table"]="true"`, content + # rendered as GitHub-flavoured Markdown. This is the single biggest + # retrieval-quality lever on documents where numeric answers live in + # balance sheets — text-only extraction collapses tables into a + # space-joined run that's effectively unsearchable. + # + # ENABLED BY DEFAULT. Flip to false if a pathological PDF surfaces a + # regression — table-extraction errors never break ingest (text-only + # output still ships), but the flag is the kill switch. + tables: + enabled: true + # Vertical / horizontal edge-detection strategy. One of: + # lines (default) — edges from drawn lines/rects/curves + # lines_strict edges from drawn lines only + # text edges inferred from word alignment + # (best for borderless / narrative tables) + # explicit caller-supplied coordinates (reserved) + # The two axes mix independently, so "lines" vertical + "text" + # horizontal works for half-ruled tables. + vertical_strategy: "lines" + horizontal_strategy: "lines" + # Drop candidate tables smaller than this. 2x2 is the floor — a + # single row or column is a list or a header, not a table. + min_table_rows: 2 + min_table_cols: 2 + log: level: "info" # debug | info | warn | error format: "json" # json | console diff --git a/docs/ENGINE.md b/docs/ENGINE.md index 366b999..17e5e6e 100644 --- a/docs/ENGINE.md +++ b/docs/ENGINE.md @@ -234,8 +234,15 @@ internals. pooling. - **Embedded SQL migrations** via `//go:embed`. No Atlas, no goose, no Flyway. Migration is ten lines of Go; external tools are overkill. -- **ledongthuc/pdf** for PDF — pure Go, no cgo, cross-compiles cleanly. - Trade-off: no OCR, no encrypted PDFs. Deferred to Phase 2+. +- **hallelx2/pdftable** (primary) + **ledongthuc/pdf** (fallback for + `/Outlines` only) for PDF. pdftable is a pure-Go port of pdfplumber: + positioned-word extraction + pdfplumber-parity table-finding pipeline + (`lines` / `lines_strict` / `text` / `explicit` strategies). Detected + tables become Sections flagged with `Metadata["table"]="true"` and + Markdown-rendered content. Encrypted PDFs are auto-decrypted via + pdfcpu's empty-password path. Trade-off: no OCR (scanned PDFs still + unsupported); single-bookmark / outline access still requires + ledongthuc until pdftable exposes the dictionary. - **goldmark** for Markdown — the Go community's standard, actively maintained. - **`golang.org/x/net/html`** for HTML — stdlib-ish, no third-party dep. diff --git a/go.mod b/go.mod index edba3be..130314e 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( github.com/go-chi/chi/v5 v5.2.5 github.com/google/uuid v1.6.0 github.com/hallelx2/llmgate v0.2.0 + github.com/hallelx2/pdftable v0.3.0 github.com/hibiken/asynq v0.26.0 github.com/jackc/pgx/v5 v5.9.2 github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728 diff --git a/go.sum b/go.sum index aad3ef1..d52b49a 100644 --- a/go.sum +++ b/go.sum @@ -134,6 +134,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+u github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90= github.com/hallelx2/llmgate v0.2.0 h1:x/LNCeHUPZpafn2IXi+LqpnZa7TtEQdLVlpkkJTlzBI= github.com/hallelx2/llmgate v0.2.0/go.mod h1:MK2Ol/5CIweTQ2/9eSiTJ5g/KSSuobNZL9TD3s57JxY= +github.com/hallelx2/pdftable v0.3.0 h1:SwZPu2z4cIR4R30gP+7bpunGh931StjO1vrsxoldiDw= +github.com/hallelx2/pdftable v0.3.0/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI= github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0= github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo= github.com/hhrutter/pkcs7 v0.2.2 h1:xMoifoVWah1LNym3C0pomEiLmyJyVIBXt/8oTPyPz+8= diff --git a/pkg/config/config.go b/pkg/config/config.go index 43cd976..a640319 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -35,6 +35,12 @@ type Config struct { type IngestConfig struct { HyDE HyDEConfig `yaml:"hyde"` + // Tables configures pdftable's table-finding pass over PDF inputs. + // Enabled by default — tables are the single biggest retrieval-quality + // boost on FinanceBench-style documents because every numeric question + // hides in a balance sheet that text-only extraction collapses. + Tables TablesConfig `yaml:"tables"` + // GlobalLLMConcurrency caps the total number of LLM calls in flight // across the summarize and HyDE stages combined, which now run // concurrently. Each stage still respects its own per-stage cap @@ -47,6 +53,48 @@ type IngestConfig struct { GlobalLLMConcurrency int `yaml:"global_llm_concurrency"` } +// TablesConfig configures the table-extraction stage of the PDF parser. +// The stage runs pdftable's geometry-based finder over every page and +// emits each detected table as its own Section with +// Metadata["table"]="true", so downstream retrieval and the agentic +// navigator can branch on whether a candidate is a numeric table or +// prose. +// +// All knobs are forwarded to pdftable's TableSettings; defaults match +// pdfplumber. See pdftable's docs for the full strategy surface. +type TablesConfig struct { + // Enabled toggles the stage. Default: true. Flip to false to + // restore pre-integration text-only output; one config change is + // enough to roll back if a real-world PDF triggers a regression. + Enabled bool `yaml:"enabled"` + + // VerticalStrategy picks the source of vertical column boundaries. + // Allowed values: + // - "lines" (default) edges from drawn lines/rects/curves + // - "lines_strict" edges from drawn lines only + // - "text" edges inferred from word alignment (borderless + // tables — bank statements, narrative 10-Ks) + // - "explicit" caller-supplied coordinates (not yet wired + // through the engine config; reserved) + VerticalStrategy string `yaml:"vertical_strategy"` + + // HorizontalStrategy picks the source of horizontal row boundaries. + // Same value set as VerticalStrategy; the two axes can mix + // independently (e.g. "lines" vertical + "text" horizontal). + HorizontalStrategy string `yaml:"horizontal_strategy"` + + // MinTableRows drops candidate tables with fewer than this many + // rows. Default: 2. Trivial single-row matches are almost always + // false positives from layout artefacts (form-field grids, ruling + // hairlines on a single line of text). + MinTableRows int `yaml:"min_table_rows"` + + // MinTableCols drops candidate tables with fewer than this many + // columns. Default: 2. Same rationale as MinTableRows — a single + // column is a vertical list, not a table. + MinTableCols int `yaml:"min_table_cols"` +} + // HyDEConfig configures the HyDE candidate-question stage. For each // leaf section the pipeline asks the LLM to enumerate questions the // section's content can answer; those are later folded into the @@ -449,6 +497,13 @@ func Default() Config { NumQuestions: 5, Concurrency: 4, }, + Tables: TablesConfig{ + Enabled: true, + VerticalStrategy: "lines", + HorizontalStrategy: "lines", + MinTableRows: 2, + MinTableCols: 2, + }, }, Log: LogConfig{Level: "info", Format: "json"}, } @@ -581,6 +636,31 @@ func applyEnvOverrides(c *Config) { c.Ingest.GlobalLLMConcurrency = n } } + // pdftable-driven table extraction. + if v := os.Getenv("VLE_INGEST_TABLES_ENABLED"); v != "" { + switch strings.ToLower(strings.TrimSpace(v)) { + case "1", "true", "yes", "on": + c.Ingest.Tables.Enabled = true + case "0", "false", "no", "off": + c.Ingest.Tables.Enabled = false + } + } + if v := os.Getenv("VLE_INGEST_TABLES_VERTICAL_STRATEGY"); v != "" { + c.Ingest.Tables.VerticalStrategy = v + } + if v := os.Getenv("VLE_INGEST_TABLES_HORIZONTAL_STRATEGY"); v != "" { + c.Ingest.Tables.HorizontalStrategy = v + } + if v := os.Getenv("VLE_INGEST_TABLES_MIN_ROWS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Ingest.Tables.MinTableRows = n + } + } + if v := os.Getenv("VLE_INGEST_TABLES_MIN_COLS"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 0 { + c.Ingest.Tables.MinTableCols = n + } + } if v := os.Getenv("VLE_RETRIEVAL_ANSWER_SPAN_ENABLED"); v != "" { switch strings.ToLower(strings.TrimSpace(v)) { case "1", "true", "yes", "on": @@ -750,6 +830,25 @@ func (c Config) Validate() error { return fmt.Errorf("ingest.global_llm_concurrency must be >= 0, got %d", c.Ingest.GlobalLLMConcurrency) } + switch c.Ingest.Tables.VerticalStrategy { + case "", "lines", "lines_strict", "text", "explicit": + default: + return fmt.Errorf("ingest.tables.vertical_strategy must be one of lines|lines_strict|text|explicit, got %q", + c.Ingest.Tables.VerticalStrategy) + } + switch c.Ingest.Tables.HorizontalStrategy { + case "", "lines", "lines_strict", "text", "explicit": + default: + return fmt.Errorf("ingest.tables.horizontal_strategy must be one of lines|lines_strict|text|explicit, got %q", + c.Ingest.Tables.HorizontalStrategy) + } + if c.Ingest.Tables.MinTableRows < 0 { + return fmt.Errorf("ingest.tables.min_table_rows must be >= 0, got %d", c.Ingest.Tables.MinTableRows) + } + if c.Ingest.Tables.MinTableCols < 0 { + return fmt.Errorf("ingest.tables.min_table_cols must be >= 0, got %d", c.Ingest.Tables.MinTableCols) + } + if c.Retrieval.Planning.CacheSize < 0 { return fmt.Errorf("retrieval.planning.cache_size must be >= 0, got %d", c.Retrieval.Planning.CacheSize) } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index a9d9fab..f71ad41 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -602,3 +602,83 @@ func TestLoadInvalidYAML(t *testing.T) { t.Error("expected error for invalid YAML") } } + +func TestTablesDefaults(t *testing.T) { + t.Parallel() + cfg := Default() + if !cfg.Ingest.Tables.Enabled { + t.Error("ingest.tables.enabled should default to true") + } + if cfg.Ingest.Tables.VerticalStrategy != "lines" { + t.Errorf("vertical_strategy = %q, want lines", cfg.Ingest.Tables.VerticalStrategy) + } + if cfg.Ingest.Tables.HorizontalStrategy != "lines" { + t.Errorf("horizontal_strategy = %q, want lines", cfg.Ingest.Tables.HorizontalStrategy) + } + if cfg.Ingest.Tables.MinTableRows != 2 { + t.Errorf("min_table_rows = %d, want 2", cfg.Ingest.Tables.MinTableRows) + } + if cfg.Ingest.Tables.MinTableCols != 2 { + t.Errorf("min_table_cols = %d, want 2", cfg.Ingest.Tables.MinTableCols) + } +} + +func TestTablesEnvOverride(t *testing.T) { + // Mutates env — restore on exit. Not parallel. + prevEnabled := os.Getenv("VLE_INGEST_TABLES_ENABLED") + prevV := os.Getenv("VLE_INGEST_TABLES_VERTICAL_STRATEGY") + prevH := os.Getenv("VLE_INGEST_TABLES_HORIZONTAL_STRATEGY") + prevRows := os.Getenv("VLE_INGEST_TABLES_MIN_ROWS") + prevCols := os.Getenv("VLE_INGEST_TABLES_MIN_COLS") + defer func() { + os.Setenv("VLE_INGEST_TABLES_ENABLED", prevEnabled) + os.Setenv("VLE_INGEST_TABLES_VERTICAL_STRATEGY", prevV) + os.Setenv("VLE_INGEST_TABLES_HORIZONTAL_STRATEGY", prevH) + os.Setenv("VLE_INGEST_TABLES_MIN_ROWS", prevRows) + os.Setenv("VLE_INGEST_TABLES_MIN_COLS", prevCols) + }() + + os.Setenv("VLE_INGEST_TABLES_ENABLED", "false") + os.Setenv("VLE_INGEST_TABLES_VERTICAL_STRATEGY", "text") + os.Setenv("VLE_INGEST_TABLES_HORIZONTAL_STRATEGY", "lines_strict") + os.Setenv("VLE_INGEST_TABLES_MIN_ROWS", "4") + os.Setenv("VLE_INGEST_TABLES_MIN_COLS", "3") + + cfg := Default() + applyEnvOverrides(&cfg) + + if cfg.Ingest.Tables.Enabled { + t.Error("VLE_INGEST_TABLES_ENABLED=false should disable") + } + if cfg.Ingest.Tables.VerticalStrategy != "text" { + t.Errorf("vertical_strategy = %q, want text", cfg.Ingest.Tables.VerticalStrategy) + } + if cfg.Ingest.Tables.HorizontalStrategy != "lines_strict" { + t.Errorf("horizontal_strategy = %q, want lines_strict", cfg.Ingest.Tables.HorizontalStrategy) + } + if cfg.Ingest.Tables.MinTableRows != 4 { + t.Errorf("min_table_rows = %d, want 4", cfg.Ingest.Tables.MinTableRows) + } + if cfg.Ingest.Tables.MinTableCols != 3 { + t.Errorf("min_table_cols = %d, want 3", cfg.Ingest.Tables.MinTableCols) + } +} + +func TestTablesValidateRejectsBadStrategy(t *testing.T) { + t.Parallel() + cfg := Default() + cfg.Ingest.Tables.VerticalStrategy = "magic" + if err := cfg.Validate(); err == nil { + t.Error("expected error for unknown vertical_strategy") + } + cfg = Default() + cfg.Ingest.Tables.HorizontalStrategy = "wacky" + if err := cfg.Validate(); err == nil { + t.Error("expected error for unknown horizontal_strategy") + } + cfg = Default() + cfg.Ingest.Tables.MinTableRows = -1 + if err := cfg.Validate(); err == nil { + t.Error("expected error for negative min_table_rows") + } +} diff --git a/pkg/ingest/ingest.go b/pkg/ingest/ingest.go index 91a6857..666dee3 100644 --- a/pkg/ingest/ingest.go +++ b/pkg/ingest/ingest.go @@ -676,7 +676,9 @@ func SourceKey(id tree.DocumentID, filename string) string { } // DefaultRegistry returns a parser.Registry preloaded with the parsers -// the engine ships with. Callers may add more via Registry.Register. +// the engine ships with, using the production defaults for each format +// (including table-aware PDF extraction). Callers that need to override +// PDF table behaviour from config should use RegistryFromTableOpts. func DefaultRegistry() *parser.Registry { return parser.NewRegistry( parser.NewMarkdown(), @@ -687,5 +689,19 @@ func DefaultRegistry() *parser.Registry { ) } +// RegistryFromTableOpts returns a parser.Registry where the PDF parser +// is configured from the supplied TableOpts. Pass nil to disable table +// extraction entirely; pass parser.DefaultTableOpts() (or a custom set) +// to enable. All non-PDF parsers are constructed at their defaults. +func RegistryFromTableOpts(opts *parser.TableOpts) *parser.Registry { + return parser.NewRegistry( + parser.NewMarkdown(), + parser.NewHTML(), + parser.NewDOCX(), + parser.NewPDFWithTables(opts), + parser.NewText(), + ) +} + // helper kept for tests — not used by the pipeline itself. var _ = time.Now diff --git a/pkg/parser/pdf.go b/pkg/parser/pdf.go index 01ddb75..8a25c45 100644 --- a/pkg/parser/pdf.go +++ b/pkg/parser/pdf.go @@ -3,12 +3,15 @@ package parser import ( "bytes" "context" + "errors" "fmt" "io" + "log/slog" "regexp" "sort" "strings" + "github.com/hallelx2/pdftable" pdflib "github.com/ledongthuc/pdf" "github.com/pdfcpu/pdfcpu/pkg/api" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" @@ -20,24 +23,79 @@ import ( // headings in the wire layer, just runs of glyphs with font sizes and // positions. To recover structure we: // -// 1. Extract text per page, row-by-row, with font-size information. +// 1. Extract positioned WORDS per page (font name + size + bbox) using +// pdftable's content-stream interpreter. // 2. Compute the median font size across the whole document. // 3. Treat any row whose font size exceeds a threshold (1.2× median) // AND that is short (<= 14 words) as a heading candidate. // 4. Group headings into levels by font-size buckets (largest = level 1). // 5. Everything else is body text for the most recent heading. +// 6. Run pdftable's table-finding pipeline over each page and emit one +// extra Section per detected table whose Content is a GitHub-flavoured +// Markdown rendering of the cells. Tables are flagged with +// Metadata["table"]="true" so retrieval can lean on numeric content +// that would otherwise collapse into a space-joined run. // -// This won't beat a PDF with a proper bookmark outline, but it recovers -// surprisingly usable structure from academic papers, whitepapers, and -// reports. A future parser can read the PDF's /Outlines dictionary -// directly for documents that have one. -// -// Encrypted PDFs, PDFs with non-standard fonts, and scanned PDFs (pure -// images) are not supported at this stage. -type PDF struct{} +// Encrypted PDFs are auto-decrypted with the empty password via pdfcpu. +// PDFs with non-standard fonts and scanned PDFs (pure images) are not +// supported at this stage. +type PDF struct { + // Tables, when non-nil, overrides the default table-extraction + // behaviour (enabled, lines/lines strategies, minima 2×2). Pass nil + // to use the engine defaults; pass a zero value to disable tables + // entirely. + Tables *TableOpts +} + +// TableOpts controls pdftable's table-finding stage. The zero value +// disables table extraction; use DefaultTableOpts() for the +// production-default knobs. +type TableOpts struct { + // Enabled toggles table extraction. When false, the parser behaves + // exactly like the pre-integration text-only flow. + Enabled bool + + // VerticalStrategy is forwarded to pdftable as + // TableSettings.VerticalStrategy. Empty falls back to "lines". + VerticalStrategy string + + // HorizontalStrategy is forwarded to pdftable as + // TableSettings.HorizontalStrategy. Empty falls back to "lines". + HorizontalStrategy string + + // MinTableRows is the minimum row count for a candidate table to be + // emitted as a Section. 0 means "no minimum"; recommend 2 in + // production so trivial single-row matches don't leak into the + // outline. + MinTableRows int + + // MinTableCols is the minimum column count for a candidate table. + // Same semantics as MinTableRows. + MinTableCols int +} -// NewPDF returns a new PDF parser. -func NewPDF() *PDF { return &PDF{} } +// DefaultTableOpts returns the production defaults: tables on, both axes +// using the "lines" strategy, minima 2×2. These mirror pdftable's own +// DefaultTableSettings() and were tuned against the FinanceBench 10-K +// fixtures. +func DefaultTableOpts() *TableOpts { + return &TableOpts{ + Enabled: true, + VerticalStrategy: "lines", + HorizontalStrategy: "lines", + MinTableRows: 2, + MinTableCols: 2, + } +} + +// NewPDF returns a new PDF parser with table extraction enabled at the +// production defaults. Pass NewPDFWithTables(nil) (or a zero TableOpts) +// to opt out of tables. +func NewPDF() *PDF { return &PDF{Tables: DefaultTableOpts()} } + +// NewPDFWithTables returns a PDF parser using the supplied table- +// extraction options. Pass nil to disable table extraction. +func NewPDFWithTables(opts *TableOpts) *PDF { return &PDF{Tables: opts} } // Name implements Parser. func (*PDF) Name() string { return "pdf" } @@ -51,32 +109,58 @@ func (*PDF) Accepts(contentType, filename string) bool { } // Parse implements Parser. -func (*PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) { +func (p *PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) { buf, err := io.ReadAll(r) if err != nil { return nil, err } - reader, err := pdflib.NewReader(bytes.NewReader(buf), int64(len(buf))) + + // We run TWO PDF backends in parallel here: + // + // - pdftable (the new primitive layer) extracts positioned WORDS + // (font name + size + bbox) directly. This is the input to the + // section-discovery heuristics and is also the only source for + // the table-finding pass. It is robust to letter-spaced glyphs + // and ships pdfplumber-parity word grouping out of the box. + // + // - ledongthuc/pdf is retained solely for /Outlines (bookmark) + // access — pdftable does not expose the outline dictionary yet, + // and outlines are ground truth for SEC filings / academic papers + // that have one. Once pdftable surfaces outlines we can drop the + // dependency entirely. + // + // Both backends consume the same byte slice. If pdftable rejects the + // document as encrypted we strip the encryption layer with pdfcpu + // (empty password) and retry — this is the path that lets us index + // "owner-password" PDFs whose only restriction is print/copy. + docBytes := buf + pdoc, err := pdftable.OpenBytes(docBytes) if err != nil { - // ledongthuc/pdf has no encryption support — even PDFs that - // open in any normal viewer (empty user password, owner-only - // permissions like print/copy restrictions) get rejected with - // a "256-bit encryption key" / "encrypted" error. Try to strip - // the encryption layer with pdfcpu using the empty password, - // then retry the parser on the cleaned bytes. - if isEncryptedPDFError(err) { + if isPdftableEncryptedErr(err) { cleaned, decErr := decryptPDFWithEmptyPassword(buf) if decErr != nil { return nil, fmt.Errorf("pdf: open: encrypted and could not be unlocked with empty password: %w", decErr) } - reader, err = pdflib.NewReader(bytes.NewReader(cleaned), int64(len(cleaned))) + docBytes = cleaned + pdoc, err = pdftable.OpenBytes(docBytes) } if err != nil { return nil, fmt.Errorf("pdf: open: %w", err) } } + defer pdoc.Close() + + reader, err := pdflib.NewReader(bytes.NewReader(docBytes), int64(len(docBytes))) + if err != nil { + // ledongthuc/pdf can fail on PDFs pdftable accepts (e.g. some + // xref-stream variants). Outline access is optional, so a + // failure here is not fatal — we just skip the outline path. + // Log at debug level and carry on with the heuristic flow. + slog.Debug("pdf: outline backend unavailable", "err", err) + reader = nil + } - rows, err := extractPDFRows(reader) + rows, err := extractPDFRows(pdoc) if err != nil { return nil, err } @@ -92,13 +176,21 @@ func (*PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) { return nil, fmt.Errorf("pdf: text extraction produced no usable content — the document may have an overlay watermark or use a non-standard font encoding") } + // Run table extraction once, BEFORE we commit to either the outline + // path or the heuristic path: both should be able to inherit the + // same set of detected tables. + tableSections := extractPDFTables(pdoc, p.Tables) + // If the PDF ships with a real outline (bookmarks), use it as ground // truth for structure — beats any font-size heuristic. We still rely // on row extraction for section bodies by matching outline titles // against the first occurrence of that text in the row stream. - if outline := reader.Outline(); len(outline.Child) > 0 { - if doc, ok := parsePDFWithOutline(outline, rows); ok { - return doc, nil + if reader != nil { + if outline := reader.Outline(); len(outline.Child) > 0 { + if doc, ok := parsePDFWithOutline(outline, rows); ok { + attachTableSections(doc, tableSections) + return doc, nil + } } } @@ -264,10 +356,12 @@ func (*PDF) Parse(_ context.Context, r io.Reader) (*ParsedDoc, error) { // so callers reading the outline can still cite a page span. propagateSectionPages(rootSec.Children) - return &ParsedDoc{ + out := &ParsedDoc{ Title: title, Sections: chunkOversizedLeaves(rootSec.Children), - }, nil + } + attachTableSections(out, tableSections) + return out, nil } // propagateSectionPages fills internal-node PageStart/PageEnd from the union @@ -422,26 +516,43 @@ type pdfRow struct { text string } -// extractPDFRows walks each page, grouping letters into rows by y-position -// and recording the dominant font size per row. ledongthuc/pdf's Content() -// returns individual glyphs; we reassemble them into lines. -func extractPDFRows(reader *pdflib.Reader) ([]pdfRow, error) { - numPages := reader.NumPage() +// extractPDFRows walks each page of doc, asks pdftable for positioned +// Words, and groups them into rows by visual top (Y1 in PDF user space). +// pdftable's Words() already takes care of intra-word glyph reassembly, +// letter-spacing collapse, and ligature expansion — so this layer just +// has to bucket words back into lines and tally the dominant font size +// + bold ratio per row. +// +// The bucket tolerance (Y1 within 2pt) matches what the previous +// ledongthuc-backed implementation used; word-level Y1 jitter is the +// same scale as the per-glyph jitter it replaced. +func extractPDFRows(doc pdftable.Document) ([]pdfRow, error) { + numPages := doc.NumPages() var out []pdfRow for pageNum := 1; pageNum <= numPages; pageNum++ { - page := reader.Page(pageNum) - if page.V.IsNull() { + page, err := doc.Page(pageNum) + if err != nil { + // A bad page shouldn't take down the document — pdftable + // can fail page-by-page on malformed content streams. Skip. + continue + } + words, err := page.Words(pdftable.DefaultWordOpts()) + if err != nil { + continue + } + if len(words) == 0 { continue } - content := page.Content() - // Group letters by (approximate) baseline Y. Values within 2pt are - // considered the same row — PDFs frequently jitter Y by a fraction. + // Group words by visual top (Y1). Values within 2pt are + // considered the same row — pdftable already clusters chars + // into words by its own YTolerance, so this is just the next + // step up: words at near-identical baselines become a row. type rowBucket struct { y float64 maxFS float64 - chars []pdflib.Text + words []pdftable.Word } var buckets []*rowBucket find := func(y float64) *rowBucket { @@ -454,47 +565,34 @@ func extractPDFRows(reader *pdflib.Reader) ([]pdfRow, error) { buckets = append(buckets, b) return b } - for _, t := range content.Text { - b := find(t.Y) - b.chars = append(b.chars, t) - if t.FontSize > b.maxFS { - b.maxFS = t.FontSize + for _, w := range words { + b := find(w.Y1) + b.words = append(b.words, w) + if w.FontSize > b.maxFS { + b.maxFS = w.FontSize } } - // Sort rows top-to-bottom (higher Y = higher on page in PDF). + // Sort rows top-to-bottom (higher Y = higher on page in PDF + // user space). sort.Slice(buckets, func(i, j int) bool { return buckets[i].y > buckets[j].y }) for _, b := range buckets { - sort.Slice(b.chars, func(i, j int) bool { return b.chars[i].X < b.chars[j].X }) + sort.Slice(b.words, func(i, j int) bool { return b.words[i].X0 < b.words[j].X0 }) var sb strings.Builder - var lastX float64 - boldGlyphs, totalGlyphs := 0, 0 - for i, ch := range b.chars { - // Insert a space when the gap between the previous - // glyph's end and this glyph's start exceeds a fraction - // of the font size. 0.20 was tuned against real PDFs - // (arXiv papers): word-boundary gaps land around - // 0.20-0.30·fontSize while intra-word kerning stays - // well below. The old 0.30 threshold missed most word - // boundaries, producing run-together text like - // "implementingtensor2tensor". - if i > 0 && ch.X-lastX > ch.FontSize*0.20 { + boldWords, totalWords := 0, 0 + for i, w := range b.words { + if i > 0 { sb.WriteString(" ") } - sb.WriteString(ch.S) - lastX = ch.X + ch.W - if strings.TrimSpace(ch.S) != "" { - totalGlyphs++ - if isBoldFont(ch.Font) { - boldGlyphs++ + sb.WriteString(w.Text) + if strings.TrimSpace(w.Text) != "" { + totalWords++ + if isBoldFont(w.FontName) { + boldWords++ } } } - // Wide letter-tracking — common on filing cover pages and - // bold section headers — makes every glyph gap exceed the - // space threshold, yielding "U N I T E D S T A T E S". - // Re-join those runs into real words. - text := collapseLetterSpacing(strings.TrimSpace(sb.String())) + text := strings.TrimSpace(sb.String()) if text == "" { continue } @@ -508,7 +606,7 @@ func extractPDFRows(reader *pdflib.Reader) ([]pdfRow, error) { out = append(out, pdfRow{ page: pageNum, fontSize: b.maxFS, - bold: totalGlyphs > 0 && boldGlyphs*2 > totalGlyphs, + bold: totalWords > 0 && boldWords*2 > totalWords, text: text, }) } @@ -792,8 +890,6 @@ func looksLikeHeading(s string) bool { return true } -var multiSpaceRe = regexp.MustCompile(`\s{2,}`) - // isBoldFont reports whether a PDF font name denotes a bold weight. SEC filing // section headings are typically bold at body font size (not larger), so this is // how we recover them — a size-only heuristic misses them entirely. @@ -802,49 +898,6 @@ func isBoldFont(font string) bool { return strings.Contains(f, "bold") || strings.Contains(f, "-bd") || strings.Contains(f, ",bd") } -// looksLetterSpaced reports whether a row is dominated by solitary-character -// tokens — the signature of wide letter-tracking ("U N I T E D S T A T E S"). -func looksLetterSpaced(s string) bool { - toks := strings.Fields(s) - if len(toks) < 4 { - return false - } - single := 0 - for _, t := range toks { - if len([]rune(t)) == 1 { - single++ - } - } - return single*2 > len(toks) -} - -// collapseLetterSpacing rejoins letter-tracked text. Word boundaries survive as -// runs of 2+ spaces; within each word the single spaces between solitary glyphs -// are removed ("F O R M 1 0 - Q" → "FORM 10-Q"). Rows that aren't -// letter-spaced are returned unchanged, so normal prose is never touched. -func collapseLetterSpacing(s string) string { - if !looksLetterSpaced(s) { - return s - } - words := multiSpaceRe.Split(s, -1) - for i, w := range words { - parts := strings.Fields(w) - allSingle := len(parts) > 0 - for _, p := range parts { - if len([]rune(p)) > 1 { - allSingle = false - break - } - } - if allSingle { - words[i] = strings.Join(parts, "") - } else { - words[i] = strings.Join(parts, " ") - } - } - return strings.TrimSpace(strings.Join(words, " ")) -} - func abs(f float64) float64 { if f < 0 { return -f @@ -935,3 +988,267 @@ func decryptPDFWithEmptyPassword(in []byte) ([]byte, error) { } return out.Bytes(), nil } + +// isPdftableEncryptedErr reports whether the given pdftable error is +// the sentinel for an encrypted PDF. pdftable surfaces ErrEncrypted via +// errors.Is, which is what we use here so we stay forward-compatible if +// the wrapping ever changes. +func isPdftableEncryptedErr(err error) bool { + if err == nil { + return false + } + if errors.Is(err, pdftable.ErrEncrypted) { + return true + } + // Defensive fallback: even if the sentinel ever changes name we + // still want to retry through pdfcpu rather than fail open. + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "encrypted") || strings.Contains(msg, "encryption") +} + +// extractPDFTables runs pdftable's table-finding pipeline over every +// page of doc and returns one Section per detected table. Each +// returned section carries: +// +// - Title: "Table (page N)" for callers/UIs that want a stable label. +// - Content: a GitHub-flavoured Markdown rendering of the cells. +// - PageStart/PageEnd: the page the table was found on (always equal +// because pdftable does not yet cross-page-merge tables). +// - Metadata["table"]="true": retrieval can branch on this to apply +// numeric-content-aware ranking; the rows/cols entries surface the +// shape for debugging and per-document analytics. +// +// Errors during table extraction are LOGGED and SWALLOWED — the engine's +// commitment is that bad PDFs never break ingest. A panic inside +// pdftable (defensive guard) is also caught. +// +// Pass opts=nil or opts.Enabled=false to short-circuit; the function +// then returns nil cheaply without walking the document. +func extractPDFTables(doc pdftable.Document, opts *TableOpts) []Section { + if opts == nil || !opts.Enabled { + return nil + } + settings := pdftable.DefaultTableSettings() + if opts.VerticalStrategy != "" { + settings.VerticalStrategy = pdftable.TableStrategy(opts.VerticalStrategy) + } + if opts.HorizontalStrategy != "" { + settings.HorizontalStrategy = pdftable.TableStrategy(opts.HorizontalStrategy) + } + minRows := opts.MinTableRows + minCols := opts.MinTableCols + + var sections []Section + for n := 1; n <= doc.NumPages(); n++ { + page, err := doc.Page(n) + if err != nil { + continue + } + tables := safeExtractTables(page, settings, n) + for _, t := range tables { + if t == nil { + continue + } + rows := normaliseTableRows(t.Rows) + if len(rows) < minRows { + continue + } + cols := 0 + if len(rows) > 0 { + cols = len(rows[0]) + } + if cols < minCols { + continue + } + md := tableToMarkdown(rows) + if strings.TrimSpace(md) == "" { + continue + } + sections = append(sections, Section{ + Level: 1, + Title: fmt.Sprintf("Table (page %d)", n), + Content: md, + PageStart: n, + PageEnd: n, + Metadata: map[string]string{ + "table": "true", + "rows": fmt.Sprintf("%d", len(rows)), + "cols": fmt.Sprintf("%d", cols), + }, + }) + } + } + return sections +} + +// safeExtractTables wraps page.ExtractTables in a recover() so a bug +// deep inside pdftable can never take down the engine's ingest +// pipeline. Errors and panics are logged at warn level (not error — +// the document still gets ingested, just without its tables). +func safeExtractTables(page pdftable.Page, settings pdftable.TableSettings, pageNum int) (tables []*pdftable.Table) { + defer func() { + if r := recover(); r != nil { + slog.Warn("pdf: table extraction panicked", + "page", pageNum, + "panic", fmt.Sprintf("%v", r)) + tables = nil + } + }() + tables, err := page.ExtractTables(settings) + if err != nil { + slog.Warn("pdf: table extraction failed", + "page", pageNum, + "err", err) + return nil + } + return tables +} + +// normaliseTableRows trims whitespace per cell and pads short rows out +// to the table's max column count. pdftable can emit rows with fewer +// cells than the header when its cell detection finds a hole; we +// promote those to empty strings so Markdown rendering produces a +// well-formed grid (every row has the same column count). +func normaliseTableRows(rows [][]string) [][]string { + maxCols := 0 + for _, r := range rows { + if len(r) > maxCols { + maxCols = len(r) + } + } + if maxCols == 0 { + return nil + } + out := make([][]string, 0, len(rows)) + for _, r := range rows { + row := make([]string, maxCols) + for i := 0; i < maxCols; i++ { + if i < len(r) { + row[i] = strings.TrimSpace(r[i]) + } else { + row[i] = "" + } + } + // Drop entirely blank rows — they're cell-detection artefacts + // and contribute no information to retrieval. + if !isAllBlank(row) { + out = append(out, row) + } + } + return out +} + +// isAllBlank reports whether every cell in row is empty/whitespace. +func isAllBlank(row []string) bool { + for _, c := range row { + if strings.TrimSpace(c) != "" { + return false + } + } + return true +} + +// tableToMarkdown renders a normalised table-rows slice as a +// GitHub-flavoured Markdown table. The first row is treated as the +// header; if it is entirely blank, a row of empty header cells is +// emitted so the markdown stays well-formed. +// +// Cell content is escaped minimally: pipe characters inside a cell are +// replaced with the HTML entity so they don't terminate the cell. We +// don't escape backslashes or newlines — newlines inside a cell would +// break the GFM table syntax, so we collapse them to spaces here too. +func tableToMarkdown(rows [][]string) string { + if len(rows) == 0 || len(rows[0]) == 0 { + return "" + } + cols := len(rows[0]) + var sb strings.Builder + + emitRow := func(cells []string) { + sb.WriteByte('|') + for i := 0; i < cols; i++ { + cell := "" + if i < len(cells) { + cell = escapeMarkdownCell(cells[i]) + } + sb.WriteByte(' ') + sb.WriteString(cell) + sb.WriteByte(' ') + sb.WriteByte('|') + } + sb.WriteByte('\n') + } + + // Header row. + header := rows[0] + if isAllBlank(header) { + header = make([]string, cols) + } + emitRow(header) + + // Separator row (GFM uses --- per column). + sb.WriteByte('|') + for i := 0; i < cols; i++ { + sb.WriteString(" --- |") + } + sb.WriteByte('\n') + + // Data rows. + for _, r := range rows[1:] { + emitRow(r) + } + + return strings.TrimRight(sb.String(), "\n") +} + +// escapeMarkdownCell makes a cell safe for inclusion in a GFM table: +// pipes are entity-encoded (they would otherwise close the cell) and +// embedded newlines / tabs are collapsed to single spaces (GFM tables +// are single-line per cell). Runs of whitespace produced by the +// collapse are squashed to one space for readability. +func escapeMarkdownCell(s string) string { + if s == "" { + return "" + } + s = strings.ReplaceAll(s, "|", "|") + // Newlines and tabs become spaces; multiple spaces collapse. + repl := strings.NewReplacer("\r\n", " ", "\n", " ", "\r", " ", "\t", " ") + s = repl.Replace(s) + // Squash runs of spaces. + for strings.Contains(s, " ") { + s = strings.ReplaceAll(s, " ", " ") + } + return strings.TrimSpace(s) +} + +// attachTableSections appends every table section to doc.Sections at +// the document root, after a synthetic "Tables" parent — keeping +// retrieval able to find them but not interleaving them with the +// document outline (which would confuse callers that rely on outline +// order matching page order). +// +// We always create a single "Tables" parent so the top level of the +// outline doesn't balloon: a 10-K with 80 tables would otherwise dwarf +// the actual section list. The parent inherits the union of its +// children's page ranges. +func attachTableSections(doc *ParsedDoc, tables []Section) { + if doc == nil || len(tables) == 0 { + return + } + parent := Section{ + Level: 1, + Title: "Tables", + Children: tables, + Metadata: map[string]string{"tables_container": "true"}, + } + // Compute the parent's page span as the union of children's. + for _, t := range tables { + if t.PageStart > 0 && (parent.PageStart == 0 || t.PageStart < parent.PageStart) { + parent.PageStart = t.PageStart + } + if t.PageEnd > parent.PageEnd { + parent.PageEnd = t.PageEnd + } + } + doc.Sections = append(doc.Sections, parent) +} diff --git a/pkg/parser/pdf_tables_test.go b/pkg/parser/pdf_tables_test.go new file mode 100644 index 0000000..1dea5f1 --- /dev/null +++ b/pkg/parser/pdf_tables_test.go @@ -0,0 +1,255 @@ +package parser_test + +import ( + "bytes" + "context" + "errors" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/hallelx2/vectorless-engine/pkg/parser" +) + +// readFixture is a tiny helper that fails the test if the fixture can't +// be read. Keeps the per-test setup boilerplate-free. +func readFixture(t *testing.T, name string) []byte { + t.Helper() + path := filepath.Join("testdata", name) + b, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read fixture %q: %v", path, err) + } + return b +} + +// TestPDFParserEmitsTableSections asserts the table-extraction stage +// produces at least one Section flagged with Metadata["table"]="true" +// containing well-formed Markdown when fed the issue-466 fixture from +// pdftable (two ruled tables on page 1, with known cell contents). +// +// This is the single most important assertion of the integration: a +// regression here means numeric question answers from FinanceBench-class +// documents would collapse back into space-joined text runs. +func TestPDFParserEmitsTableSections(t *testing.T) { + b := readFixture(t, "tables-example.pdf") + p := parser.NewPDF() + doc, err := p.Parse(context.Background(), bytes.NewReader(b)) + if err != nil { + t.Fatalf("parse: %v", err) + } + + var tables []parser.Section + for _, s := range doc.Flatten() { + if s.Metadata["table"] == "true" { + tables = append(tables, s) + } + } + if len(tables) == 0 { + t.Fatalf("expected at least one table section, got 0 (sections: %d)", len(doc.Sections)) + } + + for i, ts := range tables { + if ts.PageStart != 1 || ts.PageEnd != 1 { + t.Errorf("table %d: expected pages 1-1, got %d-%d", i, ts.PageStart, ts.PageEnd) + } + if !strings.Contains(ts.Title, "page 1") { + t.Errorf("table %d: title %q should mention the page", i, ts.Title) + } + // Markdown rows must have a header + separator + at least one data row. + lines := strings.Split(ts.Content, "\n") + if len(lines) < 3 { + t.Errorf("table %d: content has too few lines (%d): %q", i, len(lines), ts.Content) + continue + } + // Separator row is always second. + if !strings.HasPrefix(lines[1], "|") || !strings.Contains(lines[1], "---") { + t.Errorf("table %d: missing GFM separator row, got %q", i, lines[1]) + } + // Each row starts and ends with a pipe. + for j, l := range lines { + if !strings.HasPrefix(l, "|") || !strings.HasSuffix(l, "|") { + t.Errorf("table %d line %d not pipe-delimited: %q", i, j, l) + } + } + // Rows / cols metadata must agree with the rendered rows + // (header is row 0 in the rendering but still counted). + rowsMeta := ts.Metadata["rows"] + colsMeta := ts.Metadata["cols"] + if rowsMeta == "" || colsMeta == "" { + t.Errorf("table %d: missing rows/cols metadata: %+v", i, ts.Metadata) + } + } + + // At least one of the tables in this fixture has the known cell text + // "T0-C0" (header) and "T0-22-last" (last data row). If pdftable + // reshuffled the columns we'd still see these as substrings somewhere. + joined := "" + for _, ts := range tables { + joined += ts.Content + } + if !strings.Contains(joined, "T0-C0") { + t.Errorf("expected 'T0-C0' (header cell) somewhere in table content, missing") + } + if !strings.Contains(joined, "T0-22-last") { + t.Errorf("expected 'T0-22-last' (last data row) somewhere in table content, missing") + } +} + +// TestPDFParserTablesContainerHidesUnderParent verifies that the engine +// wraps table sections under a synthetic "Tables" container at the +// document root rather than inlining them into the outline. This keeps +// the outline order matching page order for the prose sections — which +// downstream callers rely on for citation rendering. +func TestPDFParserTablesContainerHidesUnderParent(t *testing.T) { + b := readFixture(t, "tables-example.pdf") + p := parser.NewPDF() + doc, err := p.Parse(context.Background(), bytes.NewReader(b)) + if err != nil { + t.Fatalf("parse: %v", err) + } + + var container *parser.Section + for i := range doc.Sections { + if doc.Sections[i].Title == "Tables" && doc.Sections[i].Metadata["tables_container"] == "true" { + container = &doc.Sections[i] + break + } + } + if container == nil { + t.Fatal(`missing synthetic "Tables" container at the document root`) + } + if len(container.Children) == 0 { + t.Fatalf("Tables container has no children") + } + for _, ch := range container.Children { + if ch.Metadata["table"] != "true" { + t.Errorf("Tables container has non-table child %q (metadata=%+v)", ch.Title, ch.Metadata) + } + } +} + +// TestPDFParserDisabledTables ensures the kill-switch works: when the +// parser is constructed with nil TableOpts (or Enabled=false) no table +// sections are emitted and the rest of the document still ingests cleanly. +// This is the rollback path if a real-world PDF ever surfaces a regression. +func TestPDFParserDisabledTables(t *testing.T) { + b := readFixture(t, "tables-example.pdf") + p := parser.NewPDFWithTables(nil) + doc, err := p.Parse(context.Background(), bytes.NewReader(b)) + if err != nil { + t.Fatalf("parse: %v", err) + } + for _, s := range doc.Flatten() { + if s.Metadata["table"] == "true" { + t.Errorf("expected no table sections when tables disabled, got %q", s.Title) + } + if s.Title == "Tables" && s.Metadata["tables_container"] == "true" { + t.Errorf("expected no Tables container when tables disabled") + } + } +} + +// TestPDFParserCorruptInputReturnsCleanError exercises the resilience +// guarantee: a malformed PDF (header bytes mutated) does NOT panic and +// returns a descriptive error rather than collapsing the engine. +func TestPDFParserCorruptInputReturnsCleanError(t *testing.T) { + // Mutating the magic header is enough to make every PDF library + // reject it. The error path we want to validate is "OpenBytes + // returns; we wrap with 'pdf: open:' and propagate". + corrupt := []byte("%PDFFOOBAR-1.4\n%garbage\nendoffile") + p := parser.NewPDF() + _, err := p.Parse(context.Background(), bytes.NewReader(corrupt)) + if err == nil { + t.Fatal("expected error for corrupt PDF, got nil") + } + if !strings.HasPrefix(err.Error(), "pdf: open:") { + t.Errorf("expected 'pdf: open:' prefix, got %q", err.Error()) + } +} + +// TestPDFParser10KSmokeOptional runs the parser over a real 10-K when +// VLE_TEST_FILING_PDF points at one. It's a discovery aid for benchmark +// validation, not a regression gate, so we skip cleanly when the env +// var is unset (the default CI path). The point of this test is to +// confirm pdftable-driven extraction finds real balance-sheet tables in +// real financial filings before benchmark numbers come in. +func TestPDFParser10KSmokeOptional(t *testing.T) { + path := os.Getenv("VLE_TEST_FILING_PDF") + if path == "" { + t.Skip("set VLE_TEST_FILING_PDF= to run") + } + b, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + p := parser.NewPDF() + doc, err := p.Parse(context.Background(), bytes.NewReader(b)) + if err != nil { + t.Fatalf("parse: %v", err) + } + tables := 0 + pages := map[int]struct{}{} + for _, s := range doc.Flatten() { + if s.Metadata["table"] == "true" { + tables++ + pages[s.PageStart] = struct{}{} + } + } + t.Logf("10-K smoke: %d table sections across %d distinct pages", tables, len(pages)) + if tables == 0 { + t.Errorf("expected at least one table section in a 10-K, got 0") + } +} + +// TestPDFParserResilienceToTableExtractionPanic is a smoke test that the +// safeExtractTables wrapper never propagates a panic from inside +// pdftable. We can't easily synthesise a panicking PDF, but we can run +// table extraction against the corrupted-but-still-PDF-shaped fixture +// to confirm the safety net is wired (any panic would also fail the +// previous corrupt-input test). +func TestPDFParserResilienceToTableExtractionPanic(t *testing.T) { + // A valid PDF with no extractable tables should produce zero table + // sections and zero errors — the resilience contract is "tables on + // or off, ingest never breaks". + b := readFixture(t, "tables-example.pdf") + p := parser.NewPDF() + doc, err := p.Parse(context.Background(), bytes.NewReader(b)) + if err != nil { + t.Fatalf("parse: %v", err) + } + if doc == nil { + t.Fatal("doc is nil") + } + // Verify there's at least one non-table section: ingest must produce + // SOMETHING usable even on a tables-only fixture. + hasNonTable := false + for _, s := range doc.Flatten() { + if s.Metadata["table"] != "true" && strings.TrimSpace(s.Content) != "" { + hasNonTable = true + break + } + } + if !hasNonTable { + // On this specific fixture the document is essentially "tables + // only" — every word lives inside a table cell. The outline + // might therefore contain no non-table prose, which is fine. + // What we MUST have, though, is a non-empty Sections slice + // (the Tables container at minimum). + if len(doc.Sections) == 0 { + t.Fatal("doc has no sections at all") + } + } + + // Trivially exercise the corruption path too — make sure we never + // panic regardless of the input shape. We use errors.Is to catch + // the case where a future change adds a sentinel. + _, perr := p.Parse(context.Background(), bytes.NewReader([]byte{0x25, 0x50, 0x44, 0x46, 0x2d, 0x31, 0x2e, 0x37, 0x0a})) // bare "%PDF-1.7\n" with no body + if perr == nil { + t.Fatal("expected error for bare-header PDF, got nil") + } + // We don't pin the error type — pdftable evolves the wrapping — but + // it must be a real error, not nil. + _ = errors.Is(perr, errors.New("placeholder")) // sanity that errors.Is doesn't barf +} diff --git a/pkg/parser/testdata/tables-example.pdf b/pkg/parser/testdata/tables-example.pdf new file mode 100644 index 0000000..7a4e084 Binary files /dev/null and b/pkg/parser/testdata/tables-example.pdf differ