From 1ae2b095017a7528f0dd155aee3329bd5285078f Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:07:38 +0100 Subject: [PATCH 1/3] feat: text + explicit table strategies (pdfplumber parity) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the two remaining pdfplumber table-finding strategies: - text: infer column boundaries by clustering words on X0 / X1 / centre; infer row boundaries by clustering on top-Y. Direct port of pdfplumber's words_to_edges_v / words_to_edges_h with the same MinWordsVertical (3) / MinWordsHorizontal (1) defaults. - explicit: caller-supplied edges via TableSettings.Explicit*Lines. At least two coordinates required per axis (matches pdfplumber's validation); non-finite values dropped with a log warning. Each axis selects its strategy independently, so mixed-strategy settings (e.g. vertical=text + horizontal=lines) work out of the box. - New layout.SourceText enum tagging text-derived edges. - Page.findTableEdges refactored to dispatch per-axis on strategy instead of starting from a single primitive-edge slice. - ensureSupportedStrategies now only rejects unknown strategy strings. - New table_test.go cases: unit tests on hand-crafted Words slices; borderless / explicit / mixed extraction end-to-end on the new testdata.TableBorderless() fixture. - pdfplumber parity test for the borderless fixture (TestGoldenTablesTextStrategyAgainstPdfplumber) — matches cell-for-cell against pdfplumber's find_tables({text, text}). - scripts/capture_pdfplumber_text_golden.py captures the text-strategy expectation for any fixture with a sibling .tables-text.target marker. 
--- finder.go | 32 +- finder_text.go | 345 +++++++++++++++ golden_test.go | 54 ++- internal/layout/lines.go | 3 + page.go | 193 ++++---- scripts/capture_pdfplumber_text_golden.py | 84 ++++ scripts/gen_table_fixture.go | 4 + table.go | 50 ++- table_test.go | 414 ++++++++++++++++-- testdata/fixtures.go | 60 +++ testdata/golden/table-3x4-borderless.pdf | Bin 0 -> 950 bytes ...e-3x4-borderless.tables-text.expected.json | 49 +++ .../table-3x4-borderless.tables-text.target | 0 13 files changed, 1146 insertions(+), 142 deletions(-) create mode 100644 finder_text.go create mode 100644 scripts/capture_pdfplumber_text_golden.py create mode 100644 testdata/golden/table-3x4-borderless.pdf create mode 100644 testdata/golden/table-3x4-borderless.tables-text.expected.json create mode 100644 testdata/golden/table-3x4-borderless.tables-text.target diff --git a/finder.go b/finder.go index 9d1a75e..ff8c2e0 100644 --- a/finder.go +++ b/finder.go @@ -618,10 +618,10 @@ func runTableFinder(edges []layout.Edge, xTol, yTol float64) TableFinder { } } -// ensureSupportedStrategies returns an error if either strategy is -// "text" or "explicit" — those are deferred to Phase 1.3.D (v0.3.0). -// Returning a clear ErrUnsupported keeps callers from silently getting -// empty results when they ask for a strategy we don't implement yet. +// ensureSupportedStrategies validates that both axes' strategies are +// one of the four pdfplumber-defined values. As of v0.3.0 all four +// strategies (lines, lines_strict, text, explicit) are implemented; +// the function now exists only to reject unknown strategy strings. 
func ensureSupportedStrategies(s TableSettings) error { for _, pair := range []struct { axis string @@ -631,15 +631,29 @@ func ensureSupportedStrategies(s TableSettings) error { {"horizontal", s.HorizontalStrategy}, } { switch pair.strategy { - case StrategyLines, StrategyLinesStrict: + case StrategyLines, StrategyLinesStrict, StrategyText, StrategyExplicit: // ok - case StrategyText: - return fmt.Errorf("%w: %s_strategy=%q (Phase 1.3.D)", ErrUnsupported, pair.axis, pair.strategy) - case StrategyExplicit: - return fmt.Errorf("%w: %s_strategy=%q (Phase 1.3.D)", ErrUnsupported, pair.axis, pair.strategy) default: return fmt.Errorf("%w: unknown %s_strategy %q", ErrUnsupported, pair.axis, pair.strategy) } } return nil } + +// errExplicitNeedsTwo is the error returned when the caller selects +// the "explicit" strategy on an axis but supplies fewer than two +// coordinates. pdfplumber raises ValueError with the same message. +func errExplicitNeedsTwo(axis string) error { + return fmt.Errorf("pdftable: %s_strategy=%q requires at least two coordinates in Explicit%sLines", + axis, StrategyExplicit, axisFieldName(axis)) +} + +// axisFieldName returns the field-name suffix for the axis ("Vertical" +// or "Horizontal") so error messages reference the actual struct field +// the caller would need to populate. +func axisFieldName(axis string) string { + if axis == "vertical" { + return "Vertical" + } + return "Horizontal" +} diff --git a/finder_text.go b/finder_text.go new file mode 100644 index 0000000..da1548d --- /dev/null +++ b/finder_text.go @@ -0,0 +1,345 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable + +// finder_text.go implements the "text" and "explicit" edge-derivation +// strategies that complement the "lines" / "lines_strict" strategies in +// finder.go. +// +// The "text" strategy is a direct port of pdfplumber's +// words_to_edges_v / words_to_edges_h (pdfplumber/table.py lines +// 101-204). 
The pipeline: +// +// - vertical: cluster words by X0 (left), X1 (right), and center +// position with tolerance=1 PDF point; pick clusters with at least +// MinWordsVertical members; deduplicate overlapping clusters by +// bbox-overlap; emit one vertical edge per cluster's X0, plus one +// trailing edge at the right-most X1. +// - horizontal: cluster words by their top Y (Y1 in user space) with +// tolerance=1; pick clusters with at least MinWordsHorizontal +// members; emit BOTH the top and bottom edges of each cluster so +// the last row of the table is captured. +// +// The "explicit" strategy is a direct port of pdfplumber's handling of +// explicit_vertical_lines / explicit_horizontal_lines as standalone +// edge lists (table.py lines 623-695). The caller passes float +// coordinates; we promote each to a full-extent edge spanning the +// page's bbox. +// +// Coordinate system: pdfplumber works in image space (Y growing down, +// "top" = smaller Y). pdftable uses PDF user space (Y growing up). Two +// translations matter: +// +// - pdfplumber's "top" is our Y1 (the visually-upper edge of a word's +// bbox in user space). +// - pdfplumber's "bottom" is our Y0. +// +// The clustering and threshold logic is the same in both coordinate +// systems; only the field names flip. + +import ( + "log" + "math" + "sort" + + "github.com/hallelx2/pdftable/internal/layout" +) + +// wordEdgeTolerance is the per-axis tolerance used when clustering +// words for the "text" strategy. pdfplumber hardcodes this to 1 PDF +// point in words_to_edges_v / words_to_edges_h (the third argument to +// cluster_objects); we mirror that. +const wordEdgeTolerance = 1.0 + +// wordsToEdgesV is the Go port of pdfplumber's words_to_edges_v +// (table.py lines 144-204). 
Given a slice of words and a +// MinWordsVertical threshold, infer vertical edges by clustering the +// words' X0 (left), X1 (right), and centre positions, picking +// clusters whose membership is >= threshold, dropping overlapping +// clusters (the larger one wins), and emitting one edge at each +// cluster's leftmost X plus a trailing edge at the rightmost X1. +// +// pdfplumber sorts the clusters by descending size before the overlap +// dedupe so that, when two candidate column boundaries overlap, the +// bigger one (the one supported by more words) wins. +func wordsToEdgesV(words []Word, wordThreshold int) []layout.Edge { + if len(words) == 0 || wordThreshold <= 0 { + return nil + } + + // Three candidate cluster sources: words sharing left edge (X0), + // right edge (X1), centre. Each contributes one cluster set; we + // concatenate all three, drop those below threshold, then dedupe + // by overlap (larger cluster wins). + byX0 := clusterObjects(words, func(w Word) float64 { return w.X0 }, wordEdgeTolerance, false) + byX1 := clusterObjects(words, func(w Word) float64 { return w.X1 }, wordEdgeTolerance, false) + byCenter := clusterObjects(words, func(w Word) float64 { return (w.X0 + w.X1) / 2 }, wordEdgeTolerance, false) + + all := make([][]Word, 0, len(byX0)+len(byX1)+len(byCenter)) + all = append(all, byX0...) + all = append(all, byX1...) + all = append(all, byCenter...) + + // Sort by descending size so the dedupe pass keeps the largest + // cluster when two overlap. + sort.SliceStable(all, func(i, j int) bool { + return len(all[i]) > len(all[j]) + }) + + // Filter below-threshold clusters. + large := make([][]Word, 0, len(all)) + for _, c := range all { + if len(c) >= wordThreshold { + large = append(large, c) + } + } + if len(large) == 0 { + return nil + } + + // Convert each surviving cluster to its bbox. 
+ bboxes := make([]BBox, len(large)) + for i, c := range large { + bboxes[i] = wordsBBox(c) + } + + // Dedupe: keep clusters whose bbox doesn't overlap any already- + // kept one. Pdfplumber uses get_bbox_overlap which is non-empty + // only when there's positive overlap area — touching boxes don't + // count. + condensed := make([]BBox, 0, len(bboxes)) + for _, b := range bboxes { + overlap := false + for _, c := range condensed { + if _, ok := b.Intersect(c); ok { + overlap = true + break + } + } + if !overlap { + condensed = append(condensed, b) + } + } + if len(condensed) == 0 { + return nil + } + + // Sort the surviving clusters left-to-right by X0. + sort.SliceStable(condensed, func(i, j int) bool { + return condensed[i].X0 < condensed[j].X0 + }) + + // pdfplumber emits, for each cluster, an edge at its LEFT (X0); + // then one final trailing edge at the right edge of the right-most + // cluster (max X1). The Y span on each edge is the union of all + // participating clusters' Y range so that downstream merging / + // intersection sees consistent spans. + minY0 := math.Inf(1) + maxY1 := math.Inf(-1) + maxX1 := math.Inf(-1) + for _, b := range condensed { + if b.Y0 < minY0 { + minY0 = b.Y0 + } + if b.Y1 > maxY1 { + maxY1 = b.Y1 + } + if b.X1 > maxX1 { + maxX1 = b.X1 + } + } + + out := make([]layout.Edge, 0, len(condensed)+1) + for _, b := range condensed { + out = append(out, layout.Edge{ + X0: b.X0, X1: b.X0, + Y0: minY0, Y1: maxY1, + Orientation: layout.Vertical, + Source: layout.SourceText, + }) + } + out = append(out, layout.Edge{ + X0: maxX1, X1: maxX1, + Y0: minY0, Y1: maxY1, + Orientation: layout.Vertical, + Source: layout.SourceText, + }) + return out +} + +// wordsToEdgesH is the Go port of pdfplumber's words_to_edges_h +// (table.py lines 101-141). 
Given a slice of words and a +// MinWordsHorizontal threshold, infer horizontal edges by clustering +// the words by their visual TOP (Y1 in PDF user space, "top" in +// pdfplumber's image-space dict) and emitting one edge per cluster at +// its Y1, plus a second edge at the cluster's Y0 so the bottom of the +// last row gets captured. +// +// pdfplumber notes (table.py lines 128-130): "For each detected row, +// we also add the 'bottom' line. This will generate extra edges, +// (some will be redundant with the next row 'top' line), but this +// catches the last row of every table." +func wordsToEdgesH(words []Word, wordThreshold int) []layout.Edge { + if len(words) == 0 || wordThreshold <= 0 { + return nil + } + + // Cluster by visual top (Y1 in user space). pdfplumber uses + // "top" — which is image-space and so corresponds to user-space + // Y1 (the upper edge of the bbox). + byTop := clusterObjects(words, func(w Word) float64 { return w.Y1 }, wordEdgeTolerance, false) + + large := make([][]Word, 0, len(byTop)) + for _, c := range byTop { + if len(c) >= wordThreshold { + large = append(large, c) + } + } + if len(large) == 0 { + return nil + } + + // Per-cluster bbox: bottom = min(Y0), top = max(Y1) — i.e. the + // union of every word's vertical extent in the cluster. + bboxes := make([]BBox, len(large)) + for i, c := range large { + bboxes[i] = wordsBBox(c) + } + + // Span the resulting edges from min(X0) to max(X1) across ALL + // clusters — pdfplumber emits horizontal edges as page-spanning + // rules across every detected row. + minX0 := math.Inf(1) + maxX1 := math.Inf(-1) + for _, b := range bboxes { + if b.X0 < minX0 { + minX0 = b.X0 + } + if b.X1 > maxX1 { + maxX1 = b.X1 + } + } + + out := make([]layout.Edge, 0, 2*len(bboxes)) + for _, b := range bboxes { + // Top edge (Y1 in user space, "top" in image space). 
+ out = append(out, layout.Edge{ + X0: minX0, X1: maxX1, + Y0: b.Y1, Y1: b.Y1, + Orientation: layout.Horizontal, + Source: layout.SourceText, + }) + // Bottom edge (Y0 in user space, "bottom" in image space). + out = append(out, layout.Edge{ + X0: minX0, X1: maxX1, + Y0: b.Y0, Y1: b.Y0, + Orientation: layout.Horizontal, + Source: layout.SourceText, + }) + } + return out +} + +// wordsBBox returns the union bbox of every word in cs. Used by the +// text-strategy to derive a cluster's spatial extent. +func wordsBBox(cs []Word) BBox { + if len(cs) == 0 { + return BBox{} + } + out := BBox{X0: cs[0].X0, Y0: cs[0].Y0, X1: cs[0].X1, Y1: cs[0].Y1} + for _, w := range cs[1:] { + if w.X0 < out.X0 { + out.X0 = w.X0 + } + if w.Y0 < out.Y0 { + out.Y0 = w.Y0 + } + if w.X1 > out.X1 { + out.X1 = w.X1 + } + if w.Y1 > out.Y1 { + out.Y1 = w.Y1 + } + } + return out +} + +// explicitVerticalEdges promotes a slice of float X coordinates into +// vertical edges that span the page's Y range. Each non-finite or +// zero-length entry is dropped with a warning so a misconfigured +// caller gets visible feedback rather than silent empty results. +// +// pageY0 / pageY1 are the page's vertical bounds (typically 0 and +// page.Height()) — pdfplumber's `page.bbox[1]` and `page.bbox[3]`. 
+func explicitVerticalEdges(xs []float64, pageY0, pageY1 float64) []layout.Edge { + if len(xs) == 0 { + return nil + } + out := make([]layout.Edge, 0, len(xs)) + for _, x := range xs { + if math.IsNaN(x) || math.IsInf(x, 0) { + log.Printf("pdftable: explicit vertical line %v ignored (non-finite)", x) + continue + } + if pageY1 <= pageY0 { + log.Printf("pdftable: explicit vertical line %v ignored (page height is zero)", x) + continue + } + out = append(out, layout.Edge{ + X0: x, X1: x, + Y0: pageY0, Y1: pageY1, + Orientation: layout.Vertical, + Source: layout.SourceExplicit, + }) + } + return out +} + +// explicitHorizontalEdges promotes a slice of float Y coordinates into +// horizontal edges that span the page's X range. Same invalid-input +// behaviour as explicitVerticalEdges. +// +// pageX0 / pageX1 are the page's horizontal bounds. +func explicitHorizontalEdges(ys []float64, pageX0, pageX1 float64) []layout.Edge { + if len(ys) == 0 { + return nil + } + out := make([]layout.Edge, 0, len(ys)) + for _, y := range ys { + if math.IsNaN(y) || math.IsInf(y, 0) { + log.Printf("pdftable: explicit horizontal line %v ignored (non-finite)", y) + continue + } + if pageX1 <= pageX0 { + log.Printf("pdftable: explicit horizontal line %v ignored (page width is zero)", y) + continue + } + out = append(out, layout.Edge{ + X0: pageX0, X1: pageX1, + Y0: y, Y1: y, + Orientation: layout.Horizontal, + Source: layout.SourceExplicit, + }) + } + return out +} + +// validateExplicitForStrategy reports an error if either axis uses +// the "explicit" strategy but the caller supplied fewer than two +// coordinates on that axis. pdfplumber raises ValueError in this +// case (table.py lines 605-615). The check is a no-op for the +// other three strategies. +// +// Coordinates that survive the validation aren't checked for finite- +// ness here — that happens in explicit*Edges above, where invalid +// entries are logged and skipped individually.
+func validateExplicitForStrategy(s TableSettings) error { + if s.VerticalStrategy == StrategyExplicit && len(s.ExplicitVerticalLines) < 2 { + return errExplicitNeedsTwo("vertical") + } + if s.HorizontalStrategy == StrategyExplicit && len(s.ExplicitHorizontalLines) < 2 { + return errExplicitNeedsTwo("horizontal") + } + return nil +} diff --git a/golden_test.go b/golden_test.go index d6e9dcc..bcf658f 100644 --- a/golden_test.go +++ b/golden_test.go @@ -72,10 +72,10 @@ func TestGoldenAgainstPdfplumber(t *testing.T) { t.Fatalf("read golden dir: %v", err) } - // Find every .expected.json (but NOT .tables.expected.json — the - // tables golden files have a different schema and are exercised - // by TestGoldenTablesAgainstPdfplumber below) and run a sub-test - // for each. + // Find every .expected.json (but NOT one of the table goldens — + // those have a different schema and are exercised by the + // strategy-specific TestGoldenTables* tests below) and run a + // sub-test for each. for _, e := range entries { if e.IsDir() { continue @@ -87,6 +87,9 @@ func TestGoldenAgainstPdfplumber(t *testing.T) { if strings.HasSuffix(name, ".tables.expected.json") { continue } + if strings.HasSuffix(name, ".tables-text.expected.json") { + continue + } stem := strings.TrimSuffix(name, ".expected.json") t.Run(stem, func(t *testing.T) { runGoldenCase(t, dir, stem) @@ -144,15 +147,50 @@ func TestGoldenTablesAgainstPdfplumber(t *testing.T) { } stem := strings.TrimSuffix(name, ".tables.expected.json") t.Run(stem, func(t *testing.T) { - runGoldenTablesCase(t, dir, stem) + runGoldenTablesCase(t, dir, stem, pdftable.DefaultTableSettings()) + }) + } +} + +// TestGoldenTablesTextStrategyAgainstPdfplumber asserts pdftable's +// "text" strategy output matches pdfplumber's +// find_tables({"text", "text"}) on every .tables-text.expected.json +// fixture in testdata/golden. 
The strategy-specific suffix lets us +// pin the parity expectation per fixture independently of the +// default-lines test above. +func TestGoldenTablesTextStrategyAgainstPdfplumber(t *testing.T) { + dir := filepath.Join("testdata", "golden") + entries, err := os.ReadDir(dir) + if err != nil { + t.Fatalf("read golden dir: %v", err) + } + settings := pdftable.DefaultTableSettings() + settings.VerticalStrategy = pdftable.StrategyText + settings.HorizontalStrategy = pdftable.StrategyText + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasSuffix(name, ".tables-text.expected.json") { + continue + } + stem := strings.TrimSuffix(name, ".tables-text.expected.json") + t.Run(stem, func(t *testing.T) { + runGoldenTablesCaseSuffix(t, dir, stem, ".tables-text.expected.json", settings) }) } } -func runGoldenTablesCase(t *testing.T, dir, stem string) { +func runGoldenTablesCase(t *testing.T, dir, stem string, settings pdftable.TableSettings) { + t.Helper() + runGoldenTablesCaseSuffix(t, dir, stem, ".tables.expected.json", settings) +} + +func runGoldenTablesCaseSuffix(t *testing.T, dir, stem, suffix string, settings pdftable.TableSettings) { t.Helper() pdfPath := filepath.Join(dir, stem+".pdf") - jsonPath := filepath.Join(dir, stem+".tables.expected.json") + jsonPath := filepath.Join(dir, stem+suffix) data, err := os.ReadFile(jsonPath) if err != nil { @@ -174,7 +212,7 @@ func runGoldenTablesCase(t *testing.T, dir, stem string) { if err != nil { t.Fatalf("Page(%d): %v", expPage.Number, err) } - gotTables, err := p.ExtractTables(pdftable.DefaultTableSettings()) + gotTables, err := p.ExtractTables(settings) if err != nil { t.Fatalf("ExtractTables: %v", err) } diff --git a/internal/layout/lines.go b/internal/layout/lines.go index 3a94e85..a688de5 100644 --- a/internal/layout/lines.go +++ b/internal/layout/lines.go @@ -56,6 +56,9 @@ const ( // SourceExplicit: an edge constructed from an // ExplicitVerticalLines / ExplicitHorizontalLines 
setting. SourceExplicit + // SourceText: an edge inferred from word alignment by the "text" + // strategy. words_to_edges_v / words_to_edges_h in pdfplumber. + SourceText ) // Edge is one axis-aligned line segment carrying the data the table- diff --git a/page.go b/page.go index 48e0225..5b04c97 100644 --- a/page.go +++ b/page.go @@ -102,10 +102,11 @@ type Page interface { // the intermediate stages (edges / intersections / raw cells) // alongside the assembled per-table CellsGrid. // - // v0.2.0 supports VerticalStrategy / HorizontalStrategy values - // of "lines" and "lines_strict". Passing "text" or "explicit" - // returns ErrUnsupported — those strategies land in Phase 1.3.D - // (v0.3.0). + // v0.3.0 supports all four pdfplumber strategies: "lines", + // "lines_strict", "text", and "explicit". Each axis (vertical, + // horizontal) selects its strategy independently, so mixed + // settings like vertical="text" + horizontal="lines" work as + // expected. FindTables(settings TableSettings) ([]TableFinder, error) // ExtractTables wraps FindTables and runs per-cell text @@ -390,11 +391,13 @@ func charsJoinedText(chars []Char) string { // // 1. Walk the page once via Objects(), so we pay the content-stream // parse cost a single time rather than once per primitive type. -// 2. Convert every Line / Rect / Curve into one or more layout.Edge -// instances. Lines produce 0 or 1 edge; Rects produce 4; Curves -// produce one per axis-aligned segment. -// 3. For lines_strict, drop SourceRect and SourceCurve edges before -// the prefilter. +// 2. Compute per-axis base edges according to the requested strategy: +// - "lines" → Lines + Rects + Curves (axis-aligned) +// - "lines_strict" → Lines only +// - "text" → words clustered by x0/x1/centre (v) or top (h) +// - "explicit" → empty (caller supplies the edges) +// 3. Append the caller's ExplicitVerticalLines / ExplicitHorizontalLines +// on top of whichever base set was chosen. // 4. 
Apply the prefilter (drop edges shorter than // EdgeMinLengthPrefilter — pdfplumber default 1 pt). // 5. Merge (snap onto cluster means, then join collinear edges @@ -402,77 +405,69 @@ func charsJoinedText(chars []Char) string { // 6. Apply the post-merge length filter (drop edges shorter than // EdgeMinLength — pdfplumber default 3 pt). // -// The returned slice is the input both the vertical and horizontal -// stages share — pdfplumber's TableFinder.get_edges takes the union -// of vertical-strategy edges and horizontal-strategy edges and runs -// the merge across both at once. We do the same, but with one -// wrinkle: if the two strategies differ ("lines" + "lines_strict"), -// we apply the source-filter PER ORIENTATION so a strict horizontal -// strategy can still benefit from rect-derived vertical edges and -// vice versa. The pdfplumber implementation handles this implicitly -// because its filter_edges receives the requested orientation as an -// argument; our code mirrors that branch explicitly. +// Each axis uses its own strategy, so "text" vertical + "lines" +// horizontal (or any of the 16 combinations) works exactly as in +// pdfplumber. func (p *page) findTableEdges(s TableSettings) ([]layout.Edge, error) { - objs, err := p.Objects() - if err != nil { - return nil, err + // Resolve text strategy's input once (and only when needed) — both + // axes can ask for it, and Words is an expensive call. 
+ var words []Word + needWords := s.VerticalStrategy == StrategyText || s.HorizontalStrategy == StrategyText + if needWords { + opts := DefaultWordOpts() + opts.XTolerance = s.TextTolerance + opts.YTolerance = s.TextTolerance + opts.KeepBlankChars = s.KeepBlankChars + w, err := p.Words(opts) + if err != nil { + return nil, err + } + words = w } - tol := 0.1 // near-axis-aligned slack for FromLine/FromCurve - rawEdges := make([]layout.Edge, 0, len(objs.Lines)+4*len(objs.Rects)+2*len(objs.Curves)) - - for _, l := range objs.Lines { - if e, ok := layout.FromLine(layout.LineSegment{ - X0: l.X0, Y0: l.Y0, X1: l.X1, Y1: l.Y1, Width: l.Width, - }, tol); ok { - rawEdges = append(rawEdges, e) + // Resolve drawn-primitive edges once if either axis needs them + // (lines or lines_strict). + var lineLikeEdges []layout.Edge + needPrimitives := isLineLike(s.VerticalStrategy) || isLineLike(s.HorizontalStrategy) + if needPrimitives { + objs, err := p.Objects() + if err != nil { + return nil, err + } + tol := 0.1 // near-axis-aligned slack for FromLine/FromCurve + lineLikeEdges = make([]layout.Edge, 0, len(objs.Lines)+4*len(objs.Rects)+2*len(objs.Curves)) + for _, l := range objs.Lines { + if e, ok := layout.FromLine(layout.LineSegment{ + X0: l.X0, Y0: l.Y0, X1: l.X1, Y1: l.Y1, Width: l.Width, + }, tol); ok { + lineLikeEdges = append(lineLikeEdges, e) + } + } + for _, r := range objs.Rects { + lineLikeEdges = append(lineLikeEdges, layout.FromRect(layout.RectSegment{ + X0: r.X0, Y0: r.Y0, X1: r.X1, Y1: r.Y1, Width: r.Width, + })...) + } + for _, c := range objs.Curves { + lineLikeEdges = append(lineLikeEdges, layout.FromCurve(layout.CurveSegment{ + Points: c.Points, Width: c.Width, + }, tol)...) } } - for _, r := range objs.Rects { - rawEdges = append(rawEdges, layout.FromRect(layout.RectSegment{ - X0: r.X0, Y0: r.Y0, X1: r.X1, Y1: r.Y1, Width: r.Width, - })...) 
- } - for _, c := range objs.Curves { - rawEdges = append(rawEdges, layout.FromCurve(layout.CurveSegment{ - Points: c.Points, Width: c.Width, - }, tol)...) - } - - // Per-orientation source filter: lines_strict on an axis drops - // non-line sources on that axis. We split into v/h, filter each - // according to its own strategy, then recombine before the - // length filter and merge. - vEdges := layout.FilterEdgesByOrientation(rawEdges, layout.Vertical) - hEdges := layout.FilterEdgesByOrientation(rawEdges, layout.Horizontal) - - if s.VerticalStrategy == StrategyLinesStrict { - vEdges = layout.FilterEdgesBySource(vEdges, layout.SourceLine, layout.SourceExplicit) - } - if s.HorizontalStrategy == StrategyLinesStrict { - hEdges = layout.FilterEdgesBySource(hEdges, layout.SourceLine, layout.SourceExplicit) - } - - // Explicit overrides are added on top of the derived edges. - // pdfplumber accepts these even with the lines / lines_strict - // strategies (the "explicit" strategy itself replaces the - // derived edges; that strategy is deferred to v0.3.0). - for _, x := range s.ExplicitVerticalLines { - vEdges = append(vEdges, layout.Edge{ - X0: x, X1: x, - Y0: 0, Y1: p.Height(), - Orientation: layout.Vertical, - Source: layout.SourceExplicit, - }) - } - for _, y := range s.ExplicitHorizontalLines { - hEdges = append(hEdges, layout.Edge{ - X0: 0, X1: p.Width(), - Y0: y, Y1: y, - Orientation: layout.Horizontal, - Source: layout.SourceExplicit, - }) - } + + pageWidth := p.Width() + pageHeight := p.Height() + + // Per-axis base edge derivation. + vEdges := p.baseEdges(s.VerticalStrategy, layout.Vertical, lineLikeEdges, words, s) + hEdges := p.baseEdges(s.HorizontalStrategy, layout.Horizontal, lineLikeEdges, words, s) + + // Explicit overrides are added on top of whichever base set was + // chosen. 
With StrategyExplicit the base set is empty so the + // explicit edges are the only source; with the other strategies + // they're additive (helpful when a column boundary isn't drawn). + vEdges = append(vEdges, explicitVerticalEdges(s.ExplicitVerticalLines, 0, pageHeight)...) + hEdges = append(hEdges, explicitHorizontalEdges(s.ExplicitHorizontalLines, 0, pageWidth)...) combined := make([]layout.Edge, 0, len(vEdges)+len(hEdges)) combined = append(combined, vEdges...) @@ -492,9 +487,49 @@ func (p *page) findTableEdges(s TableSettings) ([]layout.Edge, error) { return merged, nil } +// isLineLike reports whether the strategy derives its edges from +// drawn primitives (Lines / Rects / Curves), i.e. whether +// findTableEdges needs to call Objects(). Text and explicit +// strategies don't. +func isLineLike(s TableStrategy) bool { + return s == StrategyLines || s == StrategyLinesStrict +} + +// baseEdges returns the per-axis edges produced by the named strategy. +// lineLikeEdges is the unfiltered slice of edges derived from the +// page's drawn primitives (Lines + Rects + Curves); it's consulted +// only when the strategy is "lines" or "lines_strict". words is the +// page's extracted text runs; consulted only for the "text" strategy. +// +// "explicit" returns nil here — the caller's explicit slice is +// concatenated separately in findTableEdges. 
+func (p *page) baseEdges(strategy TableStrategy, orientation layout.Orientation, lineLikeEdges []layout.Edge, words []Word, s TableSettings) []layout.Edge { + switch strategy { + case StrategyLines: + out := layout.FilterEdgesByOrientation(lineLikeEdges, orientation) + return out + case StrategyLinesStrict: + out := layout.FilterEdgesByOrientation(lineLikeEdges, orientation) + return layout.FilterEdgesBySource(out, layout.SourceLine) + case StrategyText: + if orientation == layout.Vertical { + return wordsToEdgesV(words, s.MinWordsVertical) + } + return wordsToEdgesH(words, s.MinWordsHorizontal) + case StrategyExplicit: + // Caller-supplied edges are concatenated elsewhere; the base + // set for the "explicit" strategy is empty. + return nil + default: + return nil + } +} + // FindTables runs the geometry-only pipeline (edges → intersections // → cells → tables) and returns one TableFinder per detected table -// group. Strategies "text" and "explicit" return ErrUnsupported. +// group. All four pdfplumber strategies (lines, lines_strict, text, +// explicit) are supported; the two axes can use different strategies +// independently. // // The returned slice is in visual top-to-bottom-left-to-right order // (sorted by the topmost-leftmost cell of each table). 
@@ -503,6 +538,9 @@ func (p *page) FindTables(settings TableSettings) ([]TableFinder, error) { if err := ensureSupportedStrategies(s); err != nil { return nil, err } + if err := validateExplicitForStrategy(s); err != nil { + return nil, err + } edges, err := p.findTableEdges(s) if err != nil { @@ -548,6 +586,9 @@ func (p *page) ExtractTables(settings TableSettings) ([]*Table, error) { if err := ensureSupportedStrategies(s); err != nil { return nil, err } + if err := validateExplicitForStrategy(s); err != nil { + return nil, err + } finders, err := p.FindTables(s) if err != nil { diff --git a/scripts/capture_pdfplumber_text_golden.py b/scripts/capture_pdfplumber_text_golden.py new file mode 100644 index 0000000..02e402d --- /dev/null +++ b/scripts/capture_pdfplumber_text_golden.py @@ -0,0 +1,84 @@ +"""Generate text-strategy golden files for pdftable's parity tests. + +Run from the repo root after copying any new borderless / text-strategy +fixture PDFs into testdata/golden/: + + pip install pdfplumber + python scripts/capture_pdfplumber_text_golden.py + +The script reads every *.pdf in testdata/golden/ that has a sibling +<name>.tables-text.target file (the marker says "this fixture is in the +text-strategy parity set") and writes +<name>.tables-text.expected.json — pdfplumber's find_tables output +under {vertical_strategy: 'text', horizontal_strategy: 'text'} with +matching MinWordsVertical / MinWordsHorizontal defaults (3 / 1). + +We separate the text-strategy goldens from the line-strategy ones +(*.tables.expected.json) because the same fixture may produce +different tables depending on the strategy, and we want to assert +parity per strategy. + +Re-run to refresh after upgrading pdfplumber or changing the +target list. 
+""" + +from __future__ import annotations + +import json +import os +import sys + +import pdfplumber + +DIR = os.path.join("testdata", "golden") + + +def main() -> int: + target = DIR if len(sys.argv) < 2 else sys.argv[1] + targets = sorted( + f[: -len(".tables-text.target")] + for f in os.listdir(target) + if f.endswith(".tables-text.target") + ) + if not targets: + print( + f"no .tables-text.target files in {target}. " + "Create one alongside each fixture you want in the " + "text-strategy parity set (the file can be empty; its " + "presence is the signal).", + file=sys.stderr, + ) + return 1 + for name in targets: + pdf_path = os.path.join(target, f"{name}.pdf") + if not os.path.exists(pdf_path): + print(f"missing {pdf_path}", file=sys.stderr) + continue + out = {"name": name, "pages": []} + with pdfplumber.open(pdf_path) as pdf: + for p in pdf.pages: + tbls = p.find_tables( + { + "vertical_strategy": "text", + "horizontal_strategy": "text", + "min_words_vertical": 3, + "min_words_horizontal": 1, + } + ) + page_obj = { + "number": p.page_number, + "width": p.width, + "height": p.height, + "tables": [t.extract() for t in tbls], + } + out["pages"].append(page_obj) + outpath = os.path.join(target, f"{name}.tables-text.expected.json") + with open(outpath, "w", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=2) + ntables = sum(len(pp["tables"]) for pp in out["pages"]) + print(f"wrote {outpath}: {ntables} tables across all pages") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/gen_table_fixture.go b/scripts/gen_table_fixture.go index 042e7f6..1dabb3b 100644 --- a/scripts/gen_table_fixture.go +++ b/scripts/gen_table_fixture.go @@ -40,6 +40,10 @@ func main() { path: filepath.Join("testdata", "table-2x3-ruled.pdf"), data: testdata.TableRuled(), }, + { + path: filepath.Join("testdata", "golden", "table-3x4-borderless.pdf"), + data: testdata.TableBorderless(), + }, } for _, o := range outputs { if err := 
os.MkdirAll(filepath.Dir(o.path), 0o755); err != nil { diff --git a/table.go b/table.go index 359a915..e33f506 100644 --- a/table.go +++ b/table.go @@ -22,9 +22,8 @@ package pdftable // describing "below" / "right" change their sign. // TableStrategy is the enum of edge-derivation strategies. Each axis -// (vertical, horizontal) picks one. v0.2.0 implements "lines" and -// "lines_strict"; "text" and "explicit" are reserved for the next -// release (Phase 1.3.D) and return ErrUnsupported if requested. +// (vertical, horizontal) picks one independently. All four pdfplumber +// strategies are implemented as of v0.3.0. type TableStrategy string const ( @@ -41,10 +40,23 @@ const ( // boundaries. StrategyLinesStrict TableStrategy = "lines_strict" - // StrategyText (Phase 1.3.D) infers edges from word alignment. + // StrategyText infers edges from word alignment. Vertical edges + // come from clusters of words sharing X0 / X1 / centre positions; + // horizontal edges from clusters sharing visual top. Best for + // borderless tables — bank statements, narrative tables in 10-K + // filings, scanned-then-OCR'd content — where the columns and + // rows are conveyed by whitespace alignment rather than rules. + // Tunable via MinWordsVertical (default 3) and + // MinWordsHorizontal (default 1). StrategyText TableStrategy = "text" - // StrategyExplicit (Phase 1.3.D) uses caller-supplied lines. + // StrategyExplicit uses caller-supplied coordinates from + // ExplicitVerticalLines / ExplicitHorizontalLines as the only + // source of edges on that axis. Useful when the table boundaries + // are known from an external source (layout analysis, manual + // annotation) and you want to bypass edge detection entirely. + // The "explicit" strategy on an axis requires at least two + // coordinates on that axis; fewer than two produces an error. 
StrategyExplicit TableStrategy = "explicit" ) @@ -96,10 +108,13 @@ type TableSettings struct { TextTolerance float64 // MinWordsVertical / MinWordsHorizontal control the "text" - // strategy thresholds (Phase 1.3.D). They have no effect when - // both strategies are "lines" / "lines_strict" — kept on this - // struct so callers don't have to switch types when migrating to - // the text strategy later. + // strategy thresholds. A candidate column-boundary cluster must + // contain at least MinWordsVertical words sharing X0 / X1 / + // centre alignment to be promoted to a vertical edge; row + // boundaries need MinWordsHorizontal words sharing a top edge. + // The defaults (3 / 1) mirror the ones defined in pdfplumber's + // table.py:11-12. These fields are ignored when the corresponding + // strategy is anything other than "text". MinWordsVertical int MinWordsHorizontal int @@ -108,15 +123,18 @@ type TableSettings struct { KeepBlankChars bool // ExplicitVerticalLines / ExplicitHorizontalLines hold caller- - // supplied edge positions. With StrategyLines or - // StrategyLinesStrict they are ADDED to the derived edges; with - // StrategyExplicit they ARE the edges. In v0.2.0 the explicit - // strategy is not yet implemented; these slices have effect only - // when one of the lines strategies is in use and you want extra - // hand-placed rules (e.g. when your column boundary isn't drawn). + // supplied edge positions. With StrategyLines, StrategyLinesStrict, + // or StrategyText they are ADDED to the derived edges; with + // StrategyExplicit they ARE the only source of edges on that axis. + // Useful when a column or row boundary is invisible in the PDF but + // known from an external source. // // Values are X coordinates for vertical lines, Y coordinates for - // horizontal lines, both in PDF user-space points. + // horizontal lines, both in PDF user-space points. Non-finite + // values (NaN, Inf) are dropped with a log warning. 
When + // StrategyExplicit is selected on an axis, at least two + // coordinates must be supplied on that axis — fewer than two + // returns an error. ExplicitVerticalLines []float64 ExplicitHorizontalLines []float64 } diff --git a/table_test.go b/table_test.go index 09eb4e7..87e6169 100644 --- a/table_test.go +++ b/table_test.go @@ -261,18 +261,18 @@ func TestRunTableFinder_2x3Grid(t *testing.T) { } } -// TestEnsureSupportedStrategies_RejectsTextAndExplicit asserts that -// the v0.3.0 strategies return ErrUnsupported rather than silently -// running an empty pipeline. -func TestEnsureSupportedStrategies_RejectsTextAndExplicit(t *testing.T) { +// TestEnsureSupportedStrategies_RejectsUnknown asserts that an +// unrecognised strategy string returns ErrUnsupported rather than +// silently running an empty pipeline. All four pdfplumber strategies +// (lines / lines_strict / text / explicit) are now implemented, so +// this test only exercises the unknown-string path. +func TestEnsureSupportedStrategies_RejectsUnknown(t *testing.T) { cases := []struct { name string s TableSettings }{ - {"text/lines", TableSettings{VerticalStrategy: StrategyText, HorizontalStrategy: StrategyLines}}, - {"lines/text", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyText}}, - {"explicit/lines", TableSettings{VerticalStrategy: StrategyExplicit, HorizontalStrategy: StrategyLines}}, - {"lines/explicit", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyExplicit}}, + {"unknown_v", TableSettings{VerticalStrategy: "blah", HorizontalStrategy: StrategyLines}}, + {"unknown_h", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: "blah"}}, } for _, c := range cases { t.Run(c.name, func(t *testing.T) { @@ -287,24 +287,20 @@ func TestEnsureSupportedStrategies_RejectsTextAndExplicit(t *testing.T) { } } -// TestEnsureSupportedStrategies_AcceptsLines asserts that both lines -// strategies pass validation. 
-func TestEnsureSupportedStrategies_AcceptsLines(t *testing.T) { - cases := []struct { - name string - s TableSettings - }{ - {"lines/lines", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyLines}}, - {"strict/lines", TableSettings{VerticalStrategy: StrategyLinesStrict, HorizontalStrategy: StrategyLines}}, - {"lines/strict", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyLinesStrict}}, - {"strict/strict", TableSettings{VerticalStrategy: StrategyLinesStrict, HorizontalStrategy: StrategyLinesStrict}}, - } - for _, c := range cases { - t.Run(c.name, func(t *testing.T) { - if err := ensureSupportedStrategies(c.s.applyDefaults()); err != nil { - t.Errorf("got %v, want nil", err) - } - }) +// TestEnsureSupportedStrategies_AcceptsAllFour asserts that all four +// pdfplumber strategies pass validation, in every paired combination. +func TestEnsureSupportedStrategies_AcceptsAllFour(t *testing.T) { + strategies := []TableStrategy{StrategyLines, StrategyLinesStrict, StrategyText, StrategyExplicit} + for _, v := range strategies { + for _, h := range strategies { + name := string(v) + "/" + string(h) + t.Run(name, func(t *testing.T) { + s := TableSettings{VerticalStrategy: v, HorizontalStrategy: h}.applyDefaults() + if err := ensureSupportedStrategies(s); err != nil { + t.Errorf("got %v, want nil", err) + } + }) + } } } @@ -397,10 +393,11 @@ func TestExtractTables_RuledFixture(t *testing.T) { } } -// TestExtractTables_UnsupportedStrategyReturnsErrUnsupported asserts -// the public API surfaces ErrUnsupported when callers request "text" -// or "explicit" strategies. -func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) { +// TestExtractTables_UnknownStrategyReturnsErrUnsupported asserts the +// public API surfaces ErrUnsupported when callers pass an unrecognised +// strategy string. All four standard strategies are implemented as of +// v0.3.0; this guard catches typos. 
+func TestExtractTables_UnknownStrategyReturnsErrUnsupported(t *testing.T) { doc, err := OpenBytes(testdata.TableRuled()) if err != nil { t.Fatalf("OpenBytes: %v", err) @@ -409,7 +406,7 @@ func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) { p, _ := doc.Page(1) settings := DefaultTableSettings() - settings.VerticalStrategy = StrategyText + settings.VerticalStrategy = "not-a-strategy" _, err = p.ExtractTables(settings) if err == nil { t.Fatal("got nil, want ErrUnsupported") @@ -417,12 +414,34 @@ func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) { if !errIs(err, ErrUnsupported) { t.Errorf("got %v, want ErrUnsupported", err) } - // The error should mention what was unsupported and the phase. - if !strings.Contains(err.Error(), "text") { + if !strings.Contains(err.Error(), "not-a-strategy") { t.Errorf("error %q should name the strategy", err.Error()) } } +// TestExtractTables_ExplicitWithoutCoordinatesReturnsError asserts +// that StrategyExplicit on an axis with fewer than two coordinates +// returns a clear validation error (matching pdfplumber's behaviour). +func TestExtractTables_ExplicitWithoutCoordinatesReturnsError(t *testing.T) { + doc, err := OpenBytes(testdata.TableRuled()) + if err != nil { + t.Fatalf("OpenBytes: %v", err) + } + defer doc.Close() + p, _ := doc.Page(1) + + settings := DefaultTableSettings() + settings.VerticalStrategy = StrategyExplicit + settings.ExplicitVerticalLines = []float64{100} + _, err = p.ExtractTables(settings) + if err == nil { + t.Fatal("got nil, want validation error") + } + if !strings.Contains(err.Error(), "two") { + t.Errorf("error %q should mention the two-coordinate minimum", err.Error()) + } +} + // TestFindTables_NoEdgesReturnsEmpty asserts that a page with no // edges (e.g. a text-only page) returns an empty slice, not an // error. 
@@ -441,3 +460,332 @@ func TestFindTables_NoEdgesReturnsEmpty(t *testing.T) { t.Errorf("got %d finders, want 0 (text-only page)", len(finders)) } } + +// makeWord builds a Word at the given bbox with the given text. +// Helper for the text-strategy unit tests which feed hand-crafted +// Word slices directly into wordsToEdgesV / wordsToEdgesH. +func makeWord(text string, x0, y0, x1, y1 float64) Word { + return Word{ + Text: text, + X0: x0, Y0: y0, X1: x1, Y1: y1, + Upright: true, + Direction: "ltr", + } +} + +// TestWordsToEdgesV_ThreeColumnAlignment exercises the vertical "text" +// strategy with three columns of three words each, all left-aligned +// at X = 100, 200, 300. The expected output is four vertical edges: +// three at the columns' X0 plus one trailing at the rightmost X1. +func TestWordsToEdgesV_ThreeColumnAlignment(t *testing.T) { + words := []Word{ + // Row 1: y near 700 + makeWord("AAA", 100, 700, 130, 710), + makeWord("BBB", 200, 700, 230, 710), + makeWord("CCC", 300, 700, 330, 710), + // Row 2: y near 685 + makeWord("DDD", 100, 685, 130, 695), + makeWord("EEE", 200, 685, 230, 695), + makeWord("FFF", 300, 685, 330, 695), + // Row 3: y near 670 + makeWord("GGG", 100, 670, 130, 680), + makeWord("HHH", 200, 670, 230, 680), + makeWord("III", 300, 670, 330, 680), + } + edges := wordsToEdgesV(words, 3) + if len(edges) != 4 { + t.Fatalf("got %d edges, want 4 (3 columns + trailing)", len(edges)) + } + xs := make(map[float64]struct{}, 4) + for _, e := range edges { + if e.Orientation != layout.Vertical { + t.Errorf("edge %+v: not vertical", e) + } + if e.Source != layout.SourceText { + t.Errorf("edge %+v: source %v, want SourceText", e, e.Source) + } + xs[e.X0] = struct{}{} + } + for _, want := range []float64{100, 200, 300, 330} { + if _, ok := xs[want]; !ok { + t.Errorf("missing vertical edge at X=%v; got %v", want, xs) + } + } +} + +// TestWordsToEdgesV_BelowThresholdDropsCluster asserts that a column +// candidate with fewer than MinWordsVertical words 
doesn't survive +// the threshold filter. +func TestWordsToEdgesV_BelowThresholdDropsCluster(t *testing.T) { + words := []Word{ + // Column at X=100 has only 2 words; threshold of 3 should + // drop it. + makeWord("AAA", 100, 700, 130, 710), + makeWord("DDD", 100, 685, 130, 695), + // Column at X=200 has 3 words. + makeWord("BBB", 200, 700, 230, 710), + makeWord("EEE", 200, 685, 230, 695), + makeWord("HHH", 200, 670, 230, 680), + } + edges := wordsToEdgesV(words, 3) + // Expected: 1 column boundary (X=200) + 1 trailing (X=230) = 2 edges. + if len(edges) != 2 { + t.Fatalf("got %d edges, want 2", len(edges)) + } +} + +// TestWordsToEdgesH_DetectsRows asserts that horizontal clusters of +// words sharing a top-Y produce one top + one bottom edge per row. +func TestWordsToEdgesH_DetectsRows(t *testing.T) { + words := []Word{ + // Row 1: top at Y=710 + makeWord("AAA", 100, 700, 130, 710), + makeWord("BBB", 200, 700, 230, 710), + makeWord("CCC", 300, 700, 330, 710), + // Row 2: top at Y=695 + makeWord("DDD", 100, 685, 130, 695), + makeWord("EEE", 200, 685, 230, 695), + makeWord("FFF", 300, 685, 330, 695), + } + // Threshold 1 → every cluster counts. Two rows × 2 edges/row = 4. + edges := wordsToEdgesH(words, 1) + if len(edges) != 4 { + t.Fatalf("got %d edges, want 4 (2 rows × top+bottom)", len(edges)) + } + for _, e := range edges { + if e.Orientation != layout.Horizontal { + t.Errorf("edge %+v: not horizontal", e) + } + if e.Source != layout.SourceText { + t.Errorf("edge %+v: source %v, want SourceText", e, e.Source) + } + } + // Top + bottom for each row should be present: row 1 (700, 710) + // and row 2 (685, 695). + ys := make(map[float64]int, 4) + for _, e := range edges { + ys[e.Y0]++ + } + for _, want := range []float64{700, 710, 685, 695} { + if ys[want] == 0 { + t.Errorf("missing horizontal edge at Y=%v", want) + } + } +} + +// TestWordsToEdges_EmptyInputs asserts the early-return paths. 
+func TestWordsToEdges_EmptyInputs(t *testing.T) { + if got := wordsToEdgesV(nil, 3); got != nil { + t.Errorf("nil words: got %v, want nil", got) + } + if got := wordsToEdgesH(nil, 1); got != nil { + t.Errorf("nil words: got %v, want nil", got) + } + if got := wordsToEdgesV([]Word{makeWord("A", 0, 0, 10, 10)}, 0); got != nil { + t.Errorf("threshold 0: got %v, want nil", got) + } +} + +// TestExplicitVerticalEdges_PromotesAndFiltersInvalid asserts that +// each finite X is promoted to a full-height vertical edge tagged +// SourceExplicit, and that a non-finite value (NaN here) yields no edge. +func TestExplicitVerticalEdges_PromotesAndFiltersInvalid(t *testing.T) { + xs := []float64{100, 200, nanForTest(), 300} + edges := explicitVerticalEdges(xs, 0, 800) + if len(edges) != 3 { + t.Fatalf("got %d edges, want 3 (NaN dropped)", len(edges)) + } + for _, e := range edges { + if e.Orientation != layout.Vertical { + t.Errorf("edge %+v: not vertical", e) + } + if e.Source != layout.SourceExplicit { + t.Errorf("edge %+v: source %v, want SourceExplicit", e, e.Source) + } + if e.Y0 != 0 || e.Y1 != 800 { + t.Errorf("edge %+v: Y span got (%v,%v), want (0,800)", e, e.Y0, e.Y1) + } + } +} + +// TestExplicitHorizontalEdges_PromotesAndFiltersInvalid is the +// horizontal counterpart; its input is all finite, so only promotion is checked. +func TestExplicitHorizontalEdges_PromotesAndFiltersInvalid(t *testing.T) { + ys := []float64{100, 200, 300} + edges := explicitHorizontalEdges(ys, 0, 600) + if len(edges) != 3 { + t.Fatalf("got %d edges, want 3", len(edges)) + } + for _, e := range edges { + if e.Orientation != layout.Horizontal { + t.Errorf("edge %+v: not horizontal", e) + } + if e.X0 != 0 || e.X1 != 600 { + t.Errorf("edge %+v: X span got (%v,%v), want (0,600)", e, e.X0, e.X1) + } + } +} + +// TestValidateExplicitForStrategy_RequiresTwoCoords asserts the +// pre-flight check rejects an explicit strategy with fewer than two +// coordinates on the chosen axis. 
pdfplumber raises ValueError; we +// surface a regular error (callers don't typically catch via +// errors.Is here). +func TestValidateExplicitForStrategy_RequiresTwoCoords(t *testing.T) { + cases := []struct { + name string + s TableSettings + want bool + }{ + {"v_explicit_zero", TableSettings{VerticalStrategy: StrategyExplicit}, true}, + {"v_explicit_one", TableSettings{VerticalStrategy: StrategyExplicit, ExplicitVerticalLines: []float64{1}}, true}, + {"v_explicit_two_ok", TableSettings{VerticalStrategy: StrategyExplicit, ExplicitVerticalLines: []float64{1, 2}}, false}, + {"h_explicit_one", TableSettings{HorizontalStrategy: StrategyExplicit, ExplicitHorizontalLines: []float64{1}}, true}, + {"lines_no_check", TableSettings{VerticalStrategy: StrategyLines}, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + err := validateExplicitForStrategy(c.s.applyDefaults()) + if got := err != nil; got != c.want { + t.Errorf("got err=%v, want error: %v", err, c.want) + } + }) + } +} + +// TestExtractTables_BorderlessTextStrategy asserts the public API +// runs the text strategy end-to-end on the borderless fixture and +// recovers the expected row × column grid. +// +// Fixture: testdata.TableBorderless() — 3 columns ("Item", +// "Quantity", "Price") and 3 rows of body data, no rules drawn. +func TestExtractTables_BorderlessTextStrategy(t *testing.T) { + doc, err := OpenBytes(testdata.TableBorderless()) + if err != nil { + t.Fatalf("OpenBytes: %v", err) + } + defer doc.Close() + p, err := doc.Page(1) + if err != nil { + t.Fatalf("Page(1): %v", err) + } + + settings := DefaultTableSettings() + settings.VerticalStrategy = StrategyText + settings.HorizontalStrategy = StrategyText + + tables, err := p.ExtractTables(settings) + if err != nil { + t.Fatalf("ExtractTables: %v", err) + } + if len(tables) == 0 { + t.Fatalf("got 0 tables, want >= 1") + } + tbl := tables[0] + // We constructed the fixture with 4 rows (1 header + 3 body) and + // 3 columns. 
The text strategy infers row boundaries from top-y + // clusters, so the grid can gain spacer rows between the text rows + // (pdfplumber's golden for this fixture has 7 rows); assert only + // at least 3 rows and at least 3 columns. + if len(tbl.Rows) < 3 { + t.Errorf("rows: got %d, want >= 3", len(tbl.Rows)) + } + if len(tbl.Rows) > 0 && len(tbl.Rows[0]) < 3 { + t.Errorf("cols: got %d, want >= 3", len(tbl.Rows[0])) + } + // Spot-check that the body data is present somewhere in the + // extracted text (the algorithm may place it in any row/col + // depending on edge merging; the pdfplumber parity test, + // TestGoldenTablesTextStrategyAgainstPdfplumber, pins the exact layout). + flat := strings.Join(flattenRows(tbl.Rows), " ") + for _, want := range []string{"Apple", "Banana", "Cherry"} { + if !strings.Contains(flat, want) { + t.Errorf("flat output %q missing %q", flat, want) + } + } +} + +// TestExtractTables_ExplicitStrategy asserts that supplying caller- +// derived coordinates via ExplicitVerticalLines / +// ExplicitHorizontalLines + StrategyExplicit produces the expected +// grid even when the underlying PDF has no rules drawn at all. +func TestExtractTables_ExplicitStrategy(t *testing.T) { + doc, err := OpenBytes(testdata.TableBorderless()) + if err != nil { + t.Fatalf("OpenBytes: %v", err) + } + defer doc.Close() + p, err := doc.Page(1) + if err != nil { + t.Fatalf("Page(1): %v", err) + } + + // The borderless fixture places its 3 columns near X = 100, 200, + // 300 and 4 rows of text in the Y range [680, 730]. We feed + // boundaries that bracket those positions. 
+ settings := DefaultTableSettings() + settings.VerticalStrategy = StrategyExplicit + settings.HorizontalStrategy = StrategyExplicit + settings.ExplicitVerticalLines = []float64{95, 195, 295, 395} + settings.ExplicitHorizontalLines = []float64{670, 690, 710, 740} + + tables, err := p.ExtractTables(settings) + if err != nil { + t.Fatalf("ExtractTables: %v", err) + } + if len(tables) == 0 { + t.Fatalf("got 0 tables, want >= 1") + } + tbl := tables[0] + if len(tbl.Rows) != 3 { + t.Errorf("rows: got %d, want 3 (4 H-edges → 3 rows)", len(tbl.Rows)) + } + if len(tbl.Rows) > 0 && len(tbl.Rows[0]) != 3 { + t.Errorf("cols: got %d, want 3 (4 V-edges → 3 cols)", len(tbl.Rows[0])) + } +} + +// TestExtractTables_MixedStrategy asserts that VerticalStrategy=text + +// HorizontalStrategy=explicit (and the reverse) work — each axis runs +// its own edge derivation and the resulting edges are merged together. +func TestExtractTables_MixedStrategy(t *testing.T) { + doc, err := OpenBytes(testdata.TableBorderless()) + if err != nil { + t.Fatalf("OpenBytes: %v", err) + } + defer doc.Close() + p, err := doc.Page(1) + if err != nil { + t.Fatalf("Page(1): %v", err) + } + + settings := DefaultTableSettings() + settings.VerticalStrategy = StrategyText + settings.HorizontalStrategy = StrategyExplicit + settings.ExplicitHorizontalLines = []float64{670, 690, 710, 740} + + tables, err := p.ExtractTables(settings) + if err != nil { + t.Fatalf("ExtractTables (text-v + explicit-h): %v", err) + } + if len(tables) == 0 { + t.Fatal("got 0 tables, want >= 1") + } +} + +// flattenRows joins a 2-D string grid into a flat slice for +// substring spot-checks. +func flattenRows(rows [][]string) []string { + var out []string + for _, r := range rows { + out = append(out, r...) + } + return out +} + +// nanForTest returns a NaN without forcing the test file to import +// math at the top. 
+func nanForTest() float64 { + zero := 0.0 + return zero / zero +} diff --git a/testdata/fixtures.go b/testdata/fixtures.go index 0f7caba..7bf7faa 100644 --- a/testdata/fixtures.go +++ b/testdata/fixtures.go @@ -134,6 +134,66 @@ ET return BuildSinglePage(grid + text) } +// TableBorderless returns a minimal PDF with a 3-column borderless +// table: the columns are conveyed by whitespace alignment alone, with +// no ruling lines drawn. The header row is at Y ~ 730 and three body +// rows are at Y ~ 710, 695, 680. Columns are at X ~ 100, 200, 300. +// +// This fixture targets the "text" strategy — it's the smallest +// possible reproducer of the borderless-table case that's common in +// 10-K filings, bank statements, scanned-then-OCR'd PDFs, and any +// other PDF whose tables aren't ruled. +// +// Content: +// +// Item Quantity Price +// Apple 3 1.50 +// Banana 6 0.75 +// Cherry 12 0.10 +// +// The X positions are chosen so each header word and each body word +// in a column starts at the same X within the wordEdgeTolerance +// (=1 pt) — pdfplumber's words_to_edges_v clusters on exactly that +// tolerance. +func TableBorderless() []byte { + // 10pt Helvetica baselines. We move the text cursor with Td so + // the relative offsets keep the per-row, per-column positions + // pinned to the same X coordinates within each row. + const text = `BT +/F1 10 Tf +% Header row: baseline ~ 720. +100 720 Td +(Item) Tj +100 0 Td +(Quantity) Tj +100 0 Td +(Price) Tj +% Body row 1: y -= 20, x back to 0. +-200 -20 Td +(Apple) Tj +100 0 Td +(3) Tj +100 0 Td +(1.50) Tj +% Body row 2. +-200 -15 Td +(Banana) Tj +100 0 Td +(6) Tj +100 0 Td +(0.75) Tj +% Body row 3. +-200 -15 Td +(Cherry) Tj +100 0 Td +(12) Tj +100 0 Td +(0.10) Tj +ET +` + return BuildSinglePage(text) +} + // Rules returns a minimal PDF whose content stream draws four lines // (two horizontal, two vertical) and one rectangle. 
We use simple // coordinates: a 100x100 box with one stroked diagonal line and one diff --git a/testdata/golden/table-3x4-borderless.pdf b/testdata/golden/table-3x4-borderless.pdf new file mode 100644 index 0000000000000000000000000000000000000000..59201dca8c9665efa8b3c5ae23b46fb97b4ddc39 GIT binary patch literal 950 zcmZWo&2HN;48H3rcrma6OKjP-jTT*jI!n-D8=5r+b_jYgX_NFT}XBWW@ie;D<6|BMBH|NVJp93s%OM|OD$VYyu)gacv4QZFDJiv>0i zlfxZcT`^R(b9DFD0j*#7N!ATa6X$+UEPAah@E3v3avJDl43cZB6Mn&3ib8K-8j*j- zW5~~A;=98}uT4b>xlBizBK(2FnNuVkVluj#poM8jHBvdavG?9EAa~MeAl>=yi*d29 zbMnmLWIeOK8LlN9!a_9IT5fRp1FftCghN&7S}N+)H>s{wBM-o_TTXB<@EO&@K0uQC zt~A!5cw$A#!Vw1^K{;mu+@Pq@fYCoMU`C=#sStic9!Gt~qsSegtl6877N6ci`RKtG z`nnd%O1tgAVgQ>fDCrOl9?+O>fjmB8;%!R9(*5UMqj%%GF*s*5 zSS&g`kfmvo!W<458ceqVWqik-rc!8g8xK VuIv`AvBFrNXP)IO2!^*K_8)p$_W1w+ literal 0 HcmV?d00001 diff --git a/testdata/golden/table-3x4-borderless.tables-text.expected.json b/testdata/golden/table-3x4-borderless.tables-text.expected.json new file mode 100644 index 0000000..df44e7f --- /dev/null +++ b/testdata/golden/table-3x4-borderless.tables-text.expected.json @@ -0,0 +1,49 @@ +{ + "name": "table-3x4-borderless", + "pages": [ + { + "number": 1, + "width": 612, + "height": 792, + "tables": [ + [ + [ + "Item", + "Quantity", + "Price" + ], + [ + "", + "", + "" + ], + [ + "Apple", + "3", + "1.50" + ], + [ + "", + "", + "" + ], + [ + "Banana", + "6", + "0.75" + ], + [ + "", + "", + "" + ], + [ + "Cherry", + "12", + "0.10" + ] + ] + ] + } + ] +} \ No newline at end of file diff --git a/testdata/golden/table-3x4-borderless.tables-text.target b/testdata/golden/table-3x4-borderless.tables-text.target new file mode 100644 index 0000000..e69de29 From 0edb9b80c483aed53b3e0a6d03300091e497f3e9 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:07:57 +0100 Subject: [PATCH 2/3] feat: pdftable CLI (extract subcommand) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds cmd/pdftable, a 
stdlib-only command-line interface mirroring pdfplumber's CLI surface for the operations the library implements: - extract <file.pdf> [flags]: tables (--tables) or text (--text) on one page, a range (--pages 1,3-5), or all pages. - Output format selectable via --format json|text. JSON shape includes page dimensions, table bbox, per-cell bbox, and rows. - Full TableSettings surface exposed as flags: --vertical-strategy / --horizontal-strategy, --snap-tolerance, --join-tolerance, --edge-min-length, --intersection-tolerance, --text-tolerance, --min-words-vertical/horizontal, --explicit-vertical-lines/horizontal-lines, --indent. - Positional argument can appear before OR after flags (pdfplumber-style invocation); reorderFlagsLast() shuffles tokens so the standard library flag package can parse either ordering. Tested via cmd/pdftable/main_test.go: end-to-end runs against the issue-466-example and table-3x4-borderless fixtures, plus unit tests on parsePages, reorderFlagsLast, and the error paths. No new go.mod dependencies — uses standard library flag, encoding/json, strings, strconv only. --- cmd/pdftable/main.go | 493 ++++++++++++++++++++++++++++++++++++++ cmd/pdftable/main_test.go | 244 +++++++++++++++++++ 2 files changed, 737 insertions(+) create mode 100644 cmd/pdftable/main.go create mode 100644 cmd/pdftable/main_test.go diff --git a/cmd/pdftable/main.go b/cmd/pdftable/main.go new file mode 100644 index 0000000..5fbf3ce --- /dev/null +++ b/cmd/pdftable/main.go @@ -0,0 +1,493 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +// cmd/pdftable is the command-line interface to the pdftable library. +// It mirrors pdfplumber's CLI surface for the operations pdftable +// implements: extract text, extract tables, dump page geometry. +// +// Usage: +// +// pdftable extract <file.pdf> [flags] +// +// Flags (extract subcommand): +// +// --pages Comma-separated page list / dash ranges, e.g. "1,3-5". Default: all. 
+// --tables Emit detected tables (JSON or text per --format). +// --text Emit extracted text. Mutually exclusive with --tables. +// --format "json" (default) | "text". Output format. +// --vertical-strategy "lines" (default) | "lines_strict" | "text" | "explicit". +// --horizontal-strategy Same set; default "lines". +// --snap-tolerance Float; default 3. +// --join-tolerance Float; default 3. +// --edge-min-length Float; default 3. +// --intersection-tolerance Float; default 3. +// --text-tolerance Float; default 3. +// --min-words-vertical Int; default 3. +// --min-words-horizontal Int; default 1. +// --explicit-vertical-lines Comma-separated floats; required when vertical-strategy=explicit. +// --explicit-horizontal-lines Comma-separated floats; required when horizontal-strategy=explicit. +// --indent Int; JSON pretty-printing indent. 0 = compact. +// +// The CLI uses the standard library `flag` package and the `pdftable` +// public API only — no third-party dependencies. +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io" + "os" + "strconv" + "strings" + + "github.com/hallelx2/pdftable" +) + +func main() { + if err := run(os.Args[1:], os.Stdout, os.Stderr); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +// run is the testable entry point. It takes the args slice (excluding +// the executable name) and the stdout/stderr streams so tests can +// capture output without spawning a process. +func run(args []string, stdout, stderr io.Writer) error { + if len(args) == 0 { + printUsage(stderr) + return fmt.Errorf("missing subcommand") + } + switch args[0] { + case "extract": + return runExtract(args[1:], stdout, stderr) + case "-h", "--help", "help": + printUsage(stdout) + return nil + case "version", "-v", "--version": + fmt.Fprintln(stdout, "pdftable v0.3.0") + return nil + default: + printUsage(stderr) + return fmt.Errorf("unknown subcommand %q", args[0]) + } +} + +// printUsage prints the top-level usage string. 
+func printUsage(w io.Writer) { + fmt.Fprintln(w, `pdftable — extract text and tables from PDFs + +USAGE: + pdftable extract [flags] + pdftable version + pdftable help + +EXTRACT FLAGS (run 'pdftable extract --help' for full list): + --pages 1,3-5 Pages to process (default: all). + --tables Output detected tables. + --text Output extracted text (mutually exclusive with --tables). + --format json|text Output format (default: json). + --vertical-strategy S "lines" | "lines_strict" | "text" | "explicit". + --horizontal-strategy S Same set, default "lines". + +Documentation: https://github.com/hallelx2/pdftable`) +} + +// extractFlags is the parsed flag set for the extract subcommand. +type extractFlags struct { + pages string + tables bool + text bool + format string + verticalStrategy string + horizontalStrategy string + snapTolerance float64 + joinTolerance float64 + edgeMinLength float64 + edgeMinLengthPrefilter float64 + intersectionTolerance float64 + textTolerance float64 + minWordsVertical int + minWordsHorizontal int + explicitVerticalLines string + explicitHorizontalLines string + indent int +} + +// runExtract parses extract-subcommand args, opens the PDF, and +// dispatches to the requested output mode. +func runExtract(args []string, stdout, stderr io.Writer) error { + fs := flag.NewFlagSet("extract", flag.ContinueOnError) + fs.SetOutput(stderr) + + var f extractFlags + fs.StringVar(&f.pages, "pages", "", "Pages to extract: comma list (1,3) or dash range (1-5). 
Default: all.") + fs.BoolVar(&f.tables, "tables", false, "Emit detected tables.") + fs.BoolVar(&f.text, "text", false, "Emit extracted text (mutually exclusive with --tables).") + fs.StringVar(&f.format, "format", "json", "Output format: json | text.") + fs.StringVar(&f.verticalStrategy, "vertical-strategy", "lines", `Vertical edge strategy: lines | lines_strict | text | explicit.`) + fs.StringVar(&f.horizontalStrategy, "horizontal-strategy", "lines", `Horizontal edge strategy: lines | lines_strict | text | explicit.`) + fs.Float64Var(&f.snapTolerance, "snap-tolerance", 3, "Snap tolerance (PDF points).") + fs.Float64Var(&f.joinTolerance, "join-tolerance", 3, "Join tolerance (PDF points).") + fs.Float64Var(&f.edgeMinLength, "edge-min-length", 3, "Drop merged edges shorter than this (PDF points).") + fs.Float64Var(&f.edgeMinLengthPrefilter, "edge-min-length-prefilter", 1, "Drop raw edges shorter than this before merging.") + fs.Float64Var(&f.intersectionTolerance, "intersection-tolerance", 3, "Slack used when testing edge crossings (PDF points).") + fs.Float64Var(&f.textTolerance, "text-tolerance", 3, "Per-cell text-extraction tolerance.") + fs.IntVar(&f.minWordsVertical, "min-words-vertical", 3, "Min word count for text strategy vertical clusters.") + fs.IntVar(&f.minWordsHorizontal, "min-words-horizontal", 1, "Min word count for text strategy horizontal clusters.") + fs.StringVar(&f.explicitVerticalLines, "explicit-vertical-lines", "", "Comma-separated X coordinates for explicit strategy.") + fs.StringVar(&f.explicitHorizontalLines, "explicit-horizontal-lines", "", "Comma-separated Y coordinates for explicit strategy.") + fs.IntVar(&f.indent, "indent", 0, "JSON indent level. 0 = compact.") + + // Allow positional argument (path) before OR after flags by + // shuffling args. Go's flag package stops at the first + // non-flag; pdfplumber-style `extract file.pdf --tables` would + // otherwise be rejected. 
+ reordered := reorderFlagsLast(args) + if err := fs.Parse(reordered); err != nil { + return err + } + positional := fs.Args() + if len(positional) < 1 { + fs.Usage() + return fmt.Errorf("missing input PDF path") + } + if len(positional) > 1 { + return fmt.Errorf("unexpected positional arguments after %q: %v", positional[0], positional[1:]) + } + path := positional[0] + + if f.tables && f.text { + return fmt.Errorf("--tables and --text are mutually exclusive") + } + if !f.tables && !f.text { + // Default to --tables when neither is specified. Mirrors the + // pdftable library's primary use case. + f.tables = true + } + if f.format != "json" && f.format != "text" { + return fmt.Errorf("--format must be json or text, got %q", f.format) + } + + doc, err := pdftable.OpenFile(path) + if err != nil { + return err + } + defer doc.Close() + + pageNums, err := parsePages(f.pages, doc.NumPages()) + if err != nil { + return fmt.Errorf("--pages: %w", err) + } + + settings, err := buildSettings(f) + if err != nil { + return err + } + + if f.tables { + return emitTables(doc, pageNums, settings, f, stdout) + } + return emitText(doc, pageNums, f, stdout) +} + +// parsePages converts the --pages flag value into a sorted slice of +// 1-based page numbers. Empty string returns all pages. 
//
// Examples:
//
//	""      → [1..N]
//	"1"     → [1]
//	"1,3"   → [1, 3]
//	"2-5"   → [2, 3, 4, 5]
//	"1,3-5" → [1, 3, 4, 5]
//	"3,1,1" → [1, 3]
//
// The result is always in ascending (document) order with duplicates
// collapsed, regardless of the order the tokens were written in. Empty
// tokens ("1,,2") are ignored. An error is returned for a token that is
// not an integer or an integer range, for a reversed range, and for any
// page outside 1..total; parse errors wrap the underlying strconv error.
func parsePages(spec string, total int) ([]int, error) {
	if strings.TrimSpace(spec) == "" {
		out := make([]int, total)
		for i := range out {
			out[i] = i + 1
		}
		return out, nil
	}

	// selected[n] records whether page n was requested. Collecting into
	// a membership table and reading it back in index order gives the
	// sorted, de-duplicated result the contract promises without a sort.
	selected := make([]bool, total+1)
	count := 0
	mark := func(n int) {
		if !selected[n] {
			selected[n] = true
			count++
		}
	}

	for _, tok := range strings.Split(spec, ",") {
		tok = strings.TrimSpace(tok)
		if tok == "" {
			continue
		}

		lo, hi, isRange := strings.Cut(tok, "-")
		if !isRange {
			n, err := strconv.Atoi(tok)
			if err != nil {
				return nil, fmt.Errorf("invalid page %q: %w", tok, err)
			}
			if n < 1 || n > total {
				return nil, fmt.Errorf("page %d out of bounds (1..%d)", n, total)
			}
			mark(n)
			continue
		}

		// A leading "-" (e.g. "-1") lands here with an empty lower
		// bound and is rejected by Atoi as an invalid range.
		start, err := strconv.Atoi(strings.TrimSpace(lo))
		if err != nil {
			return nil, fmt.Errorf("invalid range %q: %w", tok, err)
		}
		end, err := strconv.Atoi(strings.TrimSpace(hi))
		if err != nil {
			return nil, fmt.Errorf("invalid range %q: %w", tok, err)
		}
		if start < 1 || end > total || start > end {
			return nil, fmt.Errorf("range %q out of bounds (1..%d)", tok, total)
		}
		for n := start; n <= end; n++ {
			mark(n)
		}
	}

	out := make([]int, 0, count)
	for n := 1; n <= total; n++ {
		if selected[n] {
			out = append(out, n)
		}
	}
	return out, nil
}

// buildSettings translates the parsed flag set into a TableSettings.
// It also parses the explicit-line coordinate strings.
+func buildSettings(f extractFlags) (pdftable.TableSettings, error) { + s := pdftable.DefaultTableSettings() + s.VerticalStrategy = pdftable.TableStrategy(f.verticalStrategy) + s.HorizontalStrategy = pdftable.TableStrategy(f.horizontalStrategy) + s.SnapTolerance = f.snapTolerance + s.JoinTolerance = f.joinTolerance + s.EdgeMinLength = f.edgeMinLength + s.EdgeMinLengthPrefilter = f.edgeMinLengthPrefilter + s.IntersectionTolerance = f.intersectionTolerance + s.TextTolerance = f.textTolerance + s.MinWordsVertical = f.minWordsVertical + s.MinWordsHorizontal = f.minWordsHorizontal + + if f.explicitVerticalLines != "" { + coords, err := parseFloatList(f.explicitVerticalLines) + if err != nil { + return s, fmt.Errorf("--explicit-vertical-lines: %w", err) + } + s.ExplicitVerticalLines = coords + } + if f.explicitHorizontalLines != "" { + coords, err := parseFloatList(f.explicitHorizontalLines) + if err != nil { + return s, fmt.Errorf("--explicit-horizontal-lines: %w", err) + } + s.ExplicitHorizontalLines = coords + } + return s, nil +} + +// reorderFlagsLast moves positional arguments to the end of the slice +// so the standard library flag package's "stop at first non-flag" +// behaviour doesn't get in the way of pdfplumber-style invocations +// like `pdftable extract file.pdf --tables`. +// +// Heuristic: a token is a flag if it starts with "-" or "--". Flags +// of the form "--foo=val" carry their value inline. Flags whose +// arguments are space-separated (e.g. "--pages 1-3") need to consume +// the following token; the boolean-flag set says which flags don't. +func reorderFlagsLast(args []string) []string { + flagArgs := make([]string, 0, len(args)) + positional := make([]string, 0) + i := 0 + for i < len(args) { + a := args[i] + if strings.HasPrefix(a, "-") { + flagArgs = append(flagArgs, a) + // Inline value (--key=val): no follow-up token needed. + if strings.Contains(a, "=") { + i++ + continue + } + // Boolean flags don't consume the next token. 
Strip the + // leading dashes to look up. + name := strings.TrimLeft(a, "-") + if _, isBool := boolFlagSet[name]; isBool { + i++ + continue + } + // Otherwise, the next token (if any) is the value. Don't + // promote a token that looks like another flag. + if i+1 < len(args) && !strings.HasPrefix(args[i+1], "-") { + flagArgs = append(flagArgs, args[i+1]) + i += 2 + continue + } + i++ + continue + } + positional = append(positional, a) + i++ + } + return append(flagArgs, positional...) +} + +// boolFlagSet lists every boolean flag the extract subcommand +// understands. Used by reorderFlagsLast to know which flags don't +// consume a following value token. +var boolFlagSet = map[string]struct{}{ + "tables": {}, + "text": {}, + "h": {}, + "help": {}, +} + +// parseFloatList parses a comma-separated list of float coordinates. +func parseFloatList(spec string) ([]float64, error) { + parts := strings.Split(spec, ",") + out := make([]float64, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p == "" { + continue + } + v, err := strconv.ParseFloat(p, 64) + if err != nil { + return nil, fmt.Errorf("invalid float %q: %v", p, err) + } + out = append(out, v) + } + return out, nil +} + +// tablesOutput is the JSON shape emitted by `pdftable extract +// --tables`. We deliberately mirror pdfplumber's `to_json` schema +// where it overlaps: one entry per page, each carrying the page +// dimensions and a list of tables. Each table carries the row grid, +// the bbox, and the per-cell bboxes. 
+type tablesOutput struct { + Pages []pageTablesOutput `json:"pages"` +} + +type pageTablesOutput struct { + Number int `json:"number"` + Width float64 `json:"width"` + Height float64 `json:"height"` + Tables []tableOutput `json:"tables"` +} + +type tableOutput struct { + BBox [4]float64 `json:"bbox"` + Rows [][]string `json:"rows"` + Cells [][][4]float64 `json:"cells"` +} + +// emitTables runs ExtractTables on each requested page and writes the +// aggregated result in the requested format. +func emitTables(doc pdftable.Document, pages []int, settings pdftable.TableSettings, f extractFlags, w io.Writer) error { + out := tablesOutput{Pages: make([]pageTablesOutput, 0, len(pages))} + for _, n := range pages { + page, err := doc.Page(n) + if err != nil { + return err + } + tables, err := page.ExtractTables(settings) + if err != nil { + return fmt.Errorf("page %d: %w", n, err) + } + pageOut := pageTablesOutput{ + Number: n, + Width: page.Width(), + Height: page.Height(), + Tables: make([]tableOutput, 0, len(tables)), + } + for _, t := range tables { + to := tableOutput{ + BBox: [4]float64{t.BBox.X0, t.BBox.Y0, t.BBox.X1, t.BBox.Y1}, + Rows: t.Rows, + } + to.Cells = make([][][4]float64, len(t.CellsBBox)) + for ri, row := range t.CellsBBox { + to.Cells[ri] = make([][4]float64, len(row)) + for ci, c := range row { + to.Cells[ri][ci] = [4]float64{c.X0, c.Y0, c.X1, c.Y1} + } + } + pageOut.Tables = append(pageOut.Tables, to) + } + out.Pages = append(out.Pages, pageOut) + } + + if f.format == "text" { + // One line per cell, blank line between rows, "---" between + // tables, page-number header before each page. 
+ for _, p := range out.Pages { + fmt.Fprintf(w, "=== Page %d (%g x %g) ===\n", p.Number, p.Width, p.Height) + for ti, t := range p.Tables { + if ti > 0 { + fmt.Fprintln(w, "---") + } + for _, row := range t.Rows { + fmt.Fprintln(w, strings.Join(row, "\t")) + } + } + } + return nil + } + + enc := json.NewEncoder(w) + if f.indent > 0 { + enc.SetIndent("", strings.Repeat(" ", f.indent)) + } + return enc.Encode(out) +} + +// textOutput is the JSON shape for `pdftable extract --text`. One +// entry per page; text is the dense extract-text output. +type textOutput struct { + Pages []pageTextOutput `json:"pages"` +} + +type pageTextOutput struct { + Number int `json:"number"` + Width float64 `json:"width"` + Height float64 `json:"height"` + Text string `json:"text"` +} + +// emitText runs ExtractText on each requested page and writes the +// aggregated result in the requested format. --format text emits the +// text verbatim with a form-feed (\f) between pages, mirroring +// `pdftotext` and pdfplumber's --format text behaviour. 
+func emitText(doc pdftable.Document, pages []int, f extractFlags, w io.Writer) error { + out := textOutput{Pages: make([]pageTextOutput, 0, len(pages))} + for _, n := range pages { + page, err := doc.Page(n) + if err != nil { + return err + } + text, err := page.ExtractText(pdftable.DefaultTextOpts()) + if err != nil { + return fmt.Errorf("page %d: %w", n, err) + } + out.Pages = append(out.Pages, pageTextOutput{ + Number: n, + Width: page.Width(), + Height: page.Height(), + Text: text, + }) + } + if f.format == "text" { + for i, p := range out.Pages { + if i > 0 { + fmt.Fprint(w, "\f") + } + fmt.Fprintln(w, p.Text) + } + return nil + } + enc := json.NewEncoder(w) + if f.indent > 0 { + enc.SetIndent("", strings.Repeat(" ", f.indent)) + } + return enc.Encode(out) +} diff --git a/cmd/pdftable/main_test.go b/cmd/pdftable/main_test.go new file mode 100644 index 0000000..32d50ce --- /dev/null +++ b/cmd/pdftable/main_test.go @@ -0,0 +1,244 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package main + +import ( + "bytes" + "encoding/json" + "path/filepath" + "strings" + "testing" +) + +// fixturePath assembles the path to a fixture relative to this test +// file. We use a relative path because `go test ./...` runs tests +// with the test file's directory as the cwd. +func fixturePath(name string) string { + return filepath.Join("..", "..", "testdata", "golden", name) +} + +// TestRun_ExtractTables_JSON exercises the happy path: extract tables +// from the issue-466-example fixture in JSON format and assert the +// shape matches the documented schema. 
+func TestRun_ExtractTables_JSON(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{"extract", fixturePath("issue-466-example.pdf"), "--tables", "--format", "json"} + if err := run(args, &stdout, &stderr); err != nil { + t.Fatalf("run: %v (stderr=%s)", err, stderr.String()) + } + var out map[string]any + if err := json.Unmarshal(stdout.Bytes(), &out); err != nil { + t.Fatalf("unmarshal: %v\n%s", err, stdout.String()) + } + pages, ok := out["pages"].([]any) + if !ok { + t.Fatalf("output missing pages key: %v", out) + } + if len(pages) != 1 { + t.Fatalf("got %d pages, want 1", len(pages)) + } + page := pages[0].(map[string]any) + tables, ok := page["tables"].([]any) + if !ok || len(tables) < 1 { + t.Fatalf("page has no tables: %v", page) + } + first := tables[0].(map[string]any) + if first["bbox"] == nil { + t.Errorf("table 0 missing bbox") + } + rows, ok := first["rows"].([]any) + if !ok || len(rows) == 0 { + t.Errorf("table 0 missing rows") + } +} + +// TestRun_ExtractTables_TextStrategy asserts the CLI propagates the +// --vertical-strategy / --horizontal-strategy flags through to the +// library and recovers the borderless fixture's table. +func TestRun_ExtractTables_TextStrategy(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{ + "extract", + fixturePath("table-3x4-borderless.pdf"), + "--tables", + "--vertical-strategy", "text", + "--horizontal-strategy", "text", + "--format", "text", + } + if err := run(args, &stdout, &stderr); err != nil { + t.Fatalf("run: %v (stderr=%s)", err, stderr.String()) + } + out := stdout.String() + for _, want := range []string{"Item", "Quantity", "Price", "Apple", "Banana", "Cherry"} { + if !strings.Contains(out, want) { + t.Errorf("output missing %q\n%s", want, out) + } + } +} + +// TestRun_ExtractTables_Pages asserts the --pages flag narrows the +// output to the requested pages only. 
+func TestRun_ExtractTables_Pages(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{ + "extract", + fixturePath("issue-466-example.pdf"), + "--tables", + "--format", "json", + "--pages", "1", + } + if err := run(args, &stdout, &stderr); err != nil { + t.Fatalf("run: %v (stderr=%s)", err, stderr.String()) + } + var out map[string]any + if err := json.Unmarshal(stdout.Bytes(), &out); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if pages := out["pages"].([]any); len(pages) != 1 { + t.Errorf("got %d pages, want 1", len(pages)) + } +} + +// TestRun_ExtractText asserts the --text mode emits JSON with a +// "text" field per page. +func TestRun_ExtractText(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{"extract", fixturePath("hello.pdf"), "--text", "--format", "json"} + if err := run(args, &stdout, &stderr); err != nil { + t.Fatalf("run: %v (stderr=%s)", err, stderr.String()) + } + if !strings.Contains(stdout.String(), "Hello") { + t.Errorf("output missing 'Hello':\n%s", stdout.String()) + } +} + +// TestParsePages exercises the page-spec parser on a variety of valid +// and invalid inputs. 
+func TestParsePages(t *testing.T) { + cases := []struct { + name string + spec string + total int + want []int + err bool + }{ + {"empty_all", "", 3, []int{1, 2, 3}, false}, + {"single", "2", 3, []int{2}, false}, + {"comma_list", "1,3", 3, []int{1, 3}, false}, + {"dash_range", "2-4", 5, []int{2, 3, 4}, false}, + {"mixed", "1,3-5", 5, []int{1, 3, 4, 5}, false}, + {"deduped", "1,1,2", 3, []int{1, 2}, false}, + {"bad_range", "5-2", 5, nil, true}, + {"out_of_bounds", "10", 3, nil, true}, + {"negative", "-1", 3, nil, true}, + {"invalid", "abc", 3, nil, true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got, err := parsePages(c.spec, c.total) + if c.err { + if err == nil { + t.Errorf("want error, got %v", got) + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(got) != len(c.want) { + t.Fatalf("got %v, want %v", got, c.want) + } + for i := range got { + if got[i] != c.want[i] { + t.Errorf("[%d] got %d, want %d", i, got[i], c.want[i]) + } + } + }) + } +} + +// TestReorderFlagsLast asserts that positional arguments get pushed +// to the end so the standard-library flag package can parse them +// regardless of their original position. 
+func TestReorderFlagsLast(t *testing.T) { + cases := []struct { + in, want []string + }{ + { + in: []string{"file.pdf", "--tables", "--format", "json"}, + want: []string{"--tables", "--format", "json", "file.pdf"}, + }, + { + in: []string{"--pages", "1-3", "file.pdf"}, + want: []string{"--pages", "1-3", "file.pdf"}, + }, + { + in: []string{"--tables", "file.pdf"}, + want: []string{"--tables", "file.pdf"}, + }, + { + in: []string{"--format=json", "file.pdf"}, + want: []string{"--format=json", "file.pdf"}, + }, + } + for i, c := range cases { + got := reorderFlagsLast(c.in) + if len(got) != len(c.want) { + t.Errorf("case %d: lengths differ; got %v, want %v", i, got, c.want) + continue + } + for j := range got { + if got[j] != c.want[j] { + t.Errorf("case %d [%d]: got %q, want %q", i, j, got[j], c.want[j]) + } + } + } +} + +// TestRun_MissingFile asserts the CLI surfaces a clean error when the +// input path doesn't exist (rather than crashing). +func TestRun_MissingFile(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{"extract", "no-such-file.pdf", "--tables"} + err := run(args, &stdout, &stderr) + if err == nil { + t.Fatal("got nil error, want failure") + } +} + +// TestRun_MutuallyExclusiveFlags asserts --tables and --text can't +// both be set. +func TestRun_MutuallyExclusiveFlags(t *testing.T) { + var stdout, stderr bytes.Buffer + args := []string{"extract", fixturePath("hello.pdf"), "--tables", "--text"} + err := run(args, &stdout, &stderr) + if err == nil { + t.Fatal("got nil error, want mutually-exclusive failure") + } + if !strings.Contains(err.Error(), "mutually exclusive") { + t.Errorf("error %q should mention mutual exclusion", err) + } +} + +// TestRun_UnknownSubcommand asserts the CLI rejects an unknown +// subcommand with a clear error. 
+func TestRun_UnknownSubcommand(t *testing.T) { + var stdout, stderr bytes.Buffer + err := run([]string{"foo"}, &stdout, &stderr) + if err == nil { + t.Fatal("got nil, want unknown-subcommand error") + } +} + +// TestRun_Version asserts the `version` subcommand emits something +// that mentions v0.3.0. +func TestRun_Version(t *testing.T) { + var stdout, stderr bytes.Buffer + if err := run([]string{"version"}, &stdout, &stderr); err != nil { + t.Fatalf("run version: %v", err) + } + if !strings.Contains(stdout.String(), "v0.3.0") { + t.Errorf("output missing version: %q", stdout.String()) + } +} From 0ab1e7eb75b066a6018e47c85f951f81d3bc4628 Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Wed, 27 May 2026 02:08:10 +0100 Subject: [PATCH 3/3] docs: v0.3.0 changelog + README usage (text/explicit + CLI) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CHANGELOG.md: v0.3.0 entry covering text + explicit strategies, mixed-strategy support, the pdftable CLI, the layout.SourceText enum, and the borderless parity fixture. Known limitations note the carried-over font-metric drift on cell text. - README.md: status bumped to v0.3.0; "Tables" section reworked with side-by-side pdfplumber → pdftable snippets for all four strategies plus a mixed-strategy example; new "CLI" section documenting the extract subcommand and full flag table; roadmap reflects v0.4.x as the AFM-bundle phase. --- CHANGELOG.md | 90 +++++++++++++++++++++ README.md | 217 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 274 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 096b104..82c7e05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,95 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.3.0] - 2026-05-27 + +Phase 1.3.D + 1.3.E — text and explicit table-finding strategies, the +`pdftable` CLI. Completes pdfplumber parity for the four canonical +table strategies. The v0.2.x public API surface is unchanged; v0.3.0 +only widens what's valid in `TableSettings` and adds the new CLI +binary, so existing callers compile and run as-is. + +### Added + +- `StrategyText`: infer table edges from word alignment. Vertical + edges come from clusters of words sharing X0 (left), X1 (right), or + centre position with the per-axis tolerance hardcoded to 1 PDF + point (matching pdfplumber's `words_to_edges_v`). Horizontal edges + come from clusters sharing visual top, with both the top and + bottom of each cluster emitted so the last row gets captured + (matching `words_to_edges_h`). Threshold via + `TableSettings.MinWordsVertical` (default 3) and + `MinWordsHorizontal` (default 1). +- `StrategyExplicit`: caller-supplied edges via + `TableSettings.ExplicitVerticalLines` / + `ExplicitHorizontalLines`. When the strategy is `explicit` on an + axis, the supplied coordinates are the ONLY source of edges on + that axis; at least two coordinates are required (matching + pdfplumber's validation). Non-finite values (NaN, Inf) are skipped + with a `log` warning rather than crashing. +- Mixed strategies: every combination of the four strategies across + the two axes works (16 combinations total). The two axes' base + edges are derived independently then merged together for the + intersection pipeline — no orientation-specific logic leaks + between them. +- `pdftable` CLI binary at `cmd/pdftable/`. Subcommand surface + mirrors pdfplumber's: `extract [flags]` with + `--pages 1,3-5`, `--tables`, `--text`, `--format json|text`, + `--vertical-strategy`, `--horizontal-strategy`, the full set of + tolerance flags, `--min-words-vertical / horizontal`, + `--explicit-vertical-lines / horizontal-lines`, and `--indent`. + Stdlib `flag` package only — no third-party CLI dependencies. 
+ Positional argument can appear before OR after flags + (pdfplumber-style invocation). Tested via + `cmd/pdftable/main_test.go` against the existing golden fixtures. +- New `layout.SourceText` enum value tagging edges produced by the + text strategy. `layout.SourceExplicit` was already in place from + v0.2.0; the explicit-strategy implementation now writes through + to it as the primary source. +- Hand-crafted borderless fixture `testdata.TableBorderless()` + (3-column × 4-row narrative table conveyed by whitespace alignment + only, no rules drawn). Used by the new text-strategy unit tests + and pdfplumber parity test. The generated PDF is in + `testdata/golden/table-3x4-borderless.pdf`. +- Golden-file parity test `TestGoldenTablesTextStrategyAgainstPdfplumber` + driven by `*.tables-text.expected.json` files. The + `table-3x4-borderless` fixture matches pdfplumber's + `find_tables({text, text})` cell-for-cell. Regenerate via the new + `scripts/capture_pdfplumber_text_golden.py` helper. +- `scripts/capture_pdfplumber_text_golden.py`: tiny Python helper + that captures pdfplumber's text-strategy output for every fixture + with a sibling `.tables-text.target` marker. Mirrors the existing + `scripts/gen_golden.py` workflow for the line-strategy goldens. + +### Changed + +- `Page.FindTables` / `Page.ExtractTables` no longer return + `ErrUnsupported` for `text` or `explicit` strategies — all four + strategies are now implemented. The error is still returned for + unknown strategy strings (typo guard). +- `TableSettings` field docs updated to reflect the implemented + semantics of `MinWordsVertical` / `MinWordsHorizontal` and the + Explicit*Lines slices. +- README's "Tables" section restructured: side-by-side + pdfplumber→pdftable examples for all four strategies, plus a + mixed-strategy snippet and a new "CLI" section. 
+ +### Known limitations + +- Cell text fidelity on the text strategy depends on the same font + metrics as v0.2.x: PDFs that use standard-14 fonts without + bundled AFM tables can report intra-word gaps as zero, producing + cells like "Nohorizontal" where pdfplumber gets "No horizontal". + Structural parity (table count, row count, column count) matches + exactly; cell text matches verbatim on PDFs whose fonts have + bundled metrics or `/Widths` arrays. AFM-table bundling is a + v0.4.x goal. +- Mixed-strategy snap/join uses a single global tolerance. If a + page mixes drawn rules at one X coordinate and word-cluster + edges at a slightly different X, the two won't merge unless + `SnapTolerance` is widened. This matches pdfplumber's behaviour + but is worth noting for callers tuning a mixed pipeline. + ## [0.2.0] - 2026-05-27 Phase 1.3.C — table-finding via ruled lines. Direct port of @@ -219,6 +308,7 @@ Initial release. Phase 1.3.A — content-stream primitives layer. - Type 3 fonts (their glyph procedures are themselves content streams). - Vertical writing mode. +[0.3.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.3.0 [0.2.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.2.0 [0.1.1]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.1 [0.1.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 12b418d..9e4bef3 100644 --- a/README.md +++ b/README.md @@ -19,10 +19,11 @@ heuristics on. This is that. ## Status -`v0.2.0` — line-strategy table finding. `Page.FindTables` and -`Page.ExtractTables` ship with this release covering the `lines` and -`lines_strict` strategies (PDFs with ruled tables). `text` and -`explicit` strategies return `ErrUnsupported` and land in v0.3.0. +`v0.3.0` — full pdfplumber parity for table-finding strategies. All four +canonical strategies are implemented: `lines`, `lines_strict`, `text`, +and `explicit`. Mix and match per-axis (e.g. 
`vertical="text"` + +`horizontal="lines"`) works as expected. Also ships the `pdftable` +CLI for extracting text and tables without writing Go. [![Go Reference](https://pkg.go.dev/badge/github.com/hallelx2/pdftable.svg)](https://pkg.go.dev/github.com/hallelx2/pdftable) [![CI](https://github.com/hallelx2/pdftable/actions/workflows/test.yml/badge.svg)](https://github.com/hallelx2/pdftable/actions/workflows/test.yml) @@ -31,7 +32,7 @@ heuristics on. This is that. ## Install ```sh -go get github.com/hallelx2/pdftable@v0.2.0 +go get github.com/hallelx2/pdftable@v0.3.0 ``` Requires Go 1.25+ (uses the standard-library `iter` package for the `Pages()` range-over-func iterator, and pdfcpu v0.12+). @@ -113,7 +114,7 @@ type Page interface { ExtractText(opts TextOpts) (string, error) ExtractTextSimple(xTolerance, yTolerance float64) (string, error) - // New in v0.2.0: line-strategy table finding. + // Table finding: lines + lines_strict (v0.2.0); text + explicit (v0.3.0). FindTables(settings TableSettings) ([]TableFinder, error) ExtractTables(settings TableSettings) ([]*Table, error) } @@ -211,12 +212,12 @@ laid, _ := page.ExtractText(opts) fmt.Println(laid) ``` -## Tables (lines strategy) +## Tables `Page.ExtractTables` is the table-detection entry point. It runs the edges → intersections → cells → tables pipeline (a direct port of pdfplumber's `TableFinder`) and returns one `*Table` per detected -ruled table, with cell text already extracted. +table, with cell text already extracted. ```go doc, _ := pdftable.OpenFile("invoice.pdf") @@ -238,9 +239,11 @@ for ti, t := range tables { `TableSettings` defaults match pdfplumber's (`snap_tolerance=3`, `join_tolerance=3`, `edge_min_length=3`, -`intersection_tolerance=3`, `text_tolerance=3`). Override any field -on the value returned from `DefaultTableSettings()` to tighten or -loosen the heuristics. The two implemented strategies are: +`intersection_tolerance=3`, `text_tolerance=3`, `min_words_vertical=3`, +`min_words_horizontal=1`). 
Override any field on the value returned +from `DefaultTableSettings()` to tighten or loosen the heuristics. + +The four implemented strategies (one per axis, chosen independently): - `StrategyLines` — edges come from drawn `Line` segments, `Rect` outlines (all four sides), and axis-aligned `Curve` segments. @@ -248,12 +251,16 @@ loosen the heuristics. The two implemented strategies are: - `StrategyLinesStrict` — only drawn `Line` segments are used. Use this when your PDF draws cell BACKGROUNDS as filled rectangles that you do NOT want treated as row boundaries. +- `StrategyText` — edges inferred from word alignment. Vertical + edges come from clusters of words sharing X0 / X1 / centre; + horizontal edges from clusters sharing top-Y. Tunable via + `MinWordsVertical` (default 3) and `MinWordsHorizontal` (default 1). +- `StrategyExplicit` — caller-supplied edges via + `ExplicitVerticalLines` / `ExplicitHorizontalLines`. Required when + table boundaries are known from layout analysis or manual + annotation. -`StrategyText` (word-alignment-based) and `StrategyExplicit` -(caller-supplied edges) return `ErrUnsupported` in v0.2.0 — they -land in v0.3.0. - -### Side-by-side: pdfplumber → pdftable +### Side-by-side: pdfplumber → pdftable (lines strategy) ```python # Python (pdfplumber) @@ -287,14 +294,152 @@ for _, t := range tables { } ``` -The two outputs match cell-for-cell on ruled fixtures (see -`testdata/golden/issue-466-example.*` for the parity test). Field -naming differs in the obvious places: pdftable returns a slice of -`*Table` instead of `Table` objects you have to call `.extract()` -on; rows are `[]string` instead of `list[Optional[str]]` (missing -cells produce `""` rather than `nil`); and table bboxes use -`(X0, Y0, X1, Y1)` PDF user space rather than pdfplumber's -image-space `(x0, top, x1, bottom)`. 
+### Side-by-side: pdfplumber → pdftable (text strategy) + +```python +# Python (pdfplumber) — borderless tables +import pdfplumber + +with pdfplumber.open("10k-filing.pdf") as pdf: + page = pdf.pages[3] + for table in page.find_tables({"vertical_strategy": "text", + "horizontal_strategy": "text", + "min_words_vertical": 3}): + for row in table.extract(): + print(row) +``` + +```go +// Go (pdftable) +doc, _ := pdftable.OpenFile("10k-filing.pdf") +defer doc.Close() +page, _ := doc.Page(4) + +settings := pdftable.DefaultTableSettings() +settings.VerticalStrategy = pdftable.StrategyText +settings.HorizontalStrategy = pdftable.StrategyText +settings.MinWordsVertical = 3 + +tables, _ := page.ExtractTables(settings) +for _, t := range tables { + for _, row := range t.Rows { + fmt.Println(row) + } +} +``` + +### Side-by-side: pdfplumber → pdftable (explicit strategy) + +```python +# Python (pdfplumber) — caller-supplied edges +import pdfplumber + +with pdfplumber.open("statement.pdf") as pdf: + page = pdf.pages[0] + table = page.find_tables({ + "vertical_strategy": "explicit", + "horizontal_strategy": "explicit", + "explicit_vertical_lines": [100, 200, 300, 400], + "explicit_horizontal_lines": [600, 650, 700, 720], + })[0] + for row in table.extract(): + print(row) +``` + +```go +// Go (pdftable) +doc, _ := pdftable.OpenFile("statement.pdf") +defer doc.Close() +page, _ := doc.Page(1) + +settings := pdftable.DefaultTableSettings() +settings.VerticalStrategy = pdftable.StrategyExplicit +settings.HorizontalStrategy = pdftable.StrategyExplicit +settings.ExplicitVerticalLines = []float64{100, 200, 300, 400} +settings.ExplicitHorizontalLines = []float64{600, 650, 700, 720} + +tables, _ := page.ExtractTables(settings) +for _, row := range tables[0].Rows { + fmt.Println(row) +} +``` + +### Mixed strategies + +Each axis picks its strategy independently. 
Combinations like +`vertical=text` + `horizontal=lines` (common for tables with drawn +row separators but borderless columns) work out of the box: + +```go +settings := pdftable.DefaultTableSettings() +settings.VerticalStrategy = pdftable.StrategyText +settings.HorizontalStrategy = pdftable.StrategyLines +tables, _ := page.ExtractTables(settings) +``` + +The pdfplumber and pdftable outputs match cell-for-cell on the parity fixtures (see +`testdata/golden/*.tables-text.expected.json` and +`*.tables.expected.json` for the regression goldens). Field naming +differs in the obvious places: pdftable returns a slice of `*Table` +instead of `Table` objects you have to call `.extract()` on; rows are +`[]string` instead of `list[Optional[str]]` (missing cells produce +`""` rather than `None`); and table bboxes use `(X0, Y0, X1, Y1)` PDF +user space rather than pdfplumber's image-space +`(x0, top, x1, bottom)`. + +## CLI + +`pdftable` ships a command-line interface that mirrors pdfplumber's +CLI surface for the operations the library implements: + +```sh +go install github.com/hallelx2/pdftable/cmd/pdftable@v0.3.0 +``` + +Usage: + +```sh +# Extract every table on every page as JSON. +pdftable extract invoice.pdf --tables --format json + +# Borderless tables: use the text strategy. +pdftable extract 10k.pdf --tables \ + --vertical-strategy text --horizontal-strategy text \ + --min-words-vertical 4 + +# Extract text only (no table detection). +pdftable extract report.pdf --text --format text + +# Subset of pages, pretty-printed JSON. +pdftable extract report.pdf --tables --pages 1,3-5 --indent 2 + +# Caller-supplied edges. +pdftable extract statement.pdf --tables \ + --vertical-strategy explicit --horizontal-strategy explicit \ + --explicit-vertical-lines 100,200,300,400 \ + --explicit-horizontal-lines 600,650,700,720 +``` + +Flags: + +| Flag | Default | Description | +| --- | --- | --- | +| `--pages` | all | Pages: `1,3-5` syntax. | +| `--tables` | off | Output detected tables. 
| +| `--text` | off | Output extracted text. | +| `--format` | `json` | `json` \| `text`. | +| `--vertical-strategy` | `lines` | `lines` \| `lines_strict` \| `text` \| `explicit`. | +| `--horizontal-strategy` | `lines` | same set. | +| `--snap-tolerance` | 3 | snap_tolerance (PDF pts). | +| `--join-tolerance` | 3 | join_tolerance (PDF pts). | +| `--edge-min-length` | 3 | drop merged edges shorter than this. | +| `--intersection-tolerance` | 3 | slack on edge crossings. | +| `--text-tolerance` | 3 | per-cell text-extraction tolerance. | +| `--min-words-vertical` | 3 | text strategy column threshold. | +| `--min-words-horizontal` | 1 | text strategy row threshold. | +| `--explicit-vertical-lines` | (none) | comma list of X coords. | +| `--explicit-horizontal-lines` | (none) | comma list of Y coords. | +| `--indent` | 0 | JSON indent (0 = compact). | ## Side-by-side comparison with pdfplumber @@ -391,9 +536,13 @@ pdftable/ ├── text.go // Word + ExtractText + ExtractTextSimple (v0.1.0) ├── table.go // TableStrategy / TableSettings / Table types (v0.2.0) ├── finder.go // Cells-from-edges algorithm (v0.2.0) +├── finder_text.go // Text + explicit edge derivation (v0.3.0) ├── clustering.go // 1-D clusterObjects, groupObjectsByAttr, dedupeChars ├── geometry.go // BBox helpers: Union, Intersect, Contains, Snap ├── errors.go // Sentinel errors +├── cmd/ +│ └── pdftable/ // Command-line interface (v0.3.0) +│ └── main.go └── internal/ ├── layout/ │ └── lines.go // Edge type + snap/join/filter pipeline (v0.2.0) @@ -429,15 +578,17 @@ stdlib-only. - `v0.0.x` — content-stream primitives. - `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`, `Page.ExtractTextSimple`. -- `v0.2.x` — table finding via ruling lines (this release): - `Page.FindTables` / `Page.ExtractTables` covering the `lines` and - `lines_strict` strategies. -- `v0.3.x` — remaining table strategies: `text` (word-alignment - edges) and `explicit` (caller-supplied edges). 
Bundle the - standard-14 AFM metrics so word bboxes (and therefore cell text) - match pdfplumber to within 1 PDF point on standard fonts. -- `v0.4.x` — performance pass: parser benchmarking against pdfminer.six - and pdfplumber on a representative document corpus. +- `v0.2.x` — table finding via ruling lines: `Page.FindTables` / + `Page.ExtractTables` covering the `lines` and `lines_strict` + strategies. +- `v0.3.x` — remaining table strategies and CLI (this release): + `text` (word-alignment edges), `explicit` (caller-supplied edges), + and a `pdftable` CLI mirroring pdfplumber's surface. +- `v0.4.x` — bundle the standard-14 AFM metrics so word bboxes (and + therefore cell text) match pdfplumber to within 1 PDF point on + standard fonts. +- `v0.5.x` — performance pass: parser benchmarking against + pdfminer.six and pdfplumber on a representative document corpus. ## License