From 1ae2b095017a7528f0dd155aee3329bd5285078f Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 02:07:38 +0100
Subject: [PATCH 1/3] feat: text + explicit table strategies (pdfplumber
 parity)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements the two remaining pdfplumber table-finding strategies:

- text: infer column boundaries by clustering words on X0 / X1 /
  centre; infer row boundaries by clustering on top-Y. Direct port of
  pdfplumber's words_to_edges_v / words_to_edges_h with the same
  MinWordsVertical (3) / MinWordsHorizontal (1) defaults.
- explicit: caller-supplied edges via TableSettings.Explicit*Lines.
  At least two coordinates required per axis (matches pdfplumber's
  validation); non-finite values dropped with a log warning.

Each axis selects its strategy independently, so mixed-strategy
settings (e.g. vertical=text + horizontal=lines) work out of the box.

- New layout.SourceText enum tagging text-derived edges.
- Page.findTableEdges refactored to dispatch per-axis on strategy
  instead of starting from a single primitive-edge slice.
- ensureSupportedStrategies now only rejects unknown strategy strings.
- New table_test.go cases: unit tests on hand-crafted Words slices;
  borderless / explicit / mixed extraction end-to-end on the new
  testdata.TableBorderless() fixture.
- pdfplumber parity test for the borderless fixture
  (TestGoldenTablesTextStrategyAgainstPdfplumber) — matches
  cell-for-cell against pdfplumber's find_tables({text, text}).
- scripts/capture_pdfplumber_text_golden.py captures the
  text-strategy expectation for any fixture with a sibling
  .tables-text.target marker.
---
 finder.go                                     |  32 +-
 finder_text.go                                | 345 +++++++++++++++
 golden_test.go                                |  54 ++-
 internal/layout/lines.go                      |   3 +
 page.go                                       | 193 ++++----
 scripts/capture_pdfplumber_text_golden.py     |  84 ++++
 scripts/gen_table_fixture.go                  |   4 +
 table.go                                      |  50 ++-
 table_test.go                                 | 414 ++++++++++++++++--
 testdata/fixtures.go                          |  60 +++
 testdata/golden/table-3x4-borderless.pdf      | Bin 0 -> 950 bytes
 ...e-3x4-borderless.tables-text.expected.json |  49 +++
 .../table-3x4-borderless.tables-text.target   |   0
 13 files changed, 1146 insertions(+), 142 deletions(-)
 create mode 100644 finder_text.go
 create mode 100644 scripts/capture_pdfplumber_text_golden.py
 create mode 100644 testdata/golden/table-3x4-borderless.pdf
 create mode 100644 testdata/golden/table-3x4-borderless.tables-text.expected.json
 create mode 100644 testdata/golden/table-3x4-borderless.tables-text.target

diff --git a/finder.go b/finder.go
index 9d1a75e..ff8c2e0 100644
--- a/finder.go
+++ b/finder.go
@@ -618,10 +618,10 @@ func runTableFinder(edges []layout.Edge, xTol, yTol float64) TableFinder {
 	}
 }
 
-// ensureSupportedStrategies returns an error if either strategy is
-// "text" or "explicit" — those are deferred to Phase 1.3.D (v0.3.0).
-// Returning a clear ErrUnsupported keeps callers from silently getting
-// empty results when they ask for a strategy we don't implement yet.
+// ensureSupportedStrategies validates that both axes' strategies are
+// one of the four pdfplumber-defined values. As of v0.3.0 all four
+// strategies (lines, lines_strict, text, explicit) are implemented;
+// the function now exists only to reject unknown strategy strings.
 func ensureSupportedStrategies(s TableSettings) error {
 	for _, pair := range []struct {
 		axis     string
@@ -631,15 +631,29 @@ func ensureSupportedStrategies(s TableSettings) error {
 		{"horizontal", s.HorizontalStrategy},
 	} {
 		switch pair.strategy {
-		case StrategyLines, StrategyLinesStrict:
+		case StrategyLines, StrategyLinesStrict, StrategyText, StrategyExplicit:
 			// ok
-		case StrategyText:
-			return fmt.Errorf("%w: %s_strategy=%q (Phase 1.3.D)", ErrUnsupported, pair.axis, pair.strategy)
-		case StrategyExplicit:
-			return fmt.Errorf("%w: %s_strategy=%q (Phase 1.3.D)", ErrUnsupported, pair.axis, pair.strategy)
 		default:
 			return fmt.Errorf("%w: unknown %s_strategy %q", ErrUnsupported, pair.axis, pair.strategy)
 		}
 	}
 	return nil
 }
+
+// errExplicitNeedsTwo builds the error returned when the caller
+// selects the "explicit" strategy on an axis but supplies fewer than
+// two coordinates. pdfplumber raises ValueError in the same situation.
+func errExplicitNeedsTwo(axis string) error {
+	return fmt.Errorf("pdftable: %s_strategy=%q requires at least two coordinates in Explicit%sLines",
+		axis, StrategyExplicit, axisFieldName(axis))
+}
+
+// axisFieldName returns the field-name suffix for the axis ("Vertical"
+// or "Horizontal") so error messages reference the actual struct field
+// the caller would need to populate.
+func axisFieldName(axis string) string {
+	if axis == "vertical" {
+		return "Vertical"
+	}
+	return "Horizontal"
+}
diff --git a/finder_text.go b/finder_text.go
new file mode 100644
index 0000000..da1548d
--- /dev/null
+++ b/finder_text.go
@@ -0,0 +1,345 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+// finder_text.go implements the "text" and "explicit" edge-derivation
+// strategies that complement the "lines" / "lines_strict" strategies in
+// finder.go.
+//
+// The "text" strategy is a direct port of pdfplumber's
+// words_to_edges_v / words_to_edges_h (pdfplumber/table.py lines
+// 101-204). The pipeline:
+//
+//   - vertical: cluster words by X0 (left), X1 (right), and center
+//     position with tolerance=1 PDF point; pick clusters with at least
+//     MinWordsVertical members; deduplicate overlapping clusters by
+//     bbox-overlap; emit one vertical edge per cluster's X0, plus one
+//     trailing edge at the right-most X1.
+//   - horizontal: cluster words by their top Y (Y1 in user space) with
+//     tolerance=1; pick clusters with at least MinWordsHorizontal
+//     members; emit BOTH the top and bottom edges of each cluster so
+//     the last row of the table is captured.
+//
+// The "explicit" strategy is a direct port of pdfplumber's handling of
+// explicit_vertical_lines / explicit_horizontal_lines as standalone
+// edge lists (table.py lines 623-695). The caller passes float
+// coordinates; we promote each to a full-extent edge spanning the
+// page's bbox.
+//
+// Coordinate system: pdfplumber works in image space (Y growing down,
+// "top" = smaller Y). pdftable uses PDF user space (Y growing up). Two
+// translations matter:
+//
+//   - pdfplumber's "top" is our Y1 (the visually-upper edge of a word's
+//     bbox in user space).
+//   - pdfplumber's "bottom" is our Y0.
+//
+// The clustering and threshold logic is the same in both coordinate
+// systems; only the field names flip.
+
+import (
+	"log"
+	"math"
+	"sort"
+
+	"github.com/hallelx2/pdftable/internal/layout"
+)
+
+// wordEdgeTolerance is the per-axis tolerance used when clustering
+// words for the "text" strategy. pdfplumber hardcodes this to 1 PDF
+// point in words_to_edges_v / words_to_edges_h (the third argument to
+// cluster_objects); we mirror that.
+const wordEdgeTolerance = 1.0
+
+// wordsToEdgesV is the Go port of pdfplumber's words_to_edges_v
+// (table.py lines 144-204). Given a slice of words and a
+// MinWordsVertical threshold, infer vertical edges by clustering the
+// words' X0 (left), X1 (right), and centre positions, picking
+// clusters whose membership is >= threshold, dropping overlapping
+// clusters (the larger one wins), and emitting one edge at each
+// cluster's leftmost X plus a trailing edge at the rightmost X1.
+//
+// pdfplumber sorts the clusters by descending size before the overlap
+// dedupe so that, when two candidate column boundaries overlap, the
+// bigger one (the one supported by more words) wins.
+func wordsToEdgesV(words []Word, wordThreshold int) []layout.Edge {
+	if len(words) == 0 || wordThreshold <= 0 {
+		return nil
+	}
+
+	// Three candidate cluster sources: words sharing left edge (X0),
+	// right edge (X1), centre. Each contributes one cluster set; we
+	// concatenate all three, drop those below threshold, then dedupe
+	// by overlap (larger cluster wins).
+	byX0 := clusterObjects(words, func(w Word) float64 { return w.X0 }, wordEdgeTolerance, false)
+	byX1 := clusterObjects(words, func(w Word) float64 { return w.X1 }, wordEdgeTolerance, false)
+	byCenter := clusterObjects(words, func(w Word) float64 { return (w.X0 + w.X1) / 2 }, wordEdgeTolerance, false)
+
+	all := make([][]Word, 0, len(byX0)+len(byX1)+len(byCenter))
+	all = append(all, byX0...)
+	all = append(all, byX1...)
+	all = append(all, byCenter...)
+
+	// Sort by descending size so the dedupe pass keeps the largest
+	// cluster when two overlap.
+	sort.SliceStable(all, func(i, j int) bool {
+		return len(all[i]) > len(all[j])
+	})
+
+	// Filter below-threshold clusters.
+	large := make([][]Word, 0, len(all))
+	for _, c := range all {
+		if len(c) >= wordThreshold {
+			large = append(large, c)
+		}
+	}
+	if len(large) == 0 {
+		return nil
+	}
+
+	// Convert each surviving cluster to its bbox.
+	bboxes := make([]BBox, len(large))
+	for i, c := range large {
+		bboxes[i] = wordsBBox(c)
+	}
+
+	// Dedupe: keep clusters whose bbox doesn't overlap any already-
+	// kept one. pdfplumber's get_bbox_overlap rejects only disjoint
+	// or corner-touching boxes (an edge-sharing pair still overlaps).
+	// NOTE(review): confirm BBox.Intersect matches that behaviour.
+	condensed := make([]BBox, 0, len(bboxes))
+	for _, b := range bboxes {
+		overlap := false
+		for _, c := range condensed {
+			if _, ok := b.Intersect(c); ok {
+				overlap = true
+				break
+			}
+		}
+		if !overlap {
+			condensed = append(condensed, b)
+		}
+	}
+	if len(condensed) == 0 {
+		return nil
+	}
+
+	// Sort the surviving clusters left-to-right by X0.
+	sort.SliceStable(condensed, func(i, j int) bool {
+		return condensed[i].X0 < condensed[j].X0
+	})
+
+	// pdfplumber emits, for each cluster, an edge at its LEFT (X0);
+	// then one final trailing edge at the right edge of the right-most
+	// cluster (max X1). The Y span on each edge is the union of all
+	// participating clusters' Y range so that downstream merging /
+	// intersection sees consistent spans.
+	minY0 := math.Inf(1)
+	maxY1 := math.Inf(-1)
+	maxX1 := math.Inf(-1)
+	for _, b := range condensed {
+		if b.Y0 < minY0 {
+			minY0 = b.Y0
+		}
+		if b.Y1 > maxY1 {
+			maxY1 = b.Y1
+		}
+		if b.X1 > maxX1 {
+			maxX1 = b.X1
+		}
+	}
+
+	out := make([]layout.Edge, 0, len(condensed)+1)
+	for _, b := range condensed {
+		out = append(out, layout.Edge{
+			X0: b.X0, X1: b.X0,
+			Y0: minY0, Y1: maxY1,
+			Orientation: layout.Vertical,
+			Source:      layout.SourceText,
+		})
+	}
+	out = append(out, layout.Edge{
+		X0: maxX1, X1: maxX1,
+		Y0: minY0, Y1: maxY1,
+		Orientation: layout.Vertical,
+		Source:      layout.SourceText,
+	})
+	return out
+}
+
+// wordsToEdgesH is the Go port of pdfplumber's words_to_edges_h
+// (table.py lines 101-141). Given a slice of words and a
+// MinWordsHorizontal threshold, infer horizontal edges by clustering
+// the words by their visual TOP (Y1 in PDF user space, "top" in
+// pdfplumber's image-space dict) and emitting one edge per cluster at
+// its Y1, plus a second edge at the cluster's Y0 so the bottom of the
+// last row gets captured.
+//
+// pdfplumber notes (table.py lines 128-130): "For each detected row,
+// we also add the 'bottom' line. This will generate extra edges,
+// (some will be redundant with the next row 'top' line), but this
+// catches the last row of every table."
+func wordsToEdgesH(words []Word, wordThreshold int) []layout.Edge {
+	if len(words) == 0 || wordThreshold <= 0 {
+		return nil
+	}
+
+	// Cluster by visual top (Y1 in user space). pdfplumber uses
+	// "top" — which is image-space and so corresponds to user-space
+	// Y1 (the upper edge of the bbox).
+	byTop := clusterObjects(words, func(w Word) float64 { return w.Y1 }, wordEdgeTolerance, false)
+
+	large := make([][]Word, 0, len(byTop))
+	for _, c := range byTop {
+		if len(c) >= wordThreshold {
+			large = append(large, c)
+		}
+	}
+	if len(large) == 0 {
+		return nil
+	}
+
+	// Per-cluster bbox: bottom = min(Y0), top = max(Y1) — i.e. the
+	// union of every word's vertical extent in the cluster.
+	bboxes := make([]BBox, len(large))
+	for i, c := range large {
+		bboxes[i] = wordsBBox(c)
+	}
+
+	// Span the resulting edges from min(X0) to max(X1) across ALL
+	// clusters — like pdfplumber, every horizontal edge covers the
+	// full X range of the detected rows, not just its own row's words.
+	minX0 := math.Inf(1)
+	maxX1 := math.Inf(-1)
+	for _, b := range bboxes {
+		if b.X0 < minX0 {
+			minX0 = b.X0
+		}
+		if b.X1 > maxX1 {
+			maxX1 = b.X1
+		}
+	}
+
+	out := make([]layout.Edge, 0, 2*len(bboxes))
+	for _, b := range bboxes {
+		// Top edge (Y1 in user space, "top" in image space).
+		out = append(out, layout.Edge{
+			X0: minX0, X1: maxX1,
+			Y0: b.Y1, Y1: b.Y1,
+			Orientation: layout.Horizontal,
+			Source:      layout.SourceText,
+		})
+		// Bottom edge (Y0 in user space, "bottom" in image space).
+		out = append(out, layout.Edge{
+			X0: minX0, X1: maxX1,
+			Y0: b.Y0, Y1: b.Y0,
+			Orientation: layout.Horizontal,
+			Source:      layout.SourceText,
+		})
+	}
+	return out
+}
+
+// wordsBBox returns the union bbox of every word in cs. Used by the
+// text-strategy to derive a cluster's spatial extent.
+func wordsBBox(cs []Word) BBox {
+	if len(cs) == 0 {
+		return BBox{}
+	}
+	out := BBox{X0: cs[0].X0, Y0: cs[0].Y0, X1: cs[0].X1, Y1: cs[0].Y1}
+	for _, w := range cs[1:] {
+		if w.X0 < out.X0 {
+			out.X0 = w.X0
+		}
+		if w.Y0 < out.Y0 {
+			out.Y0 = w.Y0
+		}
+		if w.X1 > out.X1 {
+			out.X1 = w.X1
+		}
+		if w.Y1 > out.Y1 {
+			out.Y1 = w.Y1
+		}
+	}
+	return out
+}
+
+// explicitVerticalEdges promotes a slice of float X coordinates into
+// vertical edges that span the page's Y range. Non-finite entries (and
+// all entries when pageY1 <= pageY0) are dropped with a log warning so
+// a misconfigured caller gets feedback rather than silent empty results.
+//
+// pageY0 / pageY1 are the page's vertical bounds (typically 0 and
+// page.Height()) — pdfplumber's `page.bbox[1]` and `page.bbox[3]`.
+func explicitVerticalEdges(xs []float64, pageY0, pageY1 float64) []layout.Edge {
+	if len(xs) == 0 {
+		return nil
+	}
+	out := make([]layout.Edge, 0, len(xs))
+	for _, x := range xs {
+		if math.IsNaN(x) || math.IsInf(x, 0) {
+			log.Printf("pdftable: explicit vertical line %v ignored (non-finite)", x)
+			continue
+		}
+		if pageY1 <= pageY0 {
+			log.Printf("pdftable: explicit vertical line %v ignored (page height is zero)", x)
+			continue
+		}
+		out = append(out, layout.Edge{
+			X0: x, X1: x,
+			Y0: pageY0, Y1: pageY1,
+			Orientation: layout.Vertical,
+			Source:      layout.SourceExplicit,
+		})
+	}
+	return out
+}
+
+// explicitHorizontalEdges promotes a slice of float Y coordinates into
+// horizontal edges that span the page's X range. Same invalid-input
+// behaviour as explicitVerticalEdges.
+//
+// pageX0 / pageX1 are the page's horizontal bounds.
+func explicitHorizontalEdges(ys []float64, pageX0, pageX1 float64) []layout.Edge {
+	if len(ys) == 0 {
+		return nil
+	}
+	out := make([]layout.Edge, 0, len(ys))
+	for _, y := range ys {
+		if math.IsNaN(y) || math.IsInf(y, 0) {
+			log.Printf("pdftable: explicit horizontal line %v ignored (non-finite)", y)
+			continue
+		}
+		if pageX1 <= pageX0 {
+			log.Printf("pdftable: explicit horizontal line %v ignored (page width is zero)", y)
+			continue
+		}
+		out = append(out, layout.Edge{
+			X0: pageX0, X1: pageX1,
+			Y0: y, Y1: y,
+			Orientation: layout.Horizontal,
+			Source:      layout.SourceExplicit,
+		})
+	}
+	return out
+}
+
+// validateExplicitForStrategy reports an error if either axis uses
+// the "explicit" strategy but the caller supplied fewer than two
+// coordinates on that axis. pdfplumber raises ValueError in this
+// case (table.py lines 605-615). The check is a no-op for the
+// other three strategies.
+//
+// Coordinates aren't checked for finiteness here — that happens in
+// explicit*Edges above, where invalid entries are logged and skipped
+// individually, so fewer than two edges may survive on an axis.
+func validateExplicitForStrategy(s TableSettings) error {
+	if s.VerticalStrategy == StrategyExplicit && len(s.ExplicitVerticalLines) < 2 {
+		return errExplicitNeedsTwo("vertical")
+	}
+	if s.HorizontalStrategy == StrategyExplicit && len(s.ExplicitHorizontalLines) < 2 {
+		return errExplicitNeedsTwo("horizontal")
+	}
+	return nil
+}
diff --git a/golden_test.go b/golden_test.go
index d6e9dcc..bcf658f 100644
--- a/golden_test.go
+++ b/golden_test.go
@@ -72,10 +72,10 @@ func TestGoldenAgainstPdfplumber(t *testing.T) {
 		t.Fatalf("read golden dir: %v", err)
 	}
 
-	// Find every .expected.json (but NOT .tables.expected.json — the
-	// tables golden files have a different schema and are exercised
-	// by TestGoldenTablesAgainstPdfplumber below) and run a sub-test
-	// for each.
+	// Find every .expected.json (but NOT one of the table goldens —
+	// those have a different schema and are exercised by the
+	// strategy-specific TestGoldenTables* tests below) and run a
+	// sub-test for each.
 	for _, e := range entries {
 		if e.IsDir() {
 			continue
@@ -87,6 +87,9 @@ func TestGoldenAgainstPdfplumber(t *testing.T) {
 		if strings.HasSuffix(name, ".tables.expected.json") {
 			continue
 		}
+		if strings.HasSuffix(name, ".tables-text.expected.json") {
+			continue
+		}
 		stem := strings.TrimSuffix(name, ".expected.json")
 		t.Run(stem, func(t *testing.T) {
 			runGoldenCase(t, dir, stem)
@@ -144,15 +147,50 @@ func TestGoldenTablesAgainstPdfplumber(t *testing.T) {
 		}
 		stem := strings.TrimSuffix(name, ".tables.expected.json")
 		t.Run(stem, func(t *testing.T) {
-			runGoldenTablesCase(t, dir, stem)
+			runGoldenTablesCase(t, dir, stem, pdftable.DefaultTableSettings())
+		})
+	}
+}
+
+// TestGoldenTablesTextStrategyAgainstPdfplumber asserts pdftable's
+// "text" strategy output matches pdfplumber's
+// find_tables({"text", "text"}) on every .tables-text.expected.json
+// fixture in testdata/golden. The strategy-specific suffix lets us
+// pin the parity expectation per fixture independently of the
+// default-lines test above.
+func TestGoldenTablesTextStrategyAgainstPdfplumber(t *testing.T) {
+	dir := filepath.Join("testdata", "golden")
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("read golden dir: %v", err)
+	}
+	settings := pdftable.DefaultTableSettings()
+	settings.VerticalStrategy = pdftable.StrategyText
+	settings.HorizontalStrategy = pdftable.StrategyText
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		name := e.Name()
+		if !strings.HasSuffix(name, ".tables-text.expected.json") {
+			continue
+		}
+		stem := strings.TrimSuffix(name, ".tables-text.expected.json")
+		t.Run(stem, func(t *testing.T) {
+			runGoldenTablesCaseSuffix(t, dir, stem, ".tables-text.expected.json", settings)
 		})
 	}
 }
 
-func runGoldenTablesCase(t *testing.T, dir, stem string) {
+func runGoldenTablesCase(t *testing.T, dir, stem string, settings pdftable.TableSettings) {
+	t.Helper()
+	runGoldenTablesCaseSuffix(t, dir, stem, ".tables.expected.json", settings)
+}
+
+func runGoldenTablesCaseSuffix(t *testing.T, dir, stem, suffix string, settings pdftable.TableSettings) {
 	t.Helper()
 	pdfPath := filepath.Join(dir, stem+".pdf")
-	jsonPath := filepath.Join(dir, stem+".tables.expected.json")
+	jsonPath := filepath.Join(dir, stem+suffix)
 
 	data, err := os.ReadFile(jsonPath)
 	if err != nil {
@@ -174,7 +212,7 @@ func runGoldenTablesCase(t *testing.T, dir, stem string) {
 		if err != nil {
 			t.Fatalf("Page(%d): %v", expPage.Number, err)
 		}
-		gotTables, err := p.ExtractTables(pdftable.DefaultTableSettings())
+		gotTables, err := p.ExtractTables(settings)
 		if err != nil {
 			t.Fatalf("ExtractTables: %v", err)
 		}
diff --git a/internal/layout/lines.go b/internal/layout/lines.go
index 3a94e85..a688de5 100644
--- a/internal/layout/lines.go
+++ b/internal/layout/lines.go
@@ -56,6 +56,9 @@ const (
 	// SourceExplicit: an edge constructed from an
 	// ExplicitVerticalLines / ExplicitHorizontalLines setting.
 	SourceExplicit
+	// SourceText: an edge inferred from word alignment by the "text"
+	// strategy. words_to_edges_v / words_to_edges_h in pdfplumber.
+	SourceText
 )
 
 // Edge is one axis-aligned line segment carrying the data the table-
diff --git a/page.go b/page.go
index 48e0225..5b04c97 100644
--- a/page.go
+++ b/page.go
@@ -102,10 +102,11 @@ type Page interface {
 	// the intermediate stages (edges / intersections / raw cells)
 	// alongside the assembled per-table CellsGrid.
 	//
-	// v0.2.0 supports VerticalStrategy / HorizontalStrategy values
-	// of "lines" and "lines_strict". Passing "text" or "explicit"
-	// returns ErrUnsupported — those strategies land in Phase 1.3.D
-	// (v0.3.0).
+	// v0.3.0 supports all four pdfplumber strategies: "lines",
+	// "lines_strict", "text", and "explicit". Each axis (vertical,
+	// horizontal) selects its strategy independently, so mixed
+	// settings like vertical="text" + horizontal="lines" work as
+	// expected.
 	FindTables(settings TableSettings) ([]TableFinder, error)
 
 	// ExtractTables wraps FindTables and runs per-cell text
@@ -390,11 +391,13 @@ func charsJoinedText(chars []Char) string {
 //
 //  1. Walk the page once via Objects(), so we pay the content-stream
 //     parse cost a single time rather than once per primitive type.
-//  2. Convert every Line / Rect / Curve into one or more layout.Edge
-//     instances. Lines produce 0 or 1 edge; Rects produce 4; Curves
-//     produce one per axis-aligned segment.
-//  3. For lines_strict, drop SourceRect and SourceCurve edges before
-//     the prefilter.
+//  2. Compute per-axis base edges according to the requested strategy:
+//     - "lines"        → Lines + Rects + Curves (axis-aligned)
+//     - "lines_strict" → Lines only
+//     - "text"         → words clustered by x0/x1/centre (v) or top (h)
+//     - "explicit"     → empty (caller supplies the edges)
+//  3. Append the caller's ExplicitVerticalLines / ExplicitHorizontalLines
+//     on top of whichever base set was chosen.
 //  4. Apply the prefilter (drop edges shorter than
 //     EdgeMinLengthPrefilter — pdfplumber default 1 pt).
 //  5. Merge (snap onto cluster means, then join collinear edges
@@ -402,77 +405,69 @@ func charsJoinedText(chars []Char) string {
 //  6. Apply the post-merge length filter (drop edges shorter than
 //     EdgeMinLength — pdfplumber default 3 pt).
 //
-// The returned slice is the input both the vertical and horizontal
-// stages share — pdfplumber's TableFinder.get_edges takes the union
-// of vertical-strategy edges and horizontal-strategy edges and runs
-// the merge across both at once. We do the same, but with one
-// wrinkle: if the two strategies differ ("lines" + "lines_strict"),
-// we apply the source-filter PER ORIENTATION so a strict horizontal
-// strategy can still benefit from rect-derived vertical edges and
-// vice versa. The pdfplumber implementation handles this implicitly
-// because its filter_edges receives the requested orientation as an
-// argument; our code mirrors that branch explicitly.
+// Each axis uses its own strategy, so "text" vertical + "lines"
+// horizontal (or any of the 16 combinations) works exactly as in
+// pdfplumber.
 func (p *page) findTableEdges(s TableSettings) ([]layout.Edge, error) {
-	objs, err := p.Objects()
-	if err != nil {
-		return nil, err
+	// Resolve text strategy's input once (and only when needed) — both
+	// axes can ask for it, and Words is an expensive call.
+	var words []Word
+	needWords := s.VerticalStrategy == StrategyText || s.HorizontalStrategy == StrategyText
+	if needWords {
+		opts := DefaultWordOpts()
+		opts.XTolerance = s.TextTolerance
+		opts.YTolerance = s.TextTolerance
+		opts.KeepBlankChars = s.KeepBlankChars
+		w, err := p.Words(opts)
+		if err != nil {
+			return nil, err
+		}
+		words = w
 	}
 
-	tol := 0.1 // near-axis-aligned slack for FromLine/FromCurve
-	rawEdges := make([]layout.Edge, 0, len(objs.Lines)+4*len(objs.Rects)+2*len(objs.Curves))
-
-	for _, l := range objs.Lines {
-		if e, ok := layout.FromLine(layout.LineSegment{
-			X0: l.X0, Y0: l.Y0, X1: l.X1, Y1: l.Y1, Width: l.Width,
-		}, tol); ok {
-			rawEdges = append(rawEdges, e)
+	// Resolve drawn-primitive edges once if either axis needs them
+	// (lines or lines_strict).
+	var lineLikeEdges []layout.Edge
+	needPrimitives := isLineLike(s.VerticalStrategy) || isLineLike(s.HorizontalStrategy)
+	if needPrimitives {
+		objs, err := p.Objects()
+		if err != nil {
+			return nil, err
+		}
+		tol := 0.1 // near-axis-aligned slack for FromLine/FromCurve
+		lineLikeEdges = make([]layout.Edge, 0, len(objs.Lines)+4*len(objs.Rects)+2*len(objs.Curves))
+		for _, l := range objs.Lines {
+			if e, ok := layout.FromLine(layout.LineSegment{
+				X0: l.X0, Y0: l.Y0, X1: l.X1, Y1: l.Y1, Width: l.Width,
+			}, tol); ok {
+				lineLikeEdges = append(lineLikeEdges, e)
+			}
+		}
+		for _, r := range objs.Rects {
+			lineLikeEdges = append(lineLikeEdges, layout.FromRect(layout.RectSegment{
+				X0: r.X0, Y0: r.Y0, X1: r.X1, Y1: r.Y1, Width: r.Width,
+			})...)
+		}
+		for _, c := range objs.Curves {
+			lineLikeEdges = append(lineLikeEdges, layout.FromCurve(layout.CurveSegment{
+				Points: c.Points, Width: c.Width,
+			}, tol)...)
 		}
 	}
-	for _, r := range objs.Rects {
-		rawEdges = append(rawEdges, layout.FromRect(layout.RectSegment{
-			X0: r.X0, Y0: r.Y0, X1: r.X1, Y1: r.Y1, Width: r.Width,
-		})...)
-	}
-	for _, c := range objs.Curves {
-		rawEdges = append(rawEdges, layout.FromCurve(layout.CurveSegment{
-			Points: c.Points, Width: c.Width,
-		}, tol)...)
-	}
-
-	// Per-orientation source filter: lines_strict on an axis drops
-	// non-line sources on that axis. We split into v/h, filter each
-	// according to its own strategy, then recombine before the
-	// length filter and merge.
-	vEdges := layout.FilterEdgesByOrientation(rawEdges, layout.Vertical)
-	hEdges := layout.FilterEdgesByOrientation(rawEdges, layout.Horizontal)
-
-	if s.VerticalStrategy == StrategyLinesStrict {
-		vEdges = layout.FilterEdgesBySource(vEdges, layout.SourceLine, layout.SourceExplicit)
-	}
-	if s.HorizontalStrategy == StrategyLinesStrict {
-		hEdges = layout.FilterEdgesBySource(hEdges, layout.SourceLine, layout.SourceExplicit)
-	}
-
-	// Explicit overrides are added on top of the derived edges.
-	// pdfplumber accepts these even with the lines / lines_strict
-	// strategies (the "explicit" strategy itself replaces the
-	// derived edges; that strategy is deferred to v0.3.0).
-	for _, x := range s.ExplicitVerticalLines {
-		vEdges = append(vEdges, layout.Edge{
-			X0: x, X1: x,
-			Y0: 0, Y1: p.Height(),
-			Orientation: layout.Vertical,
-			Source:      layout.SourceExplicit,
-		})
-	}
-	for _, y := range s.ExplicitHorizontalLines {
-		hEdges = append(hEdges, layout.Edge{
-			X0: 0, X1: p.Width(),
-			Y0: y, Y1: y,
-			Orientation: layout.Horizontal,
-			Source:      layout.SourceExplicit,
-		})
-	}
+
+	pageWidth := p.Width()
+	pageHeight := p.Height()
+
+	// Per-axis base edge derivation.
+	vEdges := p.baseEdges(s.VerticalStrategy, layout.Vertical, lineLikeEdges, words, s)
+	hEdges := p.baseEdges(s.HorizontalStrategy, layout.Horizontal, lineLikeEdges, words, s)
+
+	// Explicit overrides are added on top of whichever base set was
+	// chosen. With StrategyExplicit the base set is empty so the
+	// explicit edges are the only source; with the other strategies
+	// they're additive (helpful when a column boundary isn't drawn).
+	vEdges = append(vEdges, explicitVerticalEdges(s.ExplicitVerticalLines, 0, pageHeight)...)
+	hEdges = append(hEdges, explicitHorizontalEdges(s.ExplicitHorizontalLines, 0, pageWidth)...)
 
 	combined := make([]layout.Edge, 0, len(vEdges)+len(hEdges))
 	combined = append(combined, vEdges...)
@@ -492,9 +487,49 @@ func (p *page) findTableEdges(s TableSettings) ([]layout.Edge, error) {
 	return merged, nil
 }
 
+// isLineLike reports whether the strategy derives its edges from
+// drawn primitives (Lines / Rects / Curves), i.e. whether
+// findTableEdges needs to call Objects(). Text and explicit
+// strategies don't.
+func isLineLike(s TableStrategy) bool {
+	return s == StrategyLines || s == StrategyLinesStrict
+}
+
+// baseEdges returns the per-axis edges produced by the named strategy.
+// lineLikeEdges is the unfiltered slice of edges derived from the
+// page's drawn primitives (Lines + Rects + Curves); it's consulted
+// only when the strategy is "lines" or "lines_strict". words is the
+// page's extracted text runs; consulted only for the "text" strategy.
+//
+// "explicit" returns nil here — the caller's explicit slice is
+// concatenated separately in findTableEdges.
+func (p *page) baseEdges(strategy TableStrategy, orientation layout.Orientation, lineLikeEdges []layout.Edge, words []Word, s TableSettings) []layout.Edge {
+	switch strategy {
+	case StrategyLines:
+		out := layout.FilterEdgesByOrientation(lineLikeEdges, orientation)
+		return out
+	case StrategyLinesStrict:
+		out := layout.FilterEdgesByOrientation(lineLikeEdges, orientation)
+		return layout.FilterEdgesBySource(out, layout.SourceLine)
+	case StrategyText:
+		if orientation == layout.Vertical {
+			return wordsToEdgesV(words, s.MinWordsVertical)
+		}
+		return wordsToEdgesH(words, s.MinWordsHorizontal)
+	case StrategyExplicit:
+		// Caller-supplied edges are concatenated elsewhere; the base
+		// set for the "explicit" strategy is empty.
+		return nil
+	default:
+		return nil
+	}
+}
+
 // FindTables runs the geometry-only pipeline (edges → intersections
 // → cells → tables) and returns one TableFinder per detected table
-// group. Strategies "text" and "explicit" return ErrUnsupported.
+// group. All four pdfplumber strategies (lines, lines_strict, text,
+// explicit) are supported; the two axes can use different strategies
+// independently.
 //
 // The returned slice is in visual top-to-bottom-left-to-right order
 // (sorted by the topmost-leftmost cell of each table).
@@ -503,6 +538,9 @@ func (p *page) FindTables(settings TableSettings) ([]TableFinder, error) {
 	if err := ensureSupportedStrategies(s); err != nil {
 		return nil, err
 	}
+	if err := validateExplicitForStrategy(s); err != nil {
+		return nil, err
+	}
 
 	edges, err := p.findTableEdges(s)
 	if err != nil {
@@ -548,6 +586,9 @@ func (p *page) ExtractTables(settings TableSettings) ([]*Table, error) {
 	if err := ensureSupportedStrategies(s); err != nil {
 		return nil, err
 	}
+	if err := validateExplicitForStrategy(s); err != nil {
+		return nil, err
+	}
 
 	finders, err := p.FindTables(s)
 	if err != nil {
diff --git a/scripts/capture_pdfplumber_text_golden.py b/scripts/capture_pdfplumber_text_golden.py
new file mode 100644
index 0000000..02e402d
--- /dev/null
+++ b/scripts/capture_pdfplumber_text_golden.py
@@ -0,0 +1,84 @@
+"""Generate text-strategy golden files for pdftable's parity tests.
+
+Run from the repo root after copying any new borderless / text-strategy
+fixture PDFs into testdata/golden/:
+
+    pip install pdfplumber
+    python scripts/capture_pdfplumber_text_golden.py
+
+The script scans testdata/golden/ (or the directory passed as the
+first argument) for *.tables-text.target markers ("this fixture is
+in the text-strategy parity set"), opens the sibling <name>.pdf and
+writes <name>.tables-text.expected.json — pdfplumber's find_tables
+output under {vertical_strategy: 'text', horizontal_strategy: 'text'}
+with min_words_vertical / min_words_horizontal pinned to 3 / 1.
+
+We separate the text-strategy goldens from the line-strategy ones
+(*.tables.expected.json) because the same fixture may produce
+different tables depending on the strategy, and we want to assert
+parity per strategy.
+
+Re-run to refresh after upgrading pdfplumber or changing the
+target list.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+import pdfplumber
+
+DIR = os.path.join("testdata", "golden")
+
+
+def main() -> int:
+    target = DIR if len(sys.argv) < 2 else sys.argv[1]
+    targets = sorted(
+        f[: -len(".tables-text.target")]
+        for f in os.listdir(target)
+        if f.endswith(".tables-text.target")
+    )
+    if not targets:
+        print(
+            f"no .tables-text.target files in {target}. "
+            "Create one alongside each fixture you want in the "
+            "text-strategy parity set (the file can be empty; its "
+            "presence is the signal).",
+            file=sys.stderr,
+        )
+        return 1
+    for name in targets:
+        pdf_path = os.path.join(target, f"{name}.pdf")
+        if not os.path.exists(pdf_path):
+            print(f"missing {pdf_path}", file=sys.stderr)
+            continue
+        out = {"name": name, "pages": []}
+        with pdfplumber.open(pdf_path) as pdf:
+            for p in pdf.pages:
+                tbls = p.find_tables(
+                    {
+                        "vertical_strategy": "text",
+                        "horizontal_strategy": "text",
+                        "min_words_vertical": 3,
+                        "min_words_horizontal": 1,
+                    }
+                )
+                page_obj = {
+                    "number": p.page_number,
+                    "width": p.width,
+                    "height": p.height,
+                    "tables": [t.extract() for t in tbls],
+                }
+                out["pages"].append(page_obj)
+        outpath = os.path.join(target, f"{name}.tables-text.expected.json")
+        with open(outpath, "w", encoding="utf-8") as f:
+            json.dump(out, f, ensure_ascii=False, indent=2)
+        ntables = sum(len(pp["tables"]) for pp in out["pages"])
+        print(f"wrote {outpath}: {ntables} tables across all pages")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/gen_table_fixture.go b/scripts/gen_table_fixture.go
index 042e7f6..1dabb3b 100644
--- a/scripts/gen_table_fixture.go
+++ b/scripts/gen_table_fixture.go
@@ -40,6 +40,10 @@ func main() {
 			path: filepath.Join("testdata", "table-2x3-ruled.pdf"),
 			data: testdata.TableRuled(),
 		},
+		{
+			path: filepath.Join("testdata", "golden", "table-3x4-borderless.pdf"),
+			data: testdata.TableBorderless(),
+		},
 	}
 	for _, o := range outputs {
 		if err := os.MkdirAll(filepath.Dir(o.path), 0o755); err != nil {
diff --git a/table.go b/table.go
index 359a915..e33f506 100644
--- a/table.go
+++ b/table.go
@@ -22,9 +22,8 @@ package pdftable
 //     describing "below" / "right" change their sign.
 
 // TableStrategy is the enum of edge-derivation strategies. Each axis
-// (vertical, horizontal) picks one. v0.2.0 implements "lines" and
-// "lines_strict"; "text" and "explicit" are reserved for the next
-// release (Phase 1.3.D) and return ErrUnsupported if requested.
+// (vertical, horizontal) picks one independently. All four pdfplumber
+// strategies are implemented as of v0.3.0.
 type TableStrategy string
 
 const (
@@ -41,10 +40,23 @@ const (
 	// boundaries.
 	StrategyLinesStrict TableStrategy = "lines_strict"
 
-	// StrategyText (Phase 1.3.D) infers edges from word alignment.
+	// StrategyText infers edges from word alignment. Vertical edges
+	// come from clusters of words sharing X0 / X1 / centre positions;
+	// horizontal edges from clusters sharing visual top. Best for
+	// borderless tables — bank statements, narrative tables in 10-K
+	// filings, scanned-then-OCR'd content — where the columns and
+	// rows are conveyed by whitespace alignment rather than rules.
+	// Tunable via MinWordsVertical (default 3) and
+	// MinWordsHorizontal (default 1).
 	StrategyText TableStrategy = "text"
 
-	// StrategyExplicit (Phase 1.3.D) uses caller-supplied lines.
+	// StrategyExplicit uses caller-supplied coordinates from
+	// ExplicitVerticalLines / ExplicitHorizontalLines as the only
+	// source of edges on that axis. Useful when the table boundaries
+	// are known from an external source (layout analysis, manual
+	// annotation) and you want to bypass edge detection entirely.
+	// The "explicit" strategy on an axis requires at least two
+	// coordinates on that axis; fewer than two produces an error.
 	StrategyExplicit TableStrategy = "explicit"
 )
 
@@ -96,10 +108,13 @@ type TableSettings struct {
 	TextTolerance float64
 
 	// MinWordsVertical / MinWordsHorizontal control the "text"
-	// strategy thresholds (Phase 1.3.D). They have no effect when
-	// both strategies are "lines" / "lines_strict" — kept on this
-	// struct so callers don't have to switch types when migrating to
-	// the text strategy later.
+	// strategy thresholds. A candidate column-boundary cluster must
+	// contain at least MinWordsVertical words sharing X0 / X1 /
+	// centre alignment to be promoted to a vertical edge; row
+	// boundaries need MinWordsHorizontal words sharing a top edge.
+	// The defaults (3 / 1) mirror the ones in pdfplumber's
+	// table.py:11-12. Each field is ignored when the strategy on its
+	// axis is anything other than "text".
 	MinWordsVertical   int
 	MinWordsHorizontal int
 
@@ -108,15 +123,18 @@ type TableSettings struct {
 	KeepBlankChars bool
 
 	// ExplicitVerticalLines / ExplicitHorizontalLines hold caller-
-	// supplied edge positions. With StrategyLines or
-	// StrategyLinesStrict they are ADDED to the derived edges; with
-	// StrategyExplicit they ARE the edges. In v0.2.0 the explicit
-	// strategy is not yet implemented; these slices have effect only
-	// when one of the lines strategies is in use and you want extra
-	// hand-placed rules (e.g. when your column boundary isn't drawn).
+	// supplied edge positions. With StrategyLines, StrategyLinesStrict,
+	// or StrategyText they are ADDED to the derived edges; with
+	// StrategyExplicit they ARE the only source of edges on that axis.
+	// Useful when a column or row boundary is invisible in the PDF but
+	// known from an external source.
 	//
 	// Values are X coordinates for vertical lines, Y coordinates for
-	// horizontal lines, both in PDF user-space points.
+	// horizontal lines, both in PDF user-space points. Non-finite
+	// values (NaN, Inf) are dropped with a log warning. When
+	// StrategyExplicit is selected on an axis, at least two
+	// coordinates must be supplied on that axis — fewer than two
+	// returns an error.
 	ExplicitVerticalLines   []float64
 	ExplicitHorizontalLines []float64
 }
diff --git a/table_test.go b/table_test.go
index 09eb4e7..87e6169 100644
--- a/table_test.go
+++ b/table_test.go
@@ -261,18 +261,18 @@ func TestRunTableFinder_2x3Grid(t *testing.T) {
 	}
 }
 
-// TestEnsureSupportedStrategies_RejectsTextAndExplicit asserts that
-// the v0.3.0 strategies return ErrUnsupported rather than silently
-// running an empty pipeline.
-func TestEnsureSupportedStrategies_RejectsTextAndExplicit(t *testing.T) {
+// TestEnsureSupportedStrategies_RejectsUnknown asserts that an
+// unrecognised strategy string returns ErrUnsupported rather than
+// silently running an empty pipeline. All four pdfplumber strategies
+// (lines / lines_strict / text / explicit) are now implemented, so
+// this test only exercises the unknown-string path.
+func TestEnsureSupportedStrategies_RejectsUnknown(t *testing.T) {
 	cases := []struct {
 		name string
 		s    TableSettings
 	}{
-		{"text/lines", TableSettings{VerticalStrategy: StrategyText, HorizontalStrategy: StrategyLines}},
-		{"lines/text", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyText}},
-		{"explicit/lines", TableSettings{VerticalStrategy: StrategyExplicit, HorizontalStrategy: StrategyLines}},
-		{"lines/explicit", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyExplicit}},
+		{"unknown_v", TableSettings{VerticalStrategy: "blah", HorizontalStrategy: StrategyLines}},
+		{"unknown_h", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: "blah"}},
 	}
 	for _, c := range cases {
 		t.Run(c.name, func(t *testing.T) {
@@ -287,24 +287,20 @@ func TestEnsureSupportedStrategies_RejectsTextAndExplicit(t *testing.T) {
 	}
 }
 
-// TestEnsureSupportedStrategies_AcceptsLines asserts that both lines
-// strategies pass validation.
-func TestEnsureSupportedStrategies_AcceptsLines(t *testing.T) {
-	cases := []struct {
-		name string
-		s    TableSettings
-	}{
-		{"lines/lines", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyLines}},
-		{"strict/lines", TableSettings{VerticalStrategy: StrategyLinesStrict, HorizontalStrategy: StrategyLines}},
-		{"lines/strict", TableSettings{VerticalStrategy: StrategyLines, HorizontalStrategy: StrategyLinesStrict}},
-		{"strict/strict", TableSettings{VerticalStrategy: StrategyLinesStrict, HorizontalStrategy: StrategyLinesStrict}},
-	}
-	for _, c := range cases {
-		t.Run(c.name, func(t *testing.T) {
-			if err := ensureSupportedStrategies(c.s.applyDefaults()); err != nil {
-				t.Errorf("got %v, want nil", err)
-			}
-		})
+// TestEnsureSupportedStrategies_AcceptsAllFour asserts that all four
+// pdfplumber strategies pass validation, in every paired combination.
+func TestEnsureSupportedStrategies_AcceptsAllFour(t *testing.T) {
+	strategies := []TableStrategy{StrategyLines, StrategyLinesStrict, StrategyText, StrategyExplicit}
+	for _, v := range strategies {
+		for _, h := range strategies {
+			name := string(v) + "/" + string(h)
+			t.Run(name, func(t *testing.T) {
+				s := TableSettings{VerticalStrategy: v, HorizontalStrategy: h}.applyDefaults()
+				if err := ensureSupportedStrategies(s); err != nil {
+					t.Errorf("got %v, want nil", err)
+				}
+			})
+		}
 	}
 }
 
@@ -397,10 +393,11 @@ func TestExtractTables_RuledFixture(t *testing.T) {
 	}
 }
 
-// TestExtractTables_UnsupportedStrategyReturnsErrUnsupported asserts
-// the public API surfaces ErrUnsupported when callers request "text"
-// or "explicit" strategies.
-func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) {
+// TestExtractTables_UnknownStrategyReturnsErrUnsupported asserts the
+// public API surfaces ErrUnsupported when callers pass an unrecognised
+// strategy string. All four standard strategies are implemented as of
+// v0.3.0; this guard catches typos.
+func TestExtractTables_UnknownStrategyReturnsErrUnsupported(t *testing.T) {
 	doc, err := OpenBytes(testdata.TableRuled())
 	if err != nil {
 		t.Fatalf("OpenBytes: %v", err)
@@ -409,7 +406,7 @@ func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) {
 	p, _ := doc.Page(1)
 
 	settings := DefaultTableSettings()
-	settings.VerticalStrategy = StrategyText
+	settings.VerticalStrategy = "not-a-strategy"
 	_, err = p.ExtractTables(settings)
 	if err == nil {
 		t.Fatal("got nil, want ErrUnsupported")
@@ -417,12 +414,34 @@ func TestExtractTables_UnsupportedStrategyReturnsErrUnsupported(t *testing.T) {
 	if !errIs(err, ErrUnsupported) {
 		t.Errorf("got %v, want ErrUnsupported", err)
 	}
-	// The error should mention what was unsupported and the phase.
-	if !strings.Contains(err.Error(), "text") {
+	if !strings.Contains(err.Error(), "not-a-strategy") {
 		t.Errorf("error %q should name the strategy", err.Error())
 	}
 }
 
+// TestExtractTables_ExplicitWithoutCoordinatesReturnsError asserts
+// that StrategyExplicit on an axis with fewer than two coordinates
+// returns a clear validation error (matching pdfplumber's behaviour).
+func TestExtractTables_ExplicitWithoutCoordinatesReturnsError(t *testing.T) {
+	doc, err := OpenBytes(testdata.TableRuled())
+	if err != nil {
+		t.Fatalf("OpenBytes: %v", err)
+	}
+	defer doc.Close()
+	p, _ := doc.Page(1)
+
+	settings := DefaultTableSettings()
+	settings.VerticalStrategy = StrategyExplicit
+	settings.ExplicitVerticalLines = []float64{100}
+	_, err = p.ExtractTables(settings)
+	if err == nil {
+		t.Fatal("got nil, want validation error")
+	}
+	if !strings.Contains(err.Error(), "two") {
+		t.Errorf("error %q should mention the two-coordinate minimum", err.Error())
+	}
+}
+
 // TestFindTables_NoEdgesReturnsEmpty asserts that a page with no
 // edges (e.g. a text-only page) returns an empty slice, not an
 // error.
@@ -441,3 +460,332 @@ func TestFindTables_NoEdgesReturnsEmpty(t *testing.T) {
 		t.Errorf("got %d finders, want 0 (text-only page)", len(finders))
 	}
 }
+
+// makeWord builds a Word at the given bbox with the given text.
+// Helper for the text-strategy unit tests which feed hand-crafted
+// Word slices directly into wordsToEdgesV / wordsToEdgesH.
+func makeWord(text string, x0, y0, x1, y1 float64) Word {
+	return Word{
+		Text: text,
+		X0:   x0, Y0: y0, X1: x1, Y1: y1,
+		Upright:   true,
+		Direction: "ltr",
+	}
+}
+
+// TestWordsToEdgesV_ThreeColumnAlignment exercises the vertical "text"
+// strategy with three columns of three words each, all left-aligned
+// at X = 100, 200, 300.  The expected output is four vertical edges:
+// three at the columns' X0 plus one trailing at the rightmost X1.
+func TestWordsToEdgesV_ThreeColumnAlignment(t *testing.T) {
+	words := []Word{
+		// Row 1: y near 700
+		makeWord("AAA", 100, 700, 130, 710),
+		makeWord("BBB", 200, 700, 230, 710),
+		makeWord("CCC", 300, 700, 330, 710),
+		// Row 2: y near 685
+		makeWord("DDD", 100, 685, 130, 695),
+		makeWord("EEE", 200, 685, 230, 695),
+		makeWord("FFF", 300, 685, 330, 695),
+		// Row 3: y near 670
+		makeWord("GGG", 100, 670, 130, 680),
+		makeWord("HHH", 200, 670, 230, 680),
+		makeWord("III", 300, 670, 330, 680),
+	}
+	edges := wordsToEdgesV(words, 3)
+	if len(edges) != 4 {
+		t.Fatalf("got %d edges, want 4 (3 columns + trailing)", len(edges))
+	}
+	xs := make(map[float64]struct{}, 4)
+	for _, e := range edges {
+		if e.Orientation != layout.Vertical {
+			t.Errorf("edge %+v: not vertical", e)
+		}
+		if e.Source != layout.SourceText {
+			t.Errorf("edge %+v: source %v, want SourceText", e, e.Source)
+		}
+		xs[e.X0] = struct{}{}
+	}
+	for _, want := range []float64{100, 200, 300, 330} {
+		if _, ok := xs[want]; !ok {
+			t.Errorf("missing vertical edge at X=%v; got %v", want, xs)
+		}
+	}
+}
+
+// TestWordsToEdgesV_BelowThresholdDropsCluster asserts that a column
+// candidate with fewer than MinWordsVertical words doesn't survive
+// the threshold filter.
+func TestWordsToEdgesV_BelowThresholdDropsCluster(t *testing.T) {
+	words := []Word{
+		// Column at X=100 has only 2 words; threshold of 3 should
+		// drop it.
+		makeWord("AAA", 100, 700, 130, 710),
+		makeWord("DDD", 100, 685, 130, 695),
+		// Column at X=200 has 3 words.
+		makeWord("BBB", 200, 700, 230, 710),
+		makeWord("EEE", 200, 685, 230, 695),
+		makeWord("HHH", 200, 670, 230, 680),
+	}
+	edges := wordsToEdgesV(words, 3)
+	// Expected: 1 column boundary (X=200) + 1 trailing (X=230) = 2 edges.
+	if len(edges) != 2 {
+		t.Fatalf("got %d edges, want 2", len(edges))
+	}
+}
+
+// TestWordsToEdgesH_DetectsRows asserts that horizontal clusters of
+// words sharing a top-Y produce one top + one bottom edge per row.
+func TestWordsToEdgesH_DetectsRows(t *testing.T) {
+	words := []Word{
+		// Row 1: top at Y=710
+		makeWord("AAA", 100, 700, 130, 710),
+		makeWord("BBB", 200, 700, 230, 710),
+		makeWord("CCC", 300, 700, 330, 710),
+		// Row 2: top at Y=695
+		makeWord("DDD", 100, 685, 130, 695),
+		makeWord("EEE", 200, 685, 230, 695),
+		makeWord("FFF", 300, 685, 330, 695),
+	}
+	// Threshold 1 → every cluster counts. Two rows × 2 edges/row = 4.
+	edges := wordsToEdgesH(words, 1)
+	if len(edges) != 4 {
+		t.Fatalf("got %d edges, want 4 (2 rows × top+bottom)", len(edges))
+	}
+	for _, e := range edges {
+		if e.Orientation != layout.Horizontal {
+			t.Errorf("edge %+v: not horizontal", e)
+		}
+		if e.Source != layout.SourceText {
+			t.Errorf("edge %+v: source %v, want SourceText", e, e.Source)
+		}
+	}
+	// Top + bottom for each row should be present: row 1 (700, 710)
+	// and row 2 (685, 695).
+	ys := make(map[float64]int, 4)
+	for _, e := range edges {
+		ys[e.Y0]++
+	}
+	for _, want := range []float64{700, 710, 685, 695} {
+		if ys[want] == 0 {
+			t.Errorf("missing horizontal edge at Y=%v", want)
+		}
+	}
+}
+
+// TestWordsToEdges_EmptyInputs asserts the early-return paths.
+func TestWordsToEdges_EmptyInputs(t *testing.T) {
+	if got := wordsToEdgesV(nil, 3); got != nil {
+		t.Errorf("nil words: got %v, want nil", got)
+	}
+	if got := wordsToEdgesH(nil, 1); got != nil {
+		t.Errorf("nil words: got %v, want nil", got)
+	}
+	if got := wordsToEdgesV([]Word{makeWord("A", 0, 0, 10, 10)}, 0); got != nil {
+		t.Errorf("threshold 0: got %v, want nil", got)
+	}
+}
+
+// TestExplicitVerticalEdges_PromotesAndFiltersInvalid asserts that
+// each finite X is promoted to a full-height vertical edge tagged
+// SourceExplicit, and that non-finite values (here a NaN) are dropped.
+func TestExplicitVerticalEdges_PromotesAndFiltersInvalid(t *testing.T) {
+	xs := []float64{100, 200, nanForTest(), 300}
+	edges := explicitVerticalEdges(xs, 0, 800)
+	if len(edges) != 3 {
+		t.Fatalf("got %d edges, want 3 (NaN dropped)", len(edges))
+	}
+	for _, e := range edges {
+		if e.Orientation != layout.Vertical {
+			t.Errorf("edge %+v: not vertical", e)
+		}
+		if e.Source != layout.SourceExplicit {
+			t.Errorf("edge %+v: source %v, want SourceExplicit", e, e.Source)
+		}
+		if e.Y0 != 0 || e.Y1 != 800 {
+			t.Errorf("edge %+v: Y span got (%v,%v), want (0,800)", e, e.Y0, e.Y1)
+		}
+	}
+}
+
+// TestExplicitHorizontalEdges_PromotesAndFiltersInvalid is the
+// horizontal counterpart: finite Ys are promoted, the NaN is dropped.
+func TestExplicitHorizontalEdges_PromotesAndFiltersInvalid(t *testing.T) {
+	ys := []float64{100, 200, nanForTest(), 300}
+	edges := explicitHorizontalEdges(ys, 0, 600)
+	if len(edges) != 3 {
+		t.Fatalf("got %d edges, want 3 (NaN dropped)", len(edges))
+	}
+	for _, e := range edges {
+		if e.Orientation != layout.Horizontal {
+			t.Errorf("edge %+v: not horizontal", e)
+		}
+		if e.X0 != 0 || e.X1 != 600 {
+			t.Errorf("edge %+v: X span got (%v,%v), want (0,600)", e, e.X0, e.X1)
+		}
+	}
+}
+
+// TestValidateExplicitForStrategy_RequiresTwoCoords asserts the
+// pre-flight check rejects an explicit strategy with fewer than two
+// coordinates on the chosen axis. pdfplumber raises ValueError; we
+// surface a regular error (callers don't typically catch via
+// errors.Is here).
+func TestValidateExplicitForStrategy_RequiresTwoCoords(t *testing.T) {
+	cases := []struct {
+		name string
+		s    TableSettings
+		want bool
+	}{
+		{"v_explicit_zero", TableSettings{VerticalStrategy: StrategyExplicit}, true},
+		{"v_explicit_one", TableSettings{VerticalStrategy: StrategyExplicit, ExplicitVerticalLines: []float64{1}}, true},
+		{"v_explicit_two_ok", TableSettings{VerticalStrategy: StrategyExplicit, ExplicitVerticalLines: []float64{1, 2}}, false},
+		{"h_explicit_one", TableSettings{HorizontalStrategy: StrategyExplicit, ExplicitHorizontalLines: []float64{1}}, true},
+		{"lines_no_check", TableSettings{VerticalStrategy: StrategyLines}, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			err := validateExplicitForStrategy(c.s.applyDefaults())
+			if got := err != nil; got != c.want {
+				t.Errorf("got err=%v, want error: %v", err, c.want)
+			}
+		})
+	}
+}
+
+// TestExtractTables_BorderlessTextStrategy asserts the public API
+// runs the text strategy end-to-end on the borderless fixture and
+// recovers the expected row × column grid.
+//
+// Fixture: testdata.TableBorderless() — 3 columns ("Item",
+// "Quantity", "Price") and 3 rows of body data, no rules drawn.
+func TestExtractTables_BorderlessTextStrategy(t *testing.T) {
+	doc, err := OpenBytes(testdata.TableBorderless())
+	if err != nil {
+		t.Fatalf("OpenBytes: %v", err)
+	}
+	defer doc.Close()
+	p, err := doc.Page(1)
+	if err != nil {
+		t.Fatalf("Page(1): %v", err)
+	}
+
+	settings := DefaultTableSettings()
+	settings.VerticalStrategy = StrategyText
+	settings.HorizontalStrategy = StrategyText
+
+	tables, err := p.ExtractTables(settings)
+	if err != nil {
+		t.Fatalf("ExtractTables: %v", err)
+	}
+	if len(tables) == 0 {
+		t.Fatalf("got 0 tables, want >= 1")
+	}
+	tbl := tables[0]
+	// We constructed the fixture with 4 rows (1 header + 3 body) and
+	// 3 columns. The text strategy emits a top and a bottom edge per
+	// row, so blank gap rows can appear between the text rows (the
+	// pdfplumber golden for this fixture has 7 rows); assert only at
+	// least 3 rows and at least 3 columns.
+	if len(tbl.Rows) < 3 {
+		t.Errorf("rows: got %d, want >= 3", len(tbl.Rows))
+	}
+	if len(tbl.Rows) > 0 && len(tbl.Rows[0]) < 3 {
+		t.Errorf("cols: got %d, want >= 3", len(tbl.Rows[0]))
+	}
+	// Spot-check that the body data is present somewhere in the
+	// extracted text (the algorithm may place it in any row/col
+	// depending on edge merging; the parity test below pins the
+	// exact layout).
+	flat := strings.Join(flattenRows(tbl.Rows), " ")
+	for _, want := range []string{"Apple", "Banana", "Cherry"} {
+		if !strings.Contains(flat, want) {
+			t.Errorf("flat output %q missing %q", flat, want)
+		}
+	}
+}
+
+// TestExtractTables_ExplicitStrategy asserts that supplying caller-
+// derived coordinates via ExplicitVerticalLines /
+// ExplicitHorizontalLines + StrategyExplicit produces the expected
+// grid even when the underlying PDF has no rules drawn at all.
+func TestExtractTables_ExplicitStrategy(t *testing.T) {
+	doc, err := OpenBytes(testdata.TableBorderless())
+	if err != nil {
+		t.Fatalf("OpenBytes: %v", err)
+	}
+	defer doc.Close()
+	p, err := doc.Page(1)
+	if err != nil {
+		t.Fatalf("Page(1): %v", err)
+	}
+
+	// The borderless fixture places its 3 columns near X = 100, 200,
+	// 300 and 4 rows of text in the Y range [680, 730]. We feed
+	// boundaries that bracket those positions.
+	settings := DefaultTableSettings()
+	settings.VerticalStrategy = StrategyExplicit
+	settings.HorizontalStrategy = StrategyExplicit
+	settings.ExplicitVerticalLines = []float64{95, 195, 295, 395}
+	settings.ExplicitHorizontalLines = []float64{670, 690, 710, 740}
+
+	tables, err := p.ExtractTables(settings)
+	if err != nil {
+		t.Fatalf("ExtractTables: %v", err)
+	}
+	if len(tables) == 0 {
+		t.Fatalf("got 0 tables, want >= 1")
+	}
+	tbl := tables[0]
+	if len(tbl.Rows) != 3 {
+		t.Errorf("rows: got %d, want 3 (4 H-edges → 3 rows)", len(tbl.Rows))
+	}
+	if len(tbl.Rows) > 0 && len(tbl.Rows[0]) != 3 {
+		t.Errorf("cols: got %d, want 3 (4 V-edges → 3 cols)", len(tbl.Rows[0]))
+	}
+}
+
+// TestExtractTables_MixedStrategy asserts that VerticalStrategy=text +
+// HorizontalStrategy=explicit (and the reverse) work — each axis runs
+// its own edge derivation and the resulting edges are merged together.
+func TestExtractTables_MixedStrategy(t *testing.T) {
+	doc, err := OpenBytes(testdata.TableBorderless())
+	if err != nil {
+		t.Fatalf("OpenBytes: %v", err)
+	}
+	defer doc.Close()
+	p, err := doc.Page(1)
+	if err != nil {
+		t.Fatalf("Page(1): %v", err)
+	}
+
+	settings := DefaultTableSettings()
+	settings.VerticalStrategy = StrategyText
+	settings.HorizontalStrategy = StrategyExplicit
+	settings.ExplicitHorizontalLines = []float64{670, 690, 710, 740}
+
+	tables, err := p.ExtractTables(settings)
+	if err != nil {
+		t.Fatalf("ExtractTables (text-v + explicit-h): %v", err)
+	}
+	if len(tables) == 0 {
+		t.Fatal("got 0 tables, want >= 1")
+	}
+}
+
+// flattenRows joins a 2-D string grid into a flat slice for
+// substring spot-checks.
+func flattenRows(rows [][]string) []string {
+	var out []string
+	for _, r := range rows {
+		out = append(out, r...)
+	}
+	return out
+}
+
+// nanForTest returns a NaN without forcing the test file to import
+// math at the top.
+func nanForTest() float64 {
+	zero := 0.0
+	return zero / zero
+}
diff --git a/testdata/fixtures.go b/testdata/fixtures.go
index 0f7caba..7bf7faa 100644
--- a/testdata/fixtures.go
+++ b/testdata/fixtures.go
@@ -134,6 +134,66 @@ ET
 	return BuildSinglePage(grid + text)
 }
 
+// TableBorderless returns a minimal PDF with a 3-column borderless
+// table: the columns are conveyed by whitespace alignment alone, with
+// no ruling lines drawn. The header row is at Y ~ 730 and three body
+// rows are at Y ~ 710, 695, 680. Columns are at X ~ 100, 200, 300.
+//
+// This fixture targets the "text" strategy — it's the smallest
+// possible reproducer of the borderless-table case that's common in
+// 10-K filings, bank statements, scanned-then-OCR'd PDFs, and any
+// other PDF whose tables aren't ruled.
+//
+// Content:
+//
+//	Item    Quantity  Price
+//	Apple   3         1.50
+//	Banana  6         0.75
+//	Cherry  12        0.10
+//
+// The X positions are chosen so each header word and each body word
+// in a column starts at the same X within the wordEdgeTolerance
+// (=1 pt) — pdfplumber's words_to_edges_v clusters on exactly that
+// tolerance.
+func TableBorderless() []byte {
+	// 10pt Helvetica baselines. We move the text cursor with Td so
+	// the relative offsets keep the per-row, per-column positions
+	// pinned to the same X coordinates within each row.
+	const text = `BT
+/F1 10 Tf
+% Header row: baseline ~ 720.
+100 720 Td
+(Item) Tj
+100 0 Td
+(Quantity) Tj
+100 0 Td
+(Price) Tj
+% Body row 1: y -= 20, x back to 0.
+-200 -20 Td
+(Apple) Tj
+100 0 Td
+(3) Tj
+100 0 Td
+(1.50) Tj
+% Body row 2.
+-200 -15 Td
+(Banana) Tj
+100 0 Td
+(6) Tj
+100 0 Td
+(0.75) Tj
+% Body row 3.
+-200 -15 Td
+(Cherry) Tj
+100 0 Td
+(12) Tj
+100 0 Td
+(0.10) Tj
+ET
+`
+	return BuildSinglePage(text)
+}
+
 // Rules returns a minimal PDF whose content stream draws four lines
 // (two horizontal, two vertical) and one rectangle. We use simple
 // coordinates: a 100x100 box with one stroked diagonal line and one
diff --git a/testdata/golden/table-3x4-borderless.pdf b/testdata/golden/table-3x4-borderless.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..59201dca8c9665efa8b3c5ae23b46fb97b4ddc39
GIT binary patch
literal 950
zcmZWo&2HN;48H3rcrma6OKjP-jTT*jI!n-D8=5r+b_jY<Y$jEgN`b6w$qswP9r^~_
zi;R-<(>gX_NFT}XBWW@ie;D<6|BMBH|NVJp93s%OM|OD$VYyu)gacv4QZFDJiv>0i
zlfxZcT`^R(b9DFD0j*#7N!ATa6X$+UEPAah@E3v3avJDl43cZB6Mn&3ib8K-8j*j-
zW5~~A;=98}uT4b>xlBizBK(2FnNuVkVluj#poM8jHBvdavG?9EAa~MeAl>=yi*d29
zbMnmLWIeOK8LlN9!a_9IT5fRp1FftCghN&7S}N+)H>s{wBM-o_TTXB<@EO&@K0uQC
zt~A!5cw$A#!Vw1^K{;mu+@Pq@fYCoMU`C=#sStic9!Gt~qsSegtl6877N6ci`RKtG
z`nnd%O1tgAV<Ri{SOA5tw~ib51#F@B9^&X7Y)Gj34pswc^kNdBj~BUKt(I-5lWwu3
zL(ltZ)MbzNJUsP0g;4Z%0<+FN>gQ>fDCrOl9?+O>fjmB8;%!R9(*5UMqj%%GF*s*5
zSS&g`kfmvo!W<45<?XGYj#x&s;+_{6k7;{4Se%`oU`du8F>8ceqVWqik-rc!8g8xK
VuIv`AvBFrNXP)IO2!^*K_8)p$_W1w+

literal 0
HcmV?d00001

diff --git a/testdata/golden/table-3x4-borderless.tables-text.expected.json b/testdata/golden/table-3x4-borderless.tables-text.expected.json
new file mode 100644
index 0000000..df44e7f
--- /dev/null
+++ b/testdata/golden/table-3x4-borderless.tables-text.expected.json
@@ -0,0 +1,49 @@
+{
+  "name": "table-3x4-borderless",
+  "pages": [
+    {
+      "number": 1,
+      "width": 612,
+      "height": 792,
+      "tables": [
+        [
+          [
+            "Item",
+            "Quantity",
+            "Price"
+          ],
+          [
+            "",
+            "",
+            ""
+          ],
+          [
+            "Apple",
+            "3",
+            "1.50"
+          ],
+          [
+            "",
+            "",
+            ""
+          ],
+          [
+            "Banana",
+            "6",
+            "0.75"
+          ],
+          [
+            "",
+            "",
+            ""
+          ],
+          [
+            "Cherry",
+            "12",
+            "0.10"
+          ]
+        ]
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/testdata/golden/table-3x4-borderless.tables-text.target b/testdata/golden/table-3x4-borderless.tables-text.target
new file mode 100644
index 0000000..e69de29

From 0edb9b80c483aed53b3e0a6d03300091e497f3e9 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 02:07:57 +0100
Subject: [PATCH 2/3] feat: pdftable CLI (extract subcommand)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds cmd/pdftable, a stdlib-only command-line interface mirroring
pdfplumber's CLI surface for the operations the library implements:

- extract <file.pdf> [flags]: tables (--tables) or text (--text) on
  one page, a range (--pages 1,3-5), or all pages.
- Output format selectable via --format json|text. JSON shape includes
  page dimensions, table bbox, per-cell bbox, and rows.
- Full TableSettings surface exposed as flags:
  --vertical-strategy / --horizontal-strategy, --snap-tolerance,
  --join-tolerance, --edge-min-length, --intersection-tolerance,
  --text-tolerance, --min-words-vertical/horizontal,
  --explicit-vertical-lines/horizontal-lines, --indent.
- Positional argument can appear before OR after flags
  (pdfplumber-style invocation); reorderFlagsLast() shuffles tokens
  so the standard library flag package can parse either ordering.

Tested via cmd/pdftable/main_test.go: end-to-end runs against the
issue-466-example and table-3x4-borderless fixtures, plus unit tests
on parsePages, reorderFlagsLast, and the error paths.

No new go.mod dependencies — uses standard library flag, encoding/json,
strings, strconv only.
---
 cmd/pdftable/main.go      | 493 ++++++++++++++++++++++++++++++++++++++
 cmd/pdftable/main_test.go | 244 +++++++++++++++++++
 2 files changed, 737 insertions(+)
 create mode 100644 cmd/pdftable/main.go
 create mode 100644 cmd/pdftable/main_test.go

diff --git a/cmd/pdftable/main.go b/cmd/pdftable/main.go
new file mode 100644
index 0000000..5fbf3ce
--- /dev/null
+++ b/cmd/pdftable/main.go
@@ -0,0 +1,493 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+// Command pdftable is the command-line interface to the pdftable library.
+// It mirrors pdfplumber's CLI surface for the operations pdftable
+// implements: extracting text and extracting tables (with page/cell geometry).
+//
+// Usage:
+//
+//	pdftable extract <file.pdf> [flags]
+//
+// Flags (extract subcommand):
+//
+//	--pages          Comma-separated page list / dash ranges, e.g. "1,3-5". Default: all.
+//	--tables         Emit detected tables (JSON or text per --format).
+//	--text           Emit extracted text. Mutually exclusive with --tables.
+//	--format         "json" (default) | "text". Output format.
+//	--vertical-strategy    "lines" (default) | "lines_strict" | "text" | "explicit".
+//	--horizontal-strategy  Same set; default "lines".
+//	--snap-tolerance, --join-tolerance  Float each; default 3.
+//	--edge-min-length            Float; default 3.
+//	--edge-min-length-prefilter  Float; default 1.
+//	--intersection-tolerance     Float; default 3.
+//	--text-tolerance             Float; default 3.
+//	--min-words-vertical     Int; default 3.
+//	--min-words-horizontal   Int; default 1.
+//	--explicit-vertical-lines    Comma-separated floats; required when vertical-strategy=explicit.
+//	--explicit-horizontal-lines  Comma-separated floats; required when horizontal-strategy=explicit.
+//	--indent                 Int; JSON pretty-printing indent. 0 = compact.
+//
+// The CLI uses the standard library `flag` package and the `pdftable`
+// public API only — no third-party dependencies.
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+	"strings"
+
+	"github.com/hallelx2/pdftable"
+)
+
+func main() {
+	if err := run(os.Args[1:], os.Stdout, os.Stderr); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+}
+
+// run is the testable entry point. It takes the args slice (excluding
+// the executable name) and the stdout/stderr streams so tests can
+// capture output without spawning a process.
+func run(args []string, stdout, stderr io.Writer) error {
+	if len(args) == 0 {
+		printUsage(stderr)
+		return fmt.Errorf("missing subcommand")
+	}
+	switch args[0] {
+	case "extract":
+		return runExtract(args[1:], stdout, stderr)
+	case "-h", "--help", "help":
+		printUsage(stdout)
+		return nil
+	case "version", "-v", "--version":
+		fmt.Fprintln(stdout, "pdftable v0.3.0")
+		return nil
+	default:
+		printUsage(stderr)
+		return fmt.Errorf("unknown subcommand %q", args[0])
+	}
+}
+
+// printUsage prints the top-level usage string.
+func printUsage(w io.Writer) {
+	fmt.Fprintln(w, `pdftable — extract text and tables from PDFs
+
+USAGE:
+  pdftable extract <file.pdf> [flags]
+  pdftable version
+  pdftable help
+
+EXTRACT FLAGS (run 'pdftable extract --help' for full list):
+  --pages 1,3-5            Pages to process (default: all).
+  --tables                 Output detected tables.
+  --text                   Output extracted text (mutually exclusive with --tables).
+  --format json|text       Output format (default: json).
+  --vertical-strategy S    "lines" | "lines_strict" | "text" | "explicit".
+  --horizontal-strategy S  Same set, default "lines".
+
+Documentation: https://github.com/hallelx2/pdftable`)
+}
+
+// extractFlags holds the parsed value of every extract-subcommand flag.
+type extractFlags struct {
+	pages                   string
+	tables                  bool
+	text                    bool
+	format                  string
+	verticalStrategy        string
+	horizontalStrategy      string
+	snapTolerance           float64
+	joinTolerance           float64
+	edgeMinLength           float64
+	edgeMinLengthPrefilter  float64
+	intersectionTolerance   float64
+	textTolerance           float64
+	minWordsVertical        int
+	minWordsHorizontal      int
+	explicitVerticalLines   string
+	explicitHorizontalLines string
+	indent                  int
+}
+
+// runExtract parses extract-subcommand args, opens the PDF, and
+// dispatches to the requested output mode. -h/--help returns nil.
+func runExtract(args []string, stdout, stderr io.Writer) error {
+	fs := flag.NewFlagSet("extract", flag.ContinueOnError)
+	fs.SetOutput(stderr)
+
+	var f extractFlags
+	fs.StringVar(&f.pages, "pages", "", "Pages to extract: comma list (1,3) or dash range (1-5). Default: all.")
+	fs.BoolVar(&f.tables, "tables", false, "Emit detected tables.")
+	fs.BoolVar(&f.text, "text", false, "Emit extracted text (mutually exclusive with --tables).")
+	fs.StringVar(&f.format, "format", "json", "Output format: json | text.")
+	fs.StringVar(&f.verticalStrategy, "vertical-strategy", "lines", `Vertical edge strategy: lines | lines_strict | text | explicit.`)
+	fs.StringVar(&f.horizontalStrategy, "horizontal-strategy", "lines", `Horizontal edge strategy: lines | lines_strict | text | explicit.`)
+	fs.Float64Var(&f.snapTolerance, "snap-tolerance", 3, "Snap tolerance (PDF points).")
+	fs.Float64Var(&f.joinTolerance, "join-tolerance", 3, "Join tolerance (PDF points).")
+	fs.Float64Var(&f.edgeMinLength, "edge-min-length", 3, "Drop merged edges shorter than this (PDF points).")
+	fs.Float64Var(&f.edgeMinLengthPrefilter, "edge-min-length-prefilter", 1, "Drop raw edges shorter than this before merging.")
+	fs.Float64Var(&f.intersectionTolerance, "intersection-tolerance", 3, "Slack used when testing edge crossings (PDF points).")
+	fs.Float64Var(&f.textTolerance, "text-tolerance", 3, "Per-cell text-extraction tolerance.")
+	fs.IntVar(&f.minWordsVertical, "min-words-vertical", 3, "Min word count for text strategy vertical clusters.")
+	fs.IntVar(&f.minWordsHorizontal, "min-words-horizontal", 1, "Min word count for text strategy horizontal clusters.")
+	fs.StringVar(&f.explicitVerticalLines, "explicit-vertical-lines", "", "Comma-separated X coordinates for explicit strategy.")
+	fs.StringVar(&f.explicitHorizontalLines, "explicit-horizontal-lines", "", "Comma-separated Y coordinates for explicit strategy.")
+	fs.IntVar(&f.indent, "indent", 0, "JSON indent level. 0 = compact.")
+
+	// Go's flag package stops at the first non-flag, so move the
+	// positional path behind the flags (`extract file.pdf --tables`).
+	if err := fs.Parse(reorderFlagsLast(args)); err != nil {
+		if err == flag.ErrHelp {
+			return nil // -h/--help: fs.Parse already printed the usage.
+		}
+		return err
+	}
+	positional := fs.Args()
+	if len(positional) < 1 {
+		fs.Usage()
+		return fmt.Errorf("missing input PDF path")
+	}
+	if len(positional) > 1 {
+		return fmt.Errorf("unexpected positional arguments after %q: %v", positional[0], positional[1:])
+	}
+	path := positional[0]
+
+	if f.tables && f.text {
+		return fmt.Errorf("--tables and --text are mutually exclusive")
+	}
+	if !f.tables && !f.text {
+		// Default to --tables when neither is specified. Mirrors the
+		// pdftable library's primary use case.
+		f.tables = true
+	}
+	if f.format != "json" && f.format != "text" {
+		return fmt.Errorf("--format must be json or text, got %q", f.format)
+	}
+
+	doc, err := pdftable.OpenFile(path)
+	if err != nil {
+		return err
+	}
+	defer doc.Close()
+
+	pageNums, err := parsePages(f.pages, doc.NumPages())
+	if err != nil {
+		return fmt.Errorf("--pages: %w", err)
+	}
+
+	settings, err := buildSettings(f)
+	if err != nil {
+		return err
+	}
+
+	if f.tables {
+		return emitTables(doc, pageNums, settings, f, stdout)
+	}
+	return emitText(doc, pageNums, f, stdout)
+}
+
+// parsePages converts the --pages flag value into a de-duplicated
+// slice of 1-based page numbers, in the order given. Empty = all pages.
+//
+// Examples:
+//
+//	""      → [1..N]
+//	"1"     → [1]
+//	"1,3"   → [1, 3]
+//	"2-5"   → [2, 3, 4, 5]
+//	"1,3-5" → [1, 3, 4, 5]
+func parsePages(spec string, total int) ([]int, error) {
+	if strings.TrimSpace(spec) == "" {
+		out := make([]int, total)
+		for i := range out {
+			out[i] = i + 1
+		}
+		return out, nil
+	}
+	seen := make(map[int]struct{})
+	out := make([]int, 0)
+	for _, tok := range strings.Split(spec, ",") {
+		tok = strings.TrimSpace(tok)
+		if tok == "" {
+			continue
+		}
+		if strings.Contains(tok, "-") {
+			parts := strings.SplitN(tok, "-", 2)
+			start, err := strconv.Atoi(strings.TrimSpace(parts[0]))
+			if err != nil {
+				return nil, fmt.Errorf("invalid range %q: %w", tok, err)
+			}
+			end, err := strconv.Atoi(strings.TrimSpace(parts[1]))
+			if err != nil {
+				return nil, fmt.Errorf("invalid range %q: %w", tok, err)
+			}
+			if start < 1 || end > total || start > end {
+				return nil, fmt.Errorf("range %q out of bounds (1..%d)", tok, total)
+			}
+			for i := start; i <= end; i++ {
+				if _, ok := seen[i]; ok {
+					continue
+				}
+				seen[i] = struct{}{}
+				out = append(out, i)
+			}
+		} else {
+			n, err := strconv.Atoi(tok)
+			if err != nil {
+				return nil, fmt.Errorf("invalid page %q: %w", tok, err)
+			}
+			if n < 1 || n > total {
+				return nil, fmt.Errorf("page %d out of bounds (1..%d)", n, total)
+			}
+			if _, ok := seen[n]; !ok {
+				seen[n] = struct{}{}
+				out = append(out, n)
+			}
+		}
+	}
+	return out, nil
+}
+
+// buildSettings translates the parsed flag set into a TableSettings.
+// It also parses the explicit-line coordinate strings.
+func buildSettings(f extractFlags) (pdftable.TableSettings, error) {
+	s := pdftable.DefaultTableSettings()
+	s.VerticalStrategy = pdftable.TableStrategy(f.verticalStrategy)
+	s.HorizontalStrategy = pdftable.TableStrategy(f.horizontalStrategy)
+	s.SnapTolerance = f.snapTolerance
+	s.JoinTolerance = f.joinTolerance
+	s.EdgeMinLength = f.edgeMinLength
+	s.EdgeMinLengthPrefilter = f.edgeMinLengthPrefilter
+	s.IntersectionTolerance = f.intersectionTolerance
+	s.TextTolerance = f.textTolerance
+	s.MinWordsVertical = f.minWordsVertical
+	s.MinWordsHorizontal = f.minWordsHorizontal
+
+	if f.explicitVerticalLines != "" {
+		coords, err := parseFloatList(f.explicitVerticalLines)
+		if err != nil {
+			return s, fmt.Errorf("--explicit-vertical-lines: %w", err)
+		}
+		s.ExplicitVerticalLines = coords
+	}
+	if f.explicitHorizontalLines != "" {
+		coords, err := parseFloatList(f.explicitHorizontalLines)
+		if err != nil {
+			return s, fmt.Errorf("--explicit-horizontal-lines: %w", err)
+		}
+		s.ExplicitHorizontalLines = coords
+	}
+	return s, nil
+}
+
+// reorderFlagsLast moves positional arguments to the end of the slice
+// so the standard library flag package's "stop at first non-flag"
+// behaviour doesn't get in the way of pdfplumber-style invocations
+// like `pdftable extract file.pdf --tables`.
+//
+// Heuristic: a token is a flag if it starts with "-" or "--". Flags
+// of the form "--foo=val" carry their value inline. Flags whose
+// arguments are space-separated (e.g. "--pages 1-3") need to consume
+// the following token; the boolean-flag set says which flags don't.
+func reorderFlagsLast(args []string) []string {
+	flagArgs := make([]string, 0, len(args))
+	positional := make([]string, 0)
+	i := 0
+	for i < len(args) {
+		a := args[i]
+		if strings.HasPrefix(a, "-") {
+			flagArgs = append(flagArgs, a)
+			// Inline value (--key=val): no follow-up token needed.
+			if strings.Contains(a, "=") {
+				i++
+				continue
+			}
+			// Boolean flags don't consume the next token. Strip the
+			// leading dashes to look up.
+			name := strings.TrimLeft(a, "-")
+			if _, isBool := boolFlagSet[name]; isBool {
+				i++
+				continue
+			}
+			// Otherwise the next token is the value, unless it starts
+			// with "-" (so write negative values as --flag=-1).
+			if i+1 < len(args) && !strings.HasPrefix(args[i+1], "-") {
+				flagArgs = append(flagArgs, args[i+1])
+				i += 2
+				continue
+			}
+			i++
+			continue
+		}
+		positional = append(positional, a)
+		i++
+	}
+	return append(flagArgs, positional...)
+}
+
+// boolFlagSet lists every boolean flag the extract subcommand
+// understands. Used by reorderFlagsLast to know which flags don't
+// consume a following value token.
+var boolFlagSet = map[string]struct{}{
+	"tables": {},
+	"text":   {},
+	"h":      {},
+	"help":   {},
+}
+
+// parseFloatList parses comma-separated floats, skipping empty entries.
+func parseFloatList(spec string) ([]float64, error) {
+	parts := strings.Split(spec, ",")
+	out := make([]float64, 0, len(parts))
+	for _, p := range parts {
+		p = strings.TrimSpace(p)
+		if p == "" {
+			continue
+		}
+		v, err := strconv.ParseFloat(p, 64)
+		if err != nil {
+			return nil, fmt.Errorf("invalid float %q: %w", p, err)
+		}
+		out = append(out, v)
+	}
+	return out, nil
+}
+
+// tablesOutput is the JSON shape emitted by `pdftable extract
+// --tables`. We deliberately mirror pdfplumber's `to_json` schema
+// where it overlaps: one entry per page, each carrying the page
+// dimensions and a list of tables. Each table carries the row grid,
+// the bbox, and the per-cell bboxes.
+type tablesOutput struct {
+	Pages []pageTablesOutput `json:"pages"`
+}
+
+type pageTablesOutput struct {
+	Number int           `json:"number"`
+	Width  float64       `json:"width"`
+	Height float64       `json:"height"`
+	Tables []tableOutput `json:"tables"`
+}
+
+type tableOutput struct {
+	BBox  [4]float64     `json:"bbox"`
+	Rows  [][]string     `json:"rows"`
+	Cells [][][4]float64 `json:"cells"`
+}
+
+// emitTables runs ExtractTables on each requested page and writes the
+// aggregated result in the requested format.
+func emitTables(doc pdftable.Document, pages []int, settings pdftable.TableSettings, f extractFlags, w io.Writer) error {
+	out := tablesOutput{Pages: make([]pageTablesOutput, 0, len(pages))}
+	for _, n := range pages {
+		page, err := doc.Page(n)
+		if err != nil {
+			return err
+		}
+		tables, err := page.ExtractTables(settings)
+		if err != nil {
+			return fmt.Errorf("page %d: %w", n, err)
+		}
+		pageOut := pageTablesOutput{
+			Number: n,
+			Width:  page.Width(),
+			Height: page.Height(),
+			Tables: make([]tableOutput, 0, len(tables)),
+		}
+		for _, t := range tables {
+			to := tableOutput{
+				BBox: [4]float64{t.BBox.X0, t.BBox.Y0, t.BBox.X1, t.BBox.Y1},
+				Rows: t.Rows,
+			}
+			to.Cells = make([][][4]float64, len(t.CellsBBox))
+			for ri, row := range t.CellsBBox {
+				to.Cells[ri] = make([][4]float64, len(row))
+				for ci, c := range row {
+					to.Cells[ri][ci] = [4]float64{c.X0, c.Y0, c.X1, c.Y1}
+				}
+			}
+			pageOut.Tables = append(pageOut.Tables, to)
+		}
+		out.Pages = append(out.Pages, pageOut)
+	}
+
+	if f.format == "text" {
+		// One tab-separated line per row, "---" between tables on a
+		// page, and a page-number header before each page.
+		for _, p := range out.Pages {
+			fmt.Fprintf(w, "=== Page %d (%g x %g) ===\n", p.Number, p.Width, p.Height)
+			for ti, t := range p.Tables {
+				if ti > 0 {
+					fmt.Fprintln(w, "---")
+				}
+				for _, row := range t.Rows {
+					fmt.Fprintln(w, strings.Join(row, "\t"))
+				}
+			}
+		}
+		return nil
+	}
+
+	enc := json.NewEncoder(w)
+	if f.indent > 0 {
+		enc.SetIndent("", strings.Repeat(" ", f.indent))
+	}
+	return enc.Encode(out)
+}
+
+// textOutput is the JSON shape for `pdftable extract --text`. One
+// entry per page; text is the dense extract-text output.
+type textOutput struct {
+	Pages []pageTextOutput `json:"pages"`
+}
+
+type pageTextOutput struct {
+	Number int     `json:"number"`
+	Width  float64 `json:"width"`
+	Height float64 `json:"height"`
+	Text   string  `json:"text"`
+}
+
+// emitText runs ExtractText on each requested page and writes the
+// aggregated result in the requested format. --format text emits the
+// text verbatim with a form-feed (\f) between pages, mirroring
+// `pdftotext` and pdfplumber's --format text behaviour.
+func emitText(doc pdftable.Document, pages []int, f extractFlags, w io.Writer) error {
+	out := textOutput{Pages: make([]pageTextOutput, 0, len(pages))}
+	for _, n := range pages {
+		page, err := doc.Page(n)
+		if err != nil {
+			return err
+		}
+		text, err := page.ExtractText(pdftable.DefaultTextOpts())
+		if err != nil {
+			return fmt.Errorf("page %d: %w", n, err)
+		}
+		out.Pages = append(out.Pages, pageTextOutput{
+			Number: n,
+			Width:  page.Width(),
+			Height: page.Height(),
+			Text:   text,
+		})
+	}
+	if f.format == "text" {
+		for i, p := range out.Pages {
+			if i > 0 {
+				fmt.Fprint(w, "\f")
+			}
+			fmt.Fprintln(w, p.Text)
+		}
+		return nil
+	}
+	enc := json.NewEncoder(w)
+	if f.indent > 0 {
+		enc.SetIndent("", strings.Repeat(" ", f.indent))
+	}
+	return enc.Encode(out)
+}
diff --git a/cmd/pdftable/main_test.go b/cmd/pdftable/main_test.go
new file mode 100644
index 0000000..32d50ce
--- /dev/null
+++ b/cmd/pdftable/main_test.go
@@ -0,0 +1,244 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// fixturePath assembles the path to a fixture relative to this test
+// file. We use a relative path because `go test ./...` runs tests
+// with the test file's directory as the cwd.
+func fixturePath(name string) string {
+	return filepath.Join("..", "..", "testdata", "golden", name)
+}
+
+// TestRun_ExtractTables_JSON exercises the happy path: extract tables
+// from the issue-466-example fixture in JSON format and assert the
+// shape matches the documented schema.
+func TestRun_ExtractTables_JSON(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{"extract", fixturePath("issue-466-example.pdf"), "--tables", "--format", "json"}
+	if err := run(args, &stdout, &stderr); err != nil {
+		t.Fatalf("run: %v (stderr=%s)", err, stderr.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(stdout.Bytes(), &out); err != nil {
+		t.Fatalf("unmarshal: %v\n%s", err, stdout.String())
+	}
+	pages, ok := out["pages"].([]any)
+	if !ok {
+		t.Fatalf("output missing pages key: %v", out)
+	}
+	if len(pages) != 1 {
+		t.Fatalf("got %d pages, want 1", len(pages))
+	}
+	page := pages[0].(map[string]any)
+	tables, ok := page["tables"].([]any)
+	if !ok || len(tables) < 1 {
+		t.Fatalf("page has no tables: %v", page)
+	}
+	first := tables[0].(map[string]any)
+	if first["bbox"] == nil {
+		t.Errorf("table 0 missing bbox")
+	}
+	rows, ok := first["rows"].([]any)
+	if !ok || len(rows) == 0 {
+		t.Errorf("table 0 missing rows")
+	}
+}
+
+// TestRun_ExtractTables_TextStrategy asserts the CLI propagates the
+// --vertical-strategy / --horizontal-strategy flags through to the
+// library and recovers the borderless fixture's table.
+func TestRun_ExtractTables_TextStrategy(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{
+		"extract",
+		fixturePath("table-3x4-borderless.pdf"),
+		"--tables",
+		"--vertical-strategy", "text",
+		"--horizontal-strategy", "text",
+		"--format", "text",
+	}
+	if err := run(args, &stdout, &stderr); err != nil {
+		t.Fatalf("run: %v (stderr=%s)", err, stderr.String())
+	}
+	out := stdout.String()
+	for _, want := range []string{"Item", "Quantity", "Price", "Apple", "Banana", "Cherry"} {
+		if !strings.Contains(out, want) {
+			t.Errorf("output missing %q\n%s", want, out)
+		}
+	}
+}
+
+// TestRun_ExtractTables_Pages asserts the --pages flag narrows the
+// output to the requested pages only.
+func TestRun_ExtractTables_Pages(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{
+		"extract",
+		fixturePath("issue-466-example.pdf"),
+		"--tables",
+		"--format", "json",
+		"--pages", "1",
+	}
+	if err := run(args, &stdout, &stderr); err != nil {
+		t.Fatalf("run: %v (stderr=%s)", err, stderr.String())
+	}
+	var out map[string]any
+	if err := json.Unmarshal(stdout.Bytes(), &out); err != nil {
+		t.Fatalf("unmarshal: %v", err)
+	}
+	if pages := out["pages"].([]any); len(pages) != 1 {
+		t.Errorf("got %d pages, want 1", len(pages))
+	}
+}
+
+// TestRun_ExtractText runs --text --format json on the hello.pdf
+// fixture and asserts the output contains "Hello".
+func TestRun_ExtractText(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{"extract", fixturePath("hello.pdf"), "--text", "--format", "json"}
+	if err := run(args, &stdout, &stderr); err != nil {
+		t.Fatalf("run: %v (stderr=%s)", err, stderr.String())
+	}
+	if !strings.Contains(stdout.String(), "Hello") {
+		t.Errorf("output missing 'Hello':\n%s", stdout.String())
+	}
+}
+
+// TestParsePages exercises the page-spec parser on a variety of valid
+// and invalid inputs.
+func TestParsePages(t *testing.T) {
+	cases := []struct {
+		name  string
+		spec  string
+		total int
+		want  []int
+		err   bool
+	}{
+		{"empty_all", "", 3, []int{1, 2, 3}, false},
+		{"single", "2", 3, []int{2}, false},
+		{"comma_list", "1,3", 3, []int{1, 3}, false},
+		{"dash_range", "2-4", 5, []int{2, 3, 4}, false},
+		{"mixed", "1,3-5", 5, []int{1, 3, 4, 5}, false},
+		{"deduped", "1,1,2", 3, []int{1, 2}, false},
+		{"bad_range", "5-2", 5, nil, true},
+		{"out_of_bounds", "10", 3, nil, true},
+		{"negative", "-1", 3, nil, true},
+		{"invalid", "abc", 3, nil, true},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			got, err := parsePages(c.spec, c.total)
+			if c.err {
+				if err == nil {
+					t.Errorf("want error, got %v", got)
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if len(got) != len(c.want) {
+				t.Fatalf("got %v, want %v", got, c.want)
+			}
+			for i := range got {
+				if got[i] != c.want[i] {
+					t.Errorf("[%d] got %d, want %d", i, got[i], c.want[i])
+				}
+			}
+		})
+	}
+}
+
+// TestReorderFlagsLast asserts that positional arguments get pushed
+// to the end so the standard-library flag package can parse them
+// regardless of their original position.
+func TestReorderFlagsLast(t *testing.T) {
+	cases := []struct {
+		in, want []string
+	}{
+		{
+			in:   []string{"file.pdf", "--tables", "--format", "json"},
+			want: []string{"--tables", "--format", "json", "file.pdf"},
+		},
+		{
+			in:   []string{"--pages", "1-3", "file.pdf"},
+			want: []string{"--pages", "1-3", "file.pdf"},
+		},
+		{
+			in:   []string{"--tables", "file.pdf"},
+			want: []string{"--tables", "file.pdf"},
+		},
+		{
+			in:   []string{"--format=json", "file.pdf"},
+			want: []string{"--format=json", "file.pdf"},
+		},
+	}
+	for i, c := range cases {
+		got := reorderFlagsLast(c.in)
+		if len(got) != len(c.want) {
+			t.Errorf("case %d: lengths differ; got %v, want %v", i, got, c.want)
+			continue
+		}
+		for j := range got {
+			if got[j] != c.want[j] {
+				t.Errorf("case %d [%d]: got %q, want %q", i, j, got[j], c.want[j])
+			}
+		}
+	}
+}
+
+// TestRun_MissingFile asserts the CLI surfaces a clean error when the
+// input path doesn't exist (rather than crashing).
+func TestRun_MissingFile(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{"extract", "no-such-file.pdf", "--tables"}
+	err := run(args, &stdout, &stderr)
+	if err == nil {
+		t.Fatal("got nil error, want failure")
+	}
+}
+
+// TestRun_MutuallyExclusiveFlags asserts --tables and --text can't
+// both be set.
+func TestRun_MutuallyExclusiveFlags(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	args := []string{"extract", fixturePath("hello.pdf"), "--tables", "--text"}
+	err := run(args, &stdout, &stderr)
+	if err == nil {
+		t.Fatal("got nil error, want mutually-exclusive failure")
+	}
+	if !strings.Contains(err.Error(), "mutually exclusive") {
+		t.Errorf("error %q should mention mutual exclusion", err)
+	}
+}
+
+// TestRun_UnknownSubcommand asserts the CLI rejects an unknown
+// subcommand with a clear error.
+func TestRun_UnknownSubcommand(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	err := run([]string{"foo"}, &stdout, &stderr)
+	if err == nil {
+		t.Fatal("got nil, want unknown-subcommand error")
+	}
+}
+
+// TestRun_Version asserts the `version` subcommand emits something
+// that mentions v0.3.0.
+func TestRun_Version(t *testing.T) {
+	var stdout, stderr bytes.Buffer
+	if err := run([]string{"version"}, &stdout, &stderr); err != nil {
+		t.Fatalf("run version: %v", err)
+	}
+	if !strings.Contains(stdout.String(), "v0.3.0") {
+		t.Errorf("output missing version: %q", stdout.String())
+	}
+}

From 0ab1e7eb75b066a6018e47c85f951f81d3bc4628 Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Wed, 27 May 2026 02:08:10 +0100
Subject: [PATCH 3/3] docs: v0.3.0 changelog + README usage (text/explicit +
 CLI)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- CHANGELOG.md: v0.3.0 entry covering text + explicit strategies,
  mixed-strategy support, the pdftable CLI, the layout.SourceText
  enum, and the borderless parity fixture. Known limitations note
  the carried-over font-metric drift on cell text.
- README.md: status bumped to v0.3.0; "Tables" section reworked with
  side-by-side pdfplumber → pdftable snippets for all four
  strategies plus a mixed-strategy example; new "CLI" section
  documenting the extract subcommand and full flag table; roadmap
  reflects v0.4.x as the AFM-bundle phase.
---
 CHANGELOG.md |  90 +++++++++++++++++++++
 README.md    | 217 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 274 insertions(+), 33 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 096b104..82c7e05 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,95 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2026-05-27
+
+Phase 1.3.D + 1.3.E — text and explicit table-finding strategies, the
+`pdftable` CLI. Completes pdfplumber parity for the four canonical
+table strategies. The v0.2.x public API surface is unchanged; v0.3.0
+only widens what's valid in `TableSettings` and adds the new CLI
+binary, so existing callers compile and run as-is.
+
+### Added
+
+- `StrategyText`: infer table edges from word alignment. Vertical
+  edges come from clusters of words sharing X0 (left), X1 (right), or
+  centre position with the per-axis tolerance hardcoded to 1 PDF
+  point (matching pdfplumber's `words_to_edges_v`). Horizontal edges
+  come from clusters sharing visual top, with both the top and
+  bottom of each cluster emitted so the last row gets captured
+  (matching `words_to_edges_h`). Threshold via
+  `TableSettings.MinWordsVertical` (default 3) and
+  `MinWordsHorizontal` (default 1).
+- `StrategyExplicit`: caller-supplied edges via
+  `TableSettings.ExplicitVerticalLines` /
+  `ExplicitHorizontalLines`. When the strategy is `explicit` on an
+  axis, the supplied coordinates are the ONLY source of edges on
+  that axis; at least two coordinates are required (matching
+  pdfplumber's validation). Non-finite values (NaN, Inf) are skipped
+  with a `log` warning rather than crashing.
+- Mixed strategies: every combination of the four strategies across
+  the two axes works (16 combinations total). The two axes' base
+  edges are derived independently then merged together for the
+  intersection pipeline — no orientation-specific logic leaks
+  between them.
+- `pdftable` CLI binary at `cmd/pdftable/`. Subcommand surface
+  mirrors pdfplumber's: `extract <file.pdf> [flags]` with
+  `--pages 1,3-5`, `--tables`, `--text`, `--format json|text`,
+  `--vertical-strategy`, `--horizontal-strategy`, the full set of
+  tolerance flags, `--min-words-vertical` / `--min-words-horizontal`,
+  `--explicit-vertical-lines` / `--explicit-horizontal-lines`, and `--indent`.
+  Stdlib `flag` package only — no third-party CLI dependencies.
+  Positional argument can appear before OR after flags
+  (pdfplumber-style invocation). Tested via
+  `cmd/pdftable/main_test.go` against the existing golden fixtures.
+- New `layout.SourceText` enum value tagging edges produced by the
+  text strategy. `layout.SourceExplicit` was already in place from
+  v0.2.0; the explicit-strategy implementation now writes through
+  to it as the primary source.
+- Hand-crafted borderless fixture `testdata.TableBorderless()`
+  (3-column × 4-row narrative table conveyed by whitespace alignment
+  only, no rules drawn). Used by the new text-strategy unit tests
+  and pdfplumber parity test. The generated PDF is in
+  `testdata/golden/table-3x4-borderless.pdf`.
+- Golden-file parity test `TestGoldenTablesTextStrategyAgainstPdfplumber`
+  driven by `*.tables-text.expected.json` files. The
+  `table-3x4-borderless` fixture matches pdfplumber's
+  `find_tables({text, text})` cell-for-cell. Regenerate via the new
+  `scripts/capture_pdfplumber_text_golden.py` helper.
+- `scripts/capture_pdfplumber_text_golden.py`: tiny Python helper
+  that captures pdfplumber's text-strategy output for every fixture
+  with a sibling `.tables-text.target` marker. Mirrors the existing
+  `scripts/gen_golden.py` workflow for the line-strategy goldens.
+
+### Changed
+
+- `Page.FindTables` / `Page.ExtractTables` no longer return
+  `ErrUnsupported` for `text` or `explicit` strategies — all four
+  strategies are now implemented. The error is still returned for
+  unknown strategy strings (typo guard).
+- `TableSettings` field docs updated to reflect the implemented
+  semantics of `MinWordsVertical` / `MinWordsHorizontal` and the
+  Explicit*Lines slices.
+- README's "Tables" section restructured: side-by-side
+  pdfplumber→pdftable examples for all four strategies, plus a
+  mixed-strategy snippet and a new "CLI" section.
+
+### Known limitations
+
+- Cell text fidelity on the text strategy depends on the same font
+  metrics as v0.2.x: PDFs that use standard-14 fonts without
+  bundled AFM tables can report intra-word gaps as zero, producing
+  cells like "Nohorizontal" where pdfplumber gets "No horizontal".
+  Structural parity (table count, row count, column count) matches
+  exactly; cell text matches verbatim on PDFs whose fonts have
+  bundled metrics or `/Widths` arrays. AFM-table bundling is a
+  v0.4.x goal.
+- Mixed-strategy snap/join uses a single global tolerance. If a
+  page mixes drawn rules at one X coordinate and word-cluster
+  edges at a slightly different X, the two won't merge unless
+  `SnapTolerance` is widened. This matches pdfplumber's behaviour
+  but is worth noting for callers tuning a mixed pipeline.
+
 ## [0.2.0] - 2026-05-27
 
 Phase 1.3.C — table-finding via ruled lines. Direct port of
@@ -219,6 +308,7 @@ Initial release. Phase 1.3.A — content-stream primitives layer.
 - Type 3 fonts (their glyph procedures are themselves content streams).
 - Vertical writing mode.
 
+[0.3.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.3.0
 [0.2.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.2.0
 [0.1.1]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.1
 [0.1.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.0
diff --git a/README.md b/README.md
index 12b418d..9e4bef3 100644
--- a/README.md
+++ b/README.md
@@ -19,10 +19,11 @@ heuristics on. This is that.
 
 ## Status
 
-`v0.2.0` — line-strategy table finding. `Page.FindTables` and
-`Page.ExtractTables` ship with this release covering the `lines` and
-`lines_strict` strategies (PDFs with ruled tables). `text` and
-`explicit` strategies return `ErrUnsupported` and land in v0.3.0.
+`v0.3.0` — full pdfplumber parity for table-finding strategies. All four
+canonical strategies are implemented: `lines`, `lines_strict`, `text`,
+and `explicit`. Strategies can be mixed per axis (e.g. `vertical="text"`
+with `horizontal="lines"`). This release also ships the `pdftable`
+CLI for extracting text and tables without writing Go.
 
 [![Go Reference](https://pkg.go.dev/badge/github.com/hallelx2/pdftable.svg)](https://pkg.go.dev/github.com/hallelx2/pdftable)
 [![CI](https://github.com/hallelx2/pdftable/actions/workflows/test.yml/badge.svg)](https://github.com/hallelx2/pdftable/actions/workflows/test.yml)
@@ -31,7 +32,7 @@ heuristics on. This is that.
 ## Install
 
 ```sh
-go get github.com/hallelx2/pdftable@v0.2.0
+go get github.com/hallelx2/pdftable@v0.3.0
 ```
 
 Requires Go 1.25+ (uses the standard-library `iter` package for the `Pages()` range-over-func iterator, and pdfcpu v0.12+).
@@ -113,7 +114,7 @@ type Page interface {
     ExtractText(opts TextOpts) (string, error)
     ExtractTextSimple(xTolerance, yTolerance float64) (string, error)
 
-    // New in v0.2.0: line-strategy table finding.
+    // Table finding: lines + lines_strict (v0.2.0); text + explicit (v0.3.0).
     FindTables(settings TableSettings) ([]TableFinder, error)
     ExtractTables(settings TableSettings) ([]*Table, error)
 }
@@ -211,12 +212,12 @@ laid, _ := page.ExtractText(opts)
 fmt.Println(laid)
 ```
 
-## Tables (lines strategy)
+## Tables
 
 `Page.ExtractTables` is the table-detection entry point. It runs the
 edges → intersections → cells → tables pipeline (a direct port of
 pdfplumber's `TableFinder`) and returns one `*Table` per detected
-ruled table, with cell text already extracted.
+table, with cell text already extracted.
 
 ```go
 doc, _ := pdftable.OpenFile("invoice.pdf")
@@ -238,9 +239,11 @@ for ti, t := range tables {
 
 `TableSettings` defaults match pdfplumber's
 (`snap_tolerance=3`, `join_tolerance=3`, `edge_min_length=3`,
-`intersection_tolerance=3`, `text_tolerance=3`). Override any field
-on the value returned from `DefaultTableSettings()` to tighten or
-loosen the heuristics. The two implemented strategies are:
+`intersection_tolerance=3`, `text_tolerance=3`, `min_words_vertical=3`,
+`min_words_horizontal=1`). Override any field on the value returned
+from `DefaultTableSettings()` to tighten or loosen the heuristics.
+
+The four implemented strategies (one per axis, chosen independently):
 
 - `StrategyLines` — edges come from drawn `Line` segments, `Rect`
   outlines (all four sides), and axis-aligned `Curve` segments.
@@ -248,12 +251,16 @@ loosen the heuristics. The two implemented strategies are:
 - `StrategyLinesStrict` — only drawn `Line` segments are used. Use
   this when your PDF draws cell BACKGROUNDS as filled rectangles
   that you do NOT want treated as row boundaries.
+- `StrategyText` — edges inferred from word alignment. Vertical
+  edges come from clusters of words sharing X0 / X1 / centre;
+  horizontal edges from clusters sharing top-Y. Tunable via
+  `MinWordsVertical` (default 3) and `MinWordsHorizontal` (default 1).
+- `StrategyExplicit` — caller-supplied edges via
+  `ExplicitVerticalLines` / `ExplicitHorizontalLines`. Use this when
+  table boundaries are already known from layout analysis or manual
+  annotation; each axis needs at least two coordinates.
 
-`StrategyText` (word-alignment-based) and `StrategyExplicit`
-(caller-supplied edges) return `ErrUnsupported` in v0.2.0 — they
-land in v0.3.0.
-
-### Side-by-side: pdfplumber → pdftable
+### Side-by-side: pdfplumber → pdftable (lines strategy)
 
 ```python
 # Python (pdfplumber)
@@ -287,14 +294,152 @@ for _, t := range tables {
 }
 ```
 
-The two outputs match cell-for-cell on ruled fixtures (see
-`testdata/golden/issue-466-example.*` for the parity test). Field
-naming differs in the obvious places: pdftable returns a slice of
-`*Table` instead of `Table` objects you have to call `.extract()`
-on; rows are `[]string` instead of `list[Optional[str]]` (missing
-cells produce `""` rather than `nil`); and table bboxes use
-`(X0, Y0, X1, Y1)` PDF user space rather than pdfplumber's
-image-space `(x0, top, x1, bottom)`.
+### Side-by-side: pdfplumber → pdftable (text strategy)
+
+```python
+# Python (pdfplumber) — borderless tables
+import pdfplumber
+
+with pdfplumber.open("10k-filing.pdf") as pdf:
+    page = pdf.pages[3]
+    for table in page.find_tables({"vertical_strategy": "text",
+                                    "horizontal_strategy": "text",
+                                    "min_words_vertical": 3}):
+        for row in table.extract():
+            print(row)
+```
+
+```go
+// Go (pdftable)
+doc, _ := pdftable.OpenFile("10k-filing.pdf")
+defer doc.Close()
+page, _ := doc.Page(4)
+
+settings := pdftable.DefaultTableSettings()
+settings.VerticalStrategy = pdftable.StrategyText
+settings.HorizontalStrategy = pdftable.StrategyText
+settings.MinWordsVertical = 3
+
+tables, _ := page.ExtractTables(settings)
+for _, t := range tables {
+    for _, row := range t.Rows {
+        fmt.Println(row)
+    }
+}
+```
+
+### Side-by-side: pdfplumber → pdftable (explicit strategy)
+
+```python
+# Python (pdfplumber) — caller-supplied edges
+import pdfplumber
+
+with pdfplumber.open("statement.pdf") as pdf:
+    page = pdf.pages[0]
+    table = page.find_tables({
+        "vertical_strategy": "explicit",
+        "horizontal_strategy": "explicit",
+        "explicit_vertical_lines":   [100, 200, 300, 400],
+        "explicit_horizontal_lines": [600, 650, 700, 720],
+    })[0]
+    for row in table.extract():
+        print(row)
+```
+
+```go
+// Go (pdftable)
+doc, _ := pdftable.OpenFile("statement.pdf")
+defer doc.Close()
+page, _ := doc.Page(1)
+
+settings := pdftable.DefaultTableSettings()
+settings.VerticalStrategy = pdftable.StrategyExplicit
+settings.HorizontalStrategy = pdftable.StrategyExplicit
+settings.ExplicitVerticalLines   = []float64{100, 200, 300, 400}
+settings.ExplicitHorizontalLines = []float64{600, 650, 700, 720}
+
+tables, _ := page.ExtractTables(settings)
+for _, row := range tables[0].Rows {
+    fmt.Println(row)
+}
+```
+
+### Mixed strategies
+
+Each axis picks its strategy independently. Combinations like
+`vertical=text` + `horizontal=lines` (common for tables with drawn
+row separators but borderless columns) work out of the box:
+
+```go
+settings := pdftable.DefaultTableSettings()
+settings.VerticalStrategy   = pdftable.StrategyText
+settings.HorizontalStrategy = pdftable.StrategyLines
+tables, _ := page.ExtractTables(settings)
+```
+
+pdfplumber and pdftable outputs match cell-for-cell on the parity
+fixtures (see `testdata/golden/*.tables-text.expected.json` and
+`*.tables.expected.json` for the regression goldens). Field naming
+differs in the obvious places: pdftable returns a slice of `*Table`
+instead of `Table` objects you have to call `.extract()` on; rows are
+`[]string` instead of `list[Optional[str]]` (missing cells produce
+`""` rather than `nil`); and table bboxes use `(X0, Y0, X1, Y1)` PDF
+user space rather than pdfplumber's image-space
+`(x0, top, x1, bottom)`.
+
+## CLI
+
+`pdftable` ships a command-line interface that mirrors pdfplumber's
+CLI surface for the operations the library implements:
+
+```sh
+go install github.com/hallelx2/pdftable/cmd/pdftable@v0.3.0
+```
+
+Usage:
+
+```sh
+# Extract every table on every page as JSON.
+pdftable extract invoice.pdf --tables --format json
+
+# Borderless tables: use the text strategy.
+pdftable extract 10k.pdf --tables \
+    --vertical-strategy text --horizontal-strategy text \
+    --min-words-vertical 4
+
+# Extract text only (no table detection).
+pdftable extract report.pdf --text --format text
+
+# Subset of pages, pretty-printed JSON.
+pdftable extract report.pdf --tables --pages 1,3-5 --indent 2
+
+# Caller-supplied edges.
+pdftable extract statement.pdf --tables \
+    --vertical-strategy explicit --horizontal-strategy explicit \
+    --explicit-vertical-lines 100,200,300,400 \
+    --explicit-horizontal-lines 600,650,700,720
+```
+
+Flags:
+
+| Flag | Default | Description |
+| --- | --- | --- |
+| `--pages` | all | Pages to process, e.g. `1,3-5`. |
+| `--tables` | off | Output detected tables. |
+| `--text` | off | Output extracted text. |
+| `--format` | `json` | `json` \| `text`. |
+| `--vertical-strategy` | `lines` | `lines` \| `lines_strict` \| `text` \| `explicit`. |
+| `--horizontal-strategy` | `lines` | Same values as `--vertical-strategy`. |
+| `--snap-tolerance` | 3 | snap_tolerance (PDF pts). |
+| `--join-tolerance` | 3 | join_tolerance (PDF pts). |
+| `--edge-min-length` | 3 | drop merged edges shorter than this. |
+| `--intersection-tolerance` | 3 | slack on edge crossings. |
+| `--text-tolerance` | 3 | per-cell text-extraction tolerance. |
+| `--min-words-vertical` | 3 | text strategy column threshold. |
+| `--min-words-horizontal` | 1 | text strategy row threshold. |
+| `--explicit-vertical-lines` | (none) | comma list of X coords. |
+| `--explicit-horizontal-lines` | (none) | comma list of Y coords. |
+| `--indent` | 0 | JSON indent (0 = compact). |
 
 ## Side-by-side comparison with pdfplumber
 
@@ -391,9 +536,13 @@ pdftable/
 ├── text.go            // Word + ExtractText + ExtractTextSimple (v0.1.0)
 ├── table.go           // TableStrategy / TableSettings / Table types (v0.2.0)
 ├── finder.go          // Cells-from-edges algorithm (v0.2.0)
+├── finder_text.go     // Text + explicit edge derivation (v0.3.0)
 ├── clustering.go      // 1-D clusterObjects, groupObjectsByAttr, dedupeChars
 ├── geometry.go        // BBox helpers: Union, Intersect, Contains, Snap
 ├── errors.go          // Sentinel errors
+├── cmd/
+│   └── pdftable/      // Command-line interface (v0.3.0)
+│       └── main.go
 └── internal/
     ├── layout/
     │   └── lines.go   // Edge type + snap/join/filter pipeline (v0.2.0)
@@ -429,15 +578,17 @@ stdlib-only.
 - `v0.0.x` — content-stream primitives.
 - `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`,
   `Page.ExtractTextSimple`.
-- `v0.2.x` — table finding via ruling lines (this release):
-  `Page.FindTables` / `Page.ExtractTables` covering the `lines` and
-  `lines_strict` strategies.
-- `v0.3.x` — remaining table strategies: `text` (word-alignment
-  edges) and `explicit` (caller-supplied edges). Bundle the
-  standard-14 AFM metrics so word bboxes (and therefore cell text)
-  match pdfplumber to within 1 PDF point on standard fonts.
-- `v0.4.x` — performance pass: parser benchmarking against pdfminer.six
-  and pdfplumber on a representative document corpus.
+- `v0.2.x` — table finding via ruling lines: `Page.FindTables` /
+  `Page.ExtractTables` covering the `lines` and `lines_strict`
+  strategies.
+- `v0.3.x` — remaining table strategies and CLI (this release):
+  `text` (word-alignment edges), `explicit` (caller-supplied edges),
+  and a `pdftable` CLI mirroring pdfplumber's surface.
+- `v0.4.x` — bundle the standard-14 AFM metrics so word bboxes (and
+  therefore cell text) match pdfplumber to within 1 PDF point on
+  standard fonts.
+- `v0.5.x` — performance pass: parser benchmarking against
+  pdfminer.six and pdfplumber on a representative document corpus.
 
 ## License