From a431cf4222135dea17a4feb94edc290c9b5e491e Mon Sep 17 00:00:00 2001
From: Halleluyah Oludele <halleluyaholudele@gmail.com>
Date: Tue, 26 May 2026 23:51:18 +0100
Subject: [PATCH] v0.1.0: Words + ExtractText (Phase 1.3.B)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port pdfplumber's WordExtractor and extract_text into Go. Three new
methods on the Page interface:

- Page.Words(WordOpts)             → []Word, error
- Page.ExtractText(TextOpts)       → string, error
- Page.ExtractTextSimple(xt, yt)   → string, error

Each Word carries its bbox, font name/size, upright flag, and
direction (ltr/rtl/ttb/btt), with an optional Chars slice when
KeepChars=true.

Supporting infrastructure:

- geometry.go — BBox value type with Union/Intersect/Contains/Snap
  and MergeBBoxes helpers.
- clustering.go — 1-D agglomerative clustering primitives
  (clusterFloat1D, clusterObjects[T], groupObjectsByAttr[T,K],
  dedupeChars). Ports of pdfplumber/utils/clustering.py.
- text.go — Word + WordExtractor algorithm, dense and layout-
  preserving ExtractText paths, ligature expansion table.

The Page interface is additive: v0.0.1 callers that only use
Chars/Lines/Rects/Curves continue to compile and work unchanged.

Tests:

- geometry_test.go, clustering_test.go, text_test.go — table-
  driven unit tests for each primitive and each public entry
  point.
- golden_test.go — parity tests against pdfplumber output on three
  fixture PDFs (hello, rules, simple1). Expected outputs in
  testdata/golden/*.expected.json, regenerable via
  scripts/gen_golden.py.

Parity notes:

- Word text, count, order, and direction match pdfplumber exactly.
- Word bbox positions drift by up to ~10 PDF points on standard-14
  fonts because the AFM metrics aren't yet bundled (planned for
  v0.2.x). The golden test tolerance is 15 points to absorb this.
---
 CHANGELOG.md                          |  46 ++
 README.md                             | 174 ++++-
 clustering.go                         | 286 ++++++++
 clustering_test.go                    | 207 ++++++
 geometry.go                           | 158 +++++
 geometry_test.go                      | 236 +++++++
 golden_test.go                        | 225 ++++++
 page.go                               |  26 +
 pdftable.go                           |  11 +-
 scripts/gen_golden.py                 |  90 +++
 testdata/golden/hello.expected.json   |  31 +
 testdata/golden/hello.pdf             | Bin 0 -> 643 bytes
 testdata/golden/rules.expected.json   |  12 +
 testdata/golden/rules.pdf             | Bin 0 -> 790 bytes
 testdata/golden/simple1.expected.json | 229 ++++++
 testdata/golden/simple1.pdf           | Bin 0 -> 849 bytes
 text.go                               | 956 ++++++++++++++++++++++++++
 text_test.go                          | 495 +++++++++++++
 18 files changed, 3156 insertions(+), 26 deletions(-)
 create mode 100644 clustering.go
 create mode 100644 clustering_test.go
 create mode 100644 geometry.go
 create mode 100644 geometry_test.go
 create mode 100644 golden_test.go
 create mode 100644 scripts/gen_golden.py
 create mode 100644 testdata/golden/hello.expected.json
 create mode 100644 testdata/golden/hello.pdf
 create mode 100644 testdata/golden/rules.expected.json
 create mode 100644 testdata/golden/rules.pdf
 create mode 100644 testdata/golden/simple1.expected.json
 create mode 100644 testdata/golden/simple1.pdf
 create mode 100644 text.go
 create mode 100644 text_test.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 98dce6b..0a50bcc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,51 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.0] - 2026-05-26
+
+Phase 1.3.B — words and text extraction. Direct port of pdfplumber's
+`WordExtractor`, `extract_text`, `extract_text_simple`. The v0.0.1
+public API surface is unchanged; v0.1.0 only adds methods to the
+`Page` interface, so existing callers compile and run as-is.
+
+### Added
+
+- `Page.Words(opts WordOpts) ([]Word, error)` — extract positioned
+  text runs. Each `Word` carries `Text`, `X0/Y0/X1/Y1` bbox,
+  `Upright`, `Direction` (ltr/rtl/ttb/btt), `FontName`, `FontSize`,
+  and an optional `Chars` slice (when `WordOpts.KeepChars=true`).
+- `Page.ExtractText(opts TextOpts) (string, error)` — page text as a
+  single string. Supports both dense (`Layout=false`, the default)
+  and layout-preserving (`Layout=true`) modes. The layout mode emits
+  a fixed-width grid mimicking `pdftotext -layout` / pdfplumber's
+  `extract_text(layout=True)`.
+- `Page.ExtractTextSimple(xTolerance, yTolerance float64) (string, error)` —
+  no-frills extraction baseline (ports pdfplumber's
+  `extract_text_simple`).
+- `WordOpts` / `TextOpts` option structs with `DefaultWordOpts()` /
+  `DefaultTextOpts()` constructors carrying pdfplumber-matching
+  defaults (XTolerance=3, YTolerance=3, Expand=true).
+- `BBox` value type with `Union`, `Intersect`, `Contains`, `Snap`,
+  `MergeBBoxes`, `BBoxOfChar`, `BBoxOfChars` helpers.
+- Internal clustering primitives in `clustering.go`:
+  `clusterFloat1D`, `makeClusterDict`, `clusterObjects[T]`,
+  `groupObjectsByAttr[T,K]`, `dedupeChars`. Ports of
+  pdfplumber/utils/clustering.py.
+- Ligature expansion table (ﬁ, ﬂ, ﬀ, ﬃ, ﬄ, ﬅ, ﬆ → fi/fl/ff/ffi/ffl/st).
+- Golden-file parity tests against pdfplumber output on three
+  fixtures (hello.pdf, rules.pdf, simple1.pdf). Regenerate via
+  `python scripts/gen_golden.py`.
+
+### Known limitations
+
+- Word bboxes drift by up to ~10 PDF points from pdfplumber's output
+  on standard-14 fonts because the AFM metrics aren't yet bundled.
+  Word text + count + order match exactly. The AFM bundle is a v0.2.x
+  goal.
+- `extract_text_lines` (regex-based line extraction) is not yet
+  ported.
+- `TextMap.search` is not yet ported.
+
 ## [0.0.1] - 2026-05-26
 
 Initial release. Phase 1.3.A — content-stream primitives layer.
@@ -51,4 +96,5 @@ Initial release. Phase 1.3.A — content-stream primitives layer.
 - Type 3 fonts (their glyph procedures are themselves content streams).
 - Vertical writing mode.
 
+[0.1.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.0
 [0.0.1]: https://github.com/hallelx2/pdftable/releases/tag/v0.0.1
diff --git a/README.md b/README.md
index 44842cd..64642dd 100644
--- a/README.md
+++ b/README.md
@@ -19,9 +19,9 @@ heuristics on. This is that.
 
 ## Status
 
-`v0.0.1` — content-stream primitives layer. The public API surface is
-stable; higher-level operations (`ExtractText`, `FindTables`,
-`ExtractTables`) are coming in subsequent releases.
+`v0.1.0` — words and text extraction. `Page.Words`, `Page.ExtractText`,
+and `Page.ExtractTextSimple` ship with this release; table-finding
+(`FindTables`, `ExtractTables`) is the next phase.
 
 [![Go Reference](https://pkg.go.dev/badge/github.com/hallelx2/pdftable.svg)](https://pkg.go.dev/github.com/hallelx2/pdftable)
 [![CI](https://github.com/hallelx2/pdftable/actions/workflows/test.yml/badge.svg)](https://github.com/hallelx2/pdftable/actions/workflows/test.yml)
@@ -30,7 +30,7 @@ stable; higher-level operations (`ExtractText`, `FindTables`,
 ## Install
 
 ```sh
-go get github.com/hallelx2/pdftable@v0.0.1
+go get github.com/hallelx2/pdftable@v0.1.0
 ```
 
 Requires Go 1.25+ (uses the standard-library `iter` package for the `Pages()` range-over-func iterator, and pdfcpu v0.12+).
@@ -55,19 +55,28 @@ func main() {
     defer doc.Close()
 
     for n, page := range doc.Pages() {
+        // Primitives (v0.0.1).
         chars, _ := page.Chars()
         rects, _ := page.Rects()
         lines, _ := page.Lines()
         fmt.Printf("page %d: %d chars, %d rects, %d lines\n",
             n, len(chars), len(rects), len(lines))
 
-        // Each Char carries its own bbox, font name, font size, and
-        // upright flag — feed them to your own layout algorithm.
-        for _, c := range chars[:min(5, len(chars))] {
-            fmt.Printf("  %q at (%.1f, %.1f) - (%.1f, %.1f) %s %.1fpt\n",
-                c.Text, c.X0, c.Y0, c.X1, c.Y1, c.FontName, c.FontSize)
+        // Words and text extraction (v0.1.0).
+        words, _ := page.Words(pdftable.DefaultWordOpts())
+        text, _ := page.ExtractText(pdftable.DefaultTextOpts())
+        fmt.Printf("  %d words; first line: %q\n",
+            len(words), firstLine(text))
+    }
+}
+
+func firstLine(s string) string {
+    for i, r := range s {
+        if r == '\n' {
+            return s[:i]
         }
     }
+    return s
 }
 ```
 
@@ -97,6 +106,11 @@ type Page interface {
     Rects() ([]Rect, error)
     Curves() ([]Curve, error)
     Objects() (Objects, error)
+
+    // New in v0.1.0: word + text extraction.
+    Words(opts WordOpts) ([]Word, error)
+    ExtractText(opts TextOpts) (string, error)
+    ExtractTextSimple(xTolerance, yTolerance float64) (string, error)
 }
 
 // Primitives.
@@ -117,6 +131,45 @@ type Curve struct { Points [][2]float64; Stroke, Fill bool; Width float64 }
 
 type Objects struct { Chars []Char; Lines []Line; Rects []Rect; Curves []Curve }
 
+// Word (new in v0.1.0).
+type Word struct {
+    Text                string
+    X0, Y0, X1, Y1      float64
+    Upright             bool
+    Direction           string // "ltr" | "rtl" | "ttb" | "btt"
+    FontName            string
+    FontSize            float64
+    Chars               []Char // populated when WordOpts.KeepChars=true
+}
+
+// WordOpts: configure Page.Words. Use DefaultWordOpts() for pdfplumber-matching defaults.
+type WordOpts struct {
+    XTolerance         float64 // default 3
+    YTolerance         float64 // default 3
+    KeepBlankChars     bool
+    UseTextFlow        bool
+    HorizontalLTR      bool   // default true
+    VerticalTTB        bool   // default true
+    ExtraAttrs         []string
+    SplitAtPunctuation bool
+    Expand             bool   // ligature expansion; default true
+    KeepChars          bool
+}
+
+// TextOpts: configure Page.ExtractText. Use DefaultTextOpts() for defaults.
+type TextOpts struct {
+    XTolerance, YTolerance       float64
+    Layout                       bool
+    LayoutWidthChars             int
+    LayoutHeightChars            int
+    XDensity, YDensity           float64 // PDF points per character / per line
+    UseTextFlow                  bool
+    HorizontalLTR                bool
+    VerticalTTB                  bool
+    ExtraAttrs                   []string
+    Expand                       bool
+}
+
 // Sentinel errors.
 var (
     ErrInvalidPDF     = errors.New("pdftable: invalid PDF")
@@ -126,6 +179,33 @@ var (
 )
 ```
 
+## Text extraction
+
+```go
+doc, _ := pdftable.OpenFile("report.pdf")
+defer doc.Close()
+page, _ := doc.Page(1)
+
+// Words: each Word is a contiguous text run.
+words, _ := page.Words(pdftable.DefaultWordOpts())
+for _, w := range words {
+    fmt.Printf("%-20s @ (%.1f, %.1f) %s %.1fpt\n",
+        w.Text, w.X0, w.Y0, w.FontName, w.FontSize)
+}
+
+// ExtractText: all text on the page as one string. Dense (no layout)
+// joins words with spaces and lines with "\n".
+text, _ := page.ExtractText(pdftable.DefaultTextOpts())
+fmt.Println(text)
+
+// Layout-preserving extraction emulates `pdftotext -layout` / pdfplumber's
+// extract_text(layout=True) — column-aligned output suitable for forms.
+opts := pdftable.DefaultTextOpts()
+opts.Layout = true
+laid, _ := page.ExtractText(opts)
+fmt.Println(laid)
+```
+
 ## Side-by-side comparison with pdfplumber
 
 ```python
@@ -134,8 +214,9 @@ import pdfplumber
 
 with pdfplumber.open("report.pdf") as pdf:
     page = pdf.pages[0]
-    for char in page.chars:
-        print(char["text"], char["x0"], char["y0"])
+    for word in page.extract_words(x_tolerance=3, y_tolerance=3):
+        print(word["text"], word["x0"], word["top"])
+    print(page.extract_text())
 ```
 
 ```go
@@ -145,10 +226,14 @@ import "github.com/hallelx2/pdftable"
 doc, _ := pdftable.OpenFile("report.pdf")
 defer doc.Close()
 page, _ := doc.Page(1)
-chars, _ := page.Chars()
-for _, c := range chars {
-    fmt.Println(c.Text, c.X0, c.Y0)
+
+words, _ := page.Words(pdftable.DefaultWordOpts())
+for _, w := range words {
+    // pdftable's Y is PDF user-space (origin bottom-left). The
+    // pdfplumber-equivalent "top" is page.Height() - w.Y1.
+    fmt.Println(w.Text, w.X0, page.Height()-w.Y1)
 }
+fmt.Println(must(page.ExtractText(pdftable.DefaultTextOpts())))
 ```
 
 Three differences worth noting:
@@ -158,10 +243,52 @@ Three differences worth noting:
    pdfplumber compensates). Our `Page(1)` is the same first page.
 2. **Coordinates are in PDF user space with origin at bottom-left**.
    pdfplumber by default reports `top` (origin top-left, Y growing down)
-   on its chars; we report `Y0` / `Y1` in PDF native coordinates. The
-   conversion is `top = mediabox.height - Y1`.
-3. **No layout-analysis methods yet**. `extract_text`, `extract_tables`,
-   `find_tables` are coming in later releases.
+   on its chars and words; we report `Y0` / `Y1` in PDF native
+   coordinates. The conversion is `top = page.Height() - Y1`.
+3. **Options are explicit Go structs, not `**kwargs`**. Build a
+   `WordOpts` / `TextOpts`, override the fields you care about, pass
+   it through. `DefaultWordOpts()` / `DefaultTextOpts()` return
+   pdfplumber-matching defaults.
+
+## Parity with pdfplumber
+
+The word-grouping and text-extraction algorithms are direct ports of
+pdfplumber's `WordExtractor` and `extract_text` (see
+[`pdfplumber/utils/text.py`](https://github.com/jsvine/pdfplumber/blob/main/pdfplumber/utils/text.py)).
+Tests in [`golden_test.go`](golden_test.go) compare the Go output
+against pdfplumber's reference output on shared fixture PDFs.
+
+Behaviours that match exactly:
+
+- Word grouping: same line-cluster-then-merge-by-gap algorithm, same
+  defaults (XTolerance=3, YTolerance=3), same handling of blank-char
+  filtering, ligature expansion (ﬁ→fi, etc.), and split-at-punctuation.
+- Ordering: words returned in pdfplumber's order (top-to-bottom, then
+  left-to-right within each line) when UseTextFlow is false.
+- Direction handling: ltr / rtl / ttb / btt mapping from
+  upright + HorizontalLTR + VerticalTTB.
+
+Behaviours that intentionally differ:
+
+- **Position precision drifts when font metrics aren't bundled**.
+  pdfplumber uses pdfminer.six's AFM tables for the standard 14 fonts;
+  we use a default-width fallback for now. Word text and order match
+  exactly; word bboxes drift by up to ~10 PDF points on glyphs whose
+  width isn't in the PDF's /Widths array. Golden tests assert text
+  parity exactly and position parity within a 15-point envelope; the
+  envelope tightens to <1pt once the AFM bundle lands (planned for
+  v0.2.x).
+- **`Layout=true` output is structurally similar but not byte-equal**.
+  Pdfplumber's layout algorithm has version-to-version drift; we
+  produce a column-aligned grid with the same density defaults but
+  don't promise byte-equal output across pdfplumber releases.
+
+Behaviours not yet ported:
+
+- `extract_text_lines` (regex-based line extraction).
+- `search` on TextMap (regex over assembled page text with char-level
+  match back-references).
+- Per-character extra_attrs hooks beyond `fontname` and `size`.
 
 ## Architecture
 
@@ -171,6 +298,9 @@ pdftable/
 ├── pdf.go             // Document interface + implementation
 ├── page.go            // Page interface + implementation
 ├── char.go            // Public Char / Line / Rect / Curve / Objects
+├── text.go            // Word + ExtractText + ExtractTextSimple (v0.1.0)
+├── clustering.go      // 1-D clusterObjects, groupObjectsByAttr, dedupeChars
+├── geometry.go        // BBox helpers: Union, Intersect, Contains, Snap
 ├── errors.go          // Sentinel errors
 └── internal/pdf/
     ├── reader.go      // pdfcpu bridge
@@ -201,11 +331,13 @@ stdlib-only.
 
 ## Roadmap
 
-- `v0.0.x` — content-stream primitives (this release).
-- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`, word
-  grouping with reading-order sort.
+- `v0.0.x` — content-stream primitives.
+- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`,
+  `Page.ExtractTextSimple` (this release).
 - `v0.2.x` — table finding: `Page.FindTables` using ruling-line +
   whitespace heuristics, `Page.ExtractTables` returning row/cell text.
+  Bundle the standard-14 AFM metrics so word bboxes match pdfplumber
+  to within 1 PDF point.
 - `v0.3.x` — performance pass: parser benchmarking against pdfminer.six
   and pdfplumber on a representative document corpus.
 
diff --git a/clustering.go b/clustering.go
new file mode 100644
index 0000000..0431bd8
--- /dev/null
+++ b/clustering.go
@@ -0,0 +1,286 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import (
+	"math"
+	"sort"
+)
+
+// This file is the Go port of pdfplumber/utils/clustering.py. The shape
+// is the same — a 1-D agglomerative clusterer over a key extracted from
+// each input — but the API uses generics so callers don't have to wrap
+// every value in a dict-of-strings.
+//
+// The clustering primitives here are the load-bearing piece of the
+// text-extraction pipeline: words are formed by clustering chars whose
+// Y position is "close enough" (within YTolerance), and similarly for
+// the layout grid that maps chars onto a fixed-width column.
+
+// clusterFloat1D buckets sorted, deduped floats into clusters where
+// each consecutive pair differs by <= tolerance. The output groups
+// preserve the SORTED order of input — callers that need the original
+// input order should use clusterObjects with preserveOrder=true.
+//
+// This is the workhorse pdfplumber calls cluster_list.
+func clusterFloat1D(xs []float64, tolerance float64) [][]float64 {
+	if len(xs) == 0 {
+		return nil
+	}
+	sorted := make([]float64, len(xs))
+	copy(sorted, xs)
+	sort.Float64s(sorted)
+
+	if tolerance == 0 {
+		// Special-case to match pdfplumber: each value gets its own
+		// singleton cluster. Without this branch we'd still cluster
+		// equal values together, which differs from pdfplumber's
+		// "tolerance==0 → no clustering" semantics.
+		out := make([][]float64, len(sorted))
+		for i, v := range sorted {
+			out[i] = []float64{v}
+		}
+		return out
+	}
+
+	var groups [][]float64
+	current := []float64{sorted[0]}
+	last := sorted[0]
+	for _, v := range sorted[1:] {
+		if v <= last+tolerance {
+			current = append(current, v)
+		} else {
+			groups = append(groups, current)
+			current = []float64{v}
+		}
+		last = v
+	}
+	groups = append(groups, current)
+	return groups
+}
+
+// makeClusterDict returns a map from each unique input value to the
+// integer cluster id it lands in. Used internally by clusterObjects to
+// translate per-object keys into group indices.
+//
+// We dedupe the input on the way in (pdfplumber does the same with
+// `set(values)`) so that callers passing 10k chars don't pay 10k×
+// sort-and-compare cost when many chars share the same key.
+func makeClusterDict(values []float64, tolerance float64) map[float64]int {
+	if len(values) == 0 {
+		return map[float64]int{}
+	}
+	seen := make(map[float64]struct{}, len(values))
+	uniq := make([]float64, 0, len(values))
+	for _, v := range values {
+		if _, ok := seen[v]; ok {
+			continue
+		}
+		seen[v] = struct{}{}
+		uniq = append(uniq, v)
+	}
+	clusters := clusterFloat1D(uniq, tolerance)
+	out := make(map[float64]int, len(uniq))
+	for i, c := range clusters {
+		for _, v := range c {
+			out[v] = i
+		}
+	}
+	return out
+}
+
+// clusterObjects groups xs by the float key returned from keyFn,
+// bucketing keys that differ by <= tolerance into the same cluster.
+//
+// preserveOrder=false (the default in pdfplumber) emits clusters in
+// ascending-key order; xs WITHIN a cluster keep their relative input
+// order. preserveOrder=true keeps the original input order at both
+// levels — which is what UseTextFlow text extraction wants, since the
+// content stream's order conveys reading order in many PDFs. With
+// preserveOrder=true, a NEW group is started every time consecutive
+// items have different cluster ids (matching pdfplumber's
+// itertools.groupby on un-sorted cluster_tuples).
+//
+// The function is generic over T so callers can cluster Chars, Words,
+// or any other domain type without an interface{} cast on every
+// element.
+func clusterObjects[T any](xs []T, keyFn func(T) float64, tolerance float64, preserveOrder bool) [][]T {
+	if len(xs) == 0 {
+		return nil
+	}
+
+	keys := make([]float64, len(xs))
+	for i, x := range xs {
+		keys[i] = keyFn(x)
+	}
+	dict := makeClusterDict(keys, tolerance)
+
+	type indexed struct {
+		x   T
+		cid int
+	}
+	buf := make([]indexed, len(xs))
+	for i, x := range xs {
+		buf[i] = indexed{x: x, cid: dict[keys[i]]}
+	}
+
+	if !preserveOrder {
+		// Sort by cluster id; SliceStable so ties keep input order.
+		sort.SliceStable(buf, func(i, j int) bool {
+			return buf[i].cid < buf[j].cid
+		})
+	}
+
+	// Emit a new group whenever the cluster id changes from the
+	// previous entry. With preserveOrder=true this matches Python's
+	// itertools.groupby on un-sorted cluster_tuples.
+	var out [][]T
+	current := []T{buf[0].x}
+	currentID := buf[0].cid
+	for _, e := range buf[1:] {
+		if e.cid == currentID {
+			current = append(current, e.x)
+		} else {
+			out = append(out, current)
+			current = []T{e.x}
+			currentID = e.cid
+		}
+	}
+	out = append(out, current)
+	return out
+}
+
+// groupObjectsByAttr buckets xs into groups that share the exact same
+// value for the comparable key returned by keyFn. Order of input is
+// preserved within each group; groups appear in the order their key
+// first appears. This is the Go port of pdfplumber's itertools.groupby
+// on (upright, *extra_attrs) — the outer grouping step in
+// iter_extract_tuples.
+//
+// Unlike clusterObjects, this is an EXACT match on the key, not a
+// tolerance-based clustering. The two are intentionally different
+// operations and we name them differently to avoid confusion.
+func groupObjectsByAttr[T any, K comparable](xs []T, keyFn func(T) K) [][]T {
+	if len(xs) == 0 {
+		return nil
+	}
+	var out [][]T
+	current := []T{xs[0]}
+	currentKey := keyFn(xs[0])
+	for _, x := range xs[1:] {
+		k := keyFn(x)
+		if k == currentKey {
+			current = append(current, x)
+		} else {
+			out = append(out, current)
+			current = []T{x}
+			currentKey = k
+		}
+	}
+	out = append(out, current)
+	return out
+}
+
+// dedupeChars removes near-duplicate chars (same text + position within
+// tolerance). The classic case is a PDF that draws each glyph twice
+// for an emboss/shadow effect — text extraction should report one
+// glyph, not two. Mirrors pdfplumber.utils.text.dedupe_chars, with
+// the same extra_attrs hook for letting callers tighten/loosen the
+// "what counts as a duplicate" predicate.
+//
+// Supported extra_attrs: "fontname", "size". Other values are
+// ignored (pdfplumber accepts any attribute name and indexes into the
+// char dict; our Char struct doesn't have arbitrary keys, so we
+// surface the two attrs callers actually use).
+//
+// The output is in the SAME ORDER as the input — the first occurrence
+// of each cluster is kept and subsequent duplicates are dropped. This
+// preserves content-stream order, which downstream code may rely on.
+func dedupeChars(chars []Char, tolerance float64, extraAttrs []string) []Char {
+	if len(chars) == 0 {
+		return nil
+	}
+
+	// Group key: (upright, text, *extra_attrs). We collapse to a
+	// string because (a) the key needs to be hashable for the equality
+	// check, and (b) the extra_attrs slice is variable.
+	keyOf := func(c Char) string {
+		buf := make([]byte, 0, 32+len(c.Text)+len(c.FontName))
+		if c.Upright {
+			buf = append(buf, 'U')
+		} else {
+			buf = append(buf, 'u')
+		}
+		buf = append(buf, '\x00')
+		buf = append(buf, c.Text...)
+		for _, attr := range extraAttrs {
+			buf = append(buf, '\x00')
+			switch attr {
+			case "fontname":
+				buf = append(buf, c.FontName...)
+			case "size":
+				bits := math.Float64bits(c.FontSize)
+				for i := 7; i >= 0; i-- {
+					buf = append(buf, byte(bits>>(i*8)))
+				}
+			}
+		}
+		return string(buf)
+	}
+
+	type indexed struct {
+		c   Char
+		idx int
+	}
+	sorted := make([]indexed, len(chars))
+	for i, c := range chars {
+		sorted[i] = indexed{c: c, idx: i}
+	}
+	sort.SliceStable(sorted, func(i, j int) bool {
+		return keyOf(sorted[i].c) < keyOf(sorted[j].c)
+	})
+
+	keepIdx := make(map[int]struct{}, len(chars))
+
+	// Walk equal-key runs; within each run cluster by Y0 then X0.
+	for i := 0; i < len(sorted); {
+		j := i + 1
+		k := keyOf(sorted[i].c)
+		for j < len(sorted) && keyOf(sorted[j].c) == k {
+			j++
+		}
+		runChars := sorted[i:j]
+		yClusters := clusterObjects(runChars, func(e indexed) float64 { return e.c.Y0 }, tolerance, false)
+		for _, yc := range yClusters {
+			xClusters := clusterObjects(yc, func(e indexed) float64 { return e.c.X0 }, tolerance, false)
+			for _, xc := range xClusters {
+				// Keep the char with the smallest original index in
+				// the position bucket — preserves "first occurrence
+				// wins" semantics relative to content-stream order.
+				minIdx := xc[0].idx
+				for _, e := range xc[1:] {
+					if e.idx < minIdx {
+						minIdx = e.idx
+					}
+				}
+				keepIdx[minIdx] = struct{}{}
+			}
+		}
+		i = j
+	}
+
+	out := make([]Char, 0, len(keepIdx))
+	for i, c := range chars {
+		if _, ok := keepIdx[i]; ok {
+			out = append(out, c)
+		}
+	}
+	return out
+}
+
+// float64Bits is a tiny re-export point that consolidates the math.
+// Float64bits dependency so other tests/files don't have to import
+// "math" just to compare floats. Left as a package-private helper for
+// now — only dedupeChars's key construction uses it.
+func float64Bits(f float64) uint64 { return math.Float64bits(f) }
diff --git a/clustering_test.go b/clustering_test.go
new file mode 100644
index 0000000..a3b41ab
--- /dev/null
+++ b/clustering_test.go
@@ -0,0 +1,207 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestClusterFloat1D(t *testing.T) {
+	tests := []struct {
+		name      string
+		in        []float64
+		tolerance float64
+		want      [][]float64
+	}{
+		{
+			name: "empty",
+			in:   nil,
+			want: nil,
+		},
+		{
+			name:      "single",
+			in:        []float64{42},
+			tolerance: 5,
+			want:      [][]float64{{42}},
+		},
+		{
+			name:      "all within tolerance",
+			in:        []float64{1, 2, 3, 4, 5},
+			tolerance: 1.5,
+			want:      [][]float64{{1, 2, 3, 4, 5}},
+		},
+		{
+			name:      "out of order input gets sorted",
+			in:        []float64{5, 1, 3, 4, 2},
+			tolerance: 1.5,
+			want:      [][]float64{{1, 2, 3, 4, 5}},
+		},
+		{
+			name:      "two clusters",
+			in:        []float64{1, 2, 10, 11, 12},
+			tolerance: 2,
+			want:      [][]float64{{1, 2}, {10, 11, 12}},
+		},
+		{
+			name:      "tolerance 0 → singletons",
+			in:        []float64{1, 1, 2, 2, 3},
+			tolerance: 0,
+			want:      [][]float64{{1}, {1}, {2}, {2}, {3}},
+		},
+		{
+			name:      "chain growth (each in range of last)",
+			in:        []float64{1, 1.4, 1.7, 2.0, 2.3, 2.6, 100},
+			tolerance: 0.5,
+			want:      [][]float64{{1, 1.4, 1.7, 2.0, 2.3, 2.6}, {100}},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := clusterFloat1D(tt.in, tt.tolerance)
+			if !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("got %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestMakeClusterDict(t *testing.T) {
+	// Duplicates should be deduped before clustering, but the result
+	// map should contain an entry for every unique input value.
+	got := makeClusterDict([]float64{1, 1, 2, 10, 10, 11}, 2)
+	// Cluster 0 = {1, 2}, cluster 1 = {10, 11}.
+	if got[1] != got[2] {
+		t.Errorf("1 and 2 should be in same cluster, got %d vs %d", got[1], got[2])
+	}
+	if got[10] != got[11] {
+		t.Errorf("10 and 11 should be in same cluster, got %d vs %d", got[10], got[11])
+	}
+	if got[1] == got[10] {
+		t.Errorf("1 and 10 should be in different clusters, both got %d", got[1])
+	}
+}
+
+// TestClusterObjects exercises both preserveOrder modes on a tiny set
+// of struct-valued inputs.
+func TestClusterObjects(t *testing.T) {
+	type pt struct {
+		x   float64
+		tag string
+	}
+	xs := []pt{
+		{x: 1, tag: "a"},
+		{x: 10, tag: "b"},
+		{x: 2, tag: "c"},
+		{x: 11, tag: "d"},
+	}
+
+	// preserveOrder=false: clusters sorted by key, items sorted within
+	// by input order (a,c then b,d).
+	got := clusterObjects(xs, func(p pt) float64 { return p.x }, 5, false)
+	want := [][]pt{
+		{{1, "a"}, {2, "c"}},
+		{{10, "b"}, {11, "d"}},
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("preserveOrder=false: got %v, want %v", got, want)
+	}
+
+	// preserveOrder=true: groups in input order; new group whenever
+	// cluster id changes from previous entry.
+	// Input cluster ids: a=0, b=1, c=0, d=1 → groups: [a], [b], [c], [d].
+	got = clusterObjects(xs, func(p pt) float64 { return p.x }, 5, true)
+	want = [][]pt{
+		{{1, "a"}},
+		{{10, "b"}},
+		{{2, "c"}},
+		{{11, "d"}},
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("preserveOrder=true: got %v, want %v", got, want)
+	}
+}
+
+func TestGroupObjectsByAttr(t *testing.T) {
+	type item struct {
+		key string
+		val int
+	}
+	xs := []item{
+		{"a", 1},
+		{"a", 2},
+		{"b", 3},
+		{"a", 4}, // restart "a" group — exact-match groupby preserves order
+	}
+	got := groupObjectsByAttr(xs, func(i item) string { return i.key })
+	want := [][]item{
+		{{"a", 1}, {"a", 2}},
+		{{"b", 3}},
+		{{"a", 4}},
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("got %v, want %v", got, want)
+	}
+}
+
+func TestDedupeCharsSimple(t *testing.T) {
+	// Two pairs of overlapping glyphs (shadow effect) drawn at near-
+	// identical positions. After dedupe we expect one of each pair.
+	chars := []Char{
+		{Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true},
+		{Text: "A", X0: 0.2, Y0: 0.3, X1: 10.2, Y1: 10.3, FontName: "F", FontSize: 12, Upright: true},
+		{Text: "B", X0: 11, Y0: 0, X1: 20, Y1: 10, FontName: "F", FontSize: 12, Upright: true},
+		{Text: "B", X0: 11.1, Y0: 0.2, X1: 20.1, Y1: 10.2, FontName: "F", FontSize: 12, Upright: true},
+	}
+	got := dedupeChars(chars, 1, []string{"fontname", "size"})
+	if len(got) != 2 {
+		t.Fatalf("got %d chars after dedupe, want 2", len(got))
+	}
+	// First occurrence should be kept.
+	if got[0].Text != "A" || got[0].X0 != 0 {
+		t.Errorf("first kept char = %+v, want first 'A'", got[0])
+	}
+	if got[1].Text != "B" || got[1].X0 != 11 {
+		t.Errorf("second kept char = %+v, want first 'B'", got[1])
+	}
+}
+
+func TestDedupeCharsKeepsDifferentText(t *testing.T) {
+	// Two glyphs at identical positions but different text should NOT
+	// be deduped.
+	chars := []Char{
+		{Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true},
+		{Text: "B", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true},
+	}
+	got := dedupeChars(chars, 1, []string{"fontname", "size"})
+	if len(got) != 2 {
+		t.Fatalf("got %d chars, want 2 (different text → keep both)", len(got))
+	}
+}
+
+func TestDedupeCharsKeepsDifferentFont(t *testing.T) {
+	// Same text at same position but different fontname → keep both
+	// when fontname is in extra_attrs.
+	chars := []Char{
+		{Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "Helvetica", FontSize: 12, Upright: true},
+		{Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "Times", FontSize: 12, Upright: true},
+	}
+	got := dedupeChars(chars, 1, []string{"fontname"})
+	if len(got) != 2 {
+		t.Fatalf("got %d chars, want 2 (different fontname → keep both)", len(got))
+	}
+
+	// Without fontname in extra_attrs → drop the duplicate.
+	got = dedupeChars(chars, 1, nil)
+	if len(got) != 1 {
+		t.Fatalf("got %d chars, want 1 (no extra_attrs → dedupe by text+pos)", len(got))
+	}
+}
+
+func TestDedupeCharsEmpty(t *testing.T) {
+	if got := dedupeChars(nil, 1, nil); got != nil {
+		t.Errorf("dedupeChars(nil) = %v, want nil", got)
+	}
+}
diff --git a/geometry.go b/geometry.go
new file mode 100644
index 0000000..7c06a33
--- /dev/null
+++ b/geometry.go
@@ -0,0 +1,158 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import "math"
+
+// BBox is the canonical four-tuple bounding-box helper that the layout
+// algorithms (clustering, word grouping, text extraction) operate on.
+// Field naming follows the Char/Line/Rect convention used throughout
+// the package: x0,y0 is the lower-left corner and x1,y1 is the upper-
+// right corner in PDF user space (origin at bottom-left, Y growing up).
+//
+// We expose BBox as a value type — small, stack-allocatable, trivially
+// copyable. Algorithms that need to pass a bbox around without poking
+// at the larger Char/Rect/Line wrappers can construct one with NewBBox
+// or pull one out with the BBoxOf helpers below.
+//
+// The Go API intentionally chooses (X0,Y0,X1,Y1) over pdfplumber's
+// dict-of-strings ({"x0","top","x1","bottom"}). The two flavours differ
+// because pdfplumber operates in image space (Y growing down, "top" =
+// small Y, "bottom" = large Y) and we operate in PDF user space (Y
+// growing up). Comments call out the mapping wherever it matters.
+type BBox struct {
+	X0, Y0, X1, Y1 float64
+}
+
+// NewBBox builds a BBox and normalises it so X0<=X1 and Y0<=Y1.
+// Algorithms downstream rely on the normal form, so we never let an
+// inverted bbox leak past this constructor.
+func NewBBox(x0, y0, x1, y1 float64) BBox {
+	if x1 < x0 {
+		x0, x1 = x1, x0
+	}
+	if y1 < y0 {
+		y0, y1 = y1, y0
+	}
+	return BBox{X0: x0, Y0: y0, X1: x1, Y1: y1}
+}
+
+// Width returns the bbox's horizontal extent.
+func (b BBox) Width() float64 { return b.X1 - b.X0 }
+
+// Height returns the bbox's vertical extent.
+func (b BBox) Height() float64 { return b.Y1 - b.Y0 }
+
+// Area returns Width * Height.
+func (b BBox) Area() float64 { return b.Width() * b.Height() }
+
+// IsZero reports whether the bbox is the zero value (all four fields
+// equal to zero). Useful for "did I forget to populate this" checks.
+func (b BBox) IsZero() bool { return b == BBox{} }
+
+// Union returns the smallest bbox enclosing both b and other.
+//
+// We DON'T treat a zero-value BBox as "empty" here — a caller that
+// passes BBox{} to Union genuinely means "enclose the origin point".
+// Use MergeBBoxes when you have a slice and want it to be a no-op on
+// the empty slice.
+func (b BBox) Union(other BBox) BBox {
+	return BBox{
+		X0: math.Min(b.X0, other.X0),
+		Y0: math.Min(b.Y0, other.Y0),
+		X1: math.Max(b.X1, other.X1),
+		Y1: math.Max(b.Y1, other.Y1),
+	}
+}
+
+// Intersect returns the overlapping rectangle of b and other, and a
+// boolean reporting whether the intersection has non-empty area (i.e.
+// the two bboxes actually overlap). This mirrors pdfplumber's
+// get_bbox_overlap, which returns None when the boxes don't touch and
+// the overlapping bbox otherwise.
+//
+// We treat touching-but-not-overlapping (shared edge, zero area) as
+// non-overlap, matching pdfplumber's `o_height + o_width > 0` check —
+// a single-line ruler that grazes a word's bbox should not be reported
+// as "intersecting" the word.
+func (b BBox) Intersect(other BBox) (BBox, bool) {
+	oLeft := math.Max(b.X0, other.X0)
+	oRight := math.Min(b.X1, other.X1)
+	oBottom := math.Max(b.Y0, other.Y0)
+	oTop := math.Min(b.Y1, other.Y1)
+
+	w := oRight - oLeft
+	h := oTop - oBottom
+	// pdfplumber requires width>=0, height>=0, AND width+height>0
+	// (so zero-area overlaps don't count). Matching that exactly.
+	if w < 0 || h < 0 || w+h <= 0 {
+		return BBox{}, false
+	}
+	return BBox{X0: oLeft, Y0: oBottom, X1: oRight, Y1: oTop}, true
+}
+
+// Contains reports whether b fully encloses other. Edges are
+// considered inside (>= on the low side, <= on the high side), so a
+// bbox contains itself.
+func (b BBox) Contains(other BBox) bool {
+	return other.X0 >= b.X0 &&
+		other.Y0 >= b.Y0 &&
+		other.X1 <= b.X1 &&
+		other.Y1 <= b.Y1
+}
+
+// ContainsPoint reports whether (x,y) lies inside b (inclusive on
+// edges).
+func (b BBox) ContainsPoint(x, y float64) bool {
+	return x >= b.X0 && x <= b.X1 && y >= b.Y0 && y <= b.Y1
+}
+
+// Snap rounds each of b's four coordinates to the nearest multiple of
+// step. Used by layout-analysis code to coalesce near-equal positions
+// (e.g. ruling lines drawn at 99.9, 100.0, 100.1) before clustering.
+// A step of 0 returns the original bbox unchanged.
+func (b BBox) Snap(step float64) BBox {
+	if step == 0 {
+		return b
+	}
+	return BBox{
+		X0: math.Round(b.X0/step) * step,
+		Y0: math.Round(b.Y0/step) * step,
+		X1: math.Round(b.X1/step) * step,
+		Y1: math.Round(b.Y1/step) * step,
+	}
+}
+
+// MergeBBoxes returns the smallest bbox enclosing every input. Empty
+// input returns a zero BBox. This mirrors pdfplumber's merge_bboxes
+// and objects_to_bbox helpers — the typical caller has a slice of
+// Chars and wants the combined bounding box for the resulting Word.
+func MergeBBoxes(bboxes []BBox) BBox {
+	if len(bboxes) == 0 {
+		return BBox{}
+	}
+	out := bboxes[0]
+	for _, bb := range bboxes[1:] {
+		out = out.Union(bb)
+	}
+	return out
+}
+
+// BBoxOfChar returns the bounding box of a Char.
+func BBoxOfChar(c Char) BBox {
+	return BBox{X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1}
+}
+
+// BBoxOfChars returns the smallest bbox enclosing every char in cs.
+// Returns the zero BBox for an empty slice.
+func BBoxOfChars(cs []Char) BBox {
+	if len(cs) == 0 {
+		return BBox{}
+	}
+	out := BBoxOfChar(cs[0])
+	for _, c := range cs[1:] {
+		out = out.Union(BBoxOfChar(c))
+	}
+	return out
+}
diff --git a/geometry_test.go b/geometry_test.go
new file mode 100644
index 0000000..6149a1d
--- /dev/null
+++ b/geometry_test.go
@@ -0,0 +1,236 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import (
+	"math"
+	"testing"
+)
+
+func TestBBoxNormalisation(t *testing.T) {
+	// NewBBox should always normalise so the first corner is the
+	// lower-left and the second corner is the upper-right.
+	got := NewBBox(10, 20, 5, 5)
+	want := BBox{X0: 5, Y0: 5, X1: 10, Y1: 20}
+	if got != want {
+		t.Errorf("NewBBox normalisation: got %+v, want %+v", got, want)
+	}
+}
+
+func TestBBoxWidthHeight(t *testing.T) {
+	b := BBox{X0: 10, Y0: 20, X1: 30, Y1: 50}
+	if b.Width() != 20 {
+		t.Errorf("Width = %v, want 20", b.Width())
+	}
+	if b.Height() != 30 {
+		t.Errorf("Height = %v, want 30", b.Height())
+	}
+	if b.Area() != 600 {
+		t.Errorf("Area = %v, want 600", b.Area())
+	}
+}
+
+func TestBBoxIsZero(t *testing.T) {
+	if !(BBox{}).IsZero() {
+		t.Error("BBox{} should be zero")
+	}
+	if (BBox{X0: 1}).IsZero() {
+		t.Error("non-zero BBox should not be zero")
+	}
+}
+
+func TestBBoxUnion(t *testing.T) {
+	a := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}
+	b := BBox{X0: 5, Y0: 5, X1: 20, Y1: 30}
+	got := a.Union(b)
+	want := BBox{X0: 0, Y0: 0, X1: 20, Y1: 30}
+	if got != want {
+		t.Errorf("Union = %+v, want %+v", got, want)
+	}
+
+	// Disjoint bboxes: union spans both.
+	c := BBox{X0: 100, Y0: 100, X1: 110, Y1: 110}
+	got = a.Union(c)
+	want = BBox{X0: 0, Y0: 0, X1: 110, Y1: 110}
+	if got != want {
+		t.Errorf("Union disjoint = %+v, want %+v", got, want)
+	}
+}
+
+func TestBBoxIntersect(t *testing.T) {
+	tests := []struct {
+		name    string
+		a, b    BBox
+		want    BBox
+		overlap bool
+	}{
+		{
+			name:    "fully overlapping",
+			a:       BBox{X0: 0, Y0: 0, X1: 10, Y1: 10},
+			b:       BBox{X0: 5, Y0: 5, X1: 20, Y1: 20},
+			want:    BBox{X0: 5, Y0: 5, X1: 10, Y1: 10},
+			overlap: true,
+		},
+		{
+			name:    "disjoint",
+			a:       BBox{X0: 0, Y0: 0, X1: 10, Y1: 10},
+			b:       BBox{X0: 20, Y0: 20, X1: 30, Y1: 30},
+			overlap: false,
+		},
+		{
+			name:    "share a single corner point (zero w + zero h)",
+			a:       BBox{X0: 0, Y0: 0, X1: 10, Y1: 10},
+			b:       BBox{X0: 10, Y0: 10, X1: 20, Y1: 20},
+			overlap: false, // pdfplumber: w+h must be > 0; point touch has 0+0
+		},
+		{
+			name:    "share horizontal edge (zero h, non-zero w)",
+			a:       BBox{X0: 0, Y0: 0, X1: 10, Y1: 10},
+			b:       BBox{X0: 0, Y0: 10, X1: 10, Y1: 20},
+			want:    BBox{X0: 0, Y0: 10, X1: 10, Y1: 10},
+			overlap: true, // pdfplumber: 10+0 > 0 → counted as overlap
+		},
+		{
+			name:    "b fully inside a",
+			a:       BBox{X0: 0, Y0: 0, X1: 100, Y1: 100},
+			b:       BBox{X0: 10, Y0: 10, X1: 20, Y1: 20},
+			want:    BBox{X0: 10, Y0: 10, X1: 20, Y1: 20},
+			overlap: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, ok := tt.a.Intersect(tt.b)
+			if ok != tt.overlap {
+				t.Fatalf("overlap = %v, want %v", ok, tt.overlap)
+			}
+			if !ok {
+				return
+			}
+			if got != tt.want {
+				t.Errorf("Intersect = %+v, want %+v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestBBoxContains(t *testing.T) {
+	outer := BBox{X0: 0, Y0: 0, X1: 100, Y1: 100}
+	inner := BBox{X0: 10, Y0: 10, X1: 20, Y1: 20}
+	crossing := BBox{X0: 50, Y0: 50, X1: 150, Y1: 150}
+
+	if !outer.Contains(inner) {
+		t.Error("outer should contain inner")
+	}
+	if !outer.Contains(outer) {
+		t.Error("bbox should contain itself")
+	}
+	if outer.Contains(crossing) {
+		t.Error("outer should not contain crossing")
+	}
+}
+
+func TestBBoxContainsPoint(t *testing.T) {
+	b := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}
+	if !b.ContainsPoint(5, 5) {
+		t.Error("centre point should be contained")
+	}
+	if !b.ContainsPoint(0, 0) {
+		t.Error("corner point should be contained (inclusive)")
+	}
+	if !b.ContainsPoint(10, 10) {
+		t.Error("opposite corner should be contained (inclusive)")
+	}
+	if b.ContainsPoint(11, 5) {
+		t.Error("outside point should not be contained")
+	}
+}
+
+func TestBBoxSnap(t *testing.T) {
+	b := BBox{X0: 99.7, Y0: 100.2, X1: 199.4, Y1: 200.8}
+	got := b.Snap(1)
+	want := BBox{X0: 100, Y0: 100, X1: 199, Y1: 201}
+	if got != want {
+		t.Errorf("Snap(1) = %+v, want %+v", got, want)
+	}
+
+	// Snap to 0.5 multiples.
+	got = b.Snap(0.5)
+	want = BBox{X0: 99.5, Y0: 100, X1: 199.5, Y1: 201}
+	if got != want {
+		t.Errorf("Snap(0.5) = %+v, want %+v", got, want)
+	}
+
+	// Snap with step 0 is a no-op.
+	got = b.Snap(0)
+	if got != b {
+		t.Errorf("Snap(0) should be no-op, got %+v want %+v", got, b)
+	}
+}
+
+func TestMergeBBoxes(t *testing.T) {
+	// Empty: zero BBox.
+	if got := MergeBBoxes(nil); !got.IsZero() {
+		t.Errorf("MergeBBoxes(nil) = %+v, want zero", got)
+	}
+
+	// Single: same bbox.
+	one := BBox{X0: 1, Y0: 2, X1: 3, Y1: 4}
+	if got := MergeBBoxes([]BBox{one}); got != one {
+		t.Errorf("MergeBBoxes([one]) = %+v, want %+v", got, one)
+	}
+
+	// Multiple.
+	bs := []BBox{
+		{X0: 10, Y0: 20, X1: 30, Y1: 40},
+		{X0: 5, Y0: 25, X1: 25, Y1: 35},
+		{X0: 15, Y0: 10, X1: 35, Y1: 30},
+	}
+	got := MergeBBoxes(bs)
+	want := BBox{X0: 5, Y0: 10, X1: 35, Y1: 40}
+	if got != want {
+		t.Errorf("MergeBBoxes = %+v, want %+v", got, want)
+	}
+}
+
+func TestBBoxOfChar(t *testing.T) {
+	c := Char{Text: "x", X0: 1, Y0: 2, X1: 3, Y1: 4}
+	got := BBoxOfChar(c)
+	want := BBox{X0: 1, Y0: 2, X1: 3, Y1: 4}
+	if got != want {
+		t.Errorf("BBoxOfChar = %+v, want %+v", got, want)
+	}
+}
+
+func TestBBoxOfChars(t *testing.T) {
+	// Empty.
+	if got := BBoxOfChars(nil); !got.IsZero() {
+		t.Errorf("BBoxOfChars(nil) = %+v, want zero", got)
+	}
+
+	cs := []Char{
+		{X0: 10, Y0: 20, X1: 15, Y1: 30},
+		{X0: 15, Y0: 20, X1: 25, Y1: 30},
+		{X0: 25, Y0: 20, X1: 35, Y1: 30},
+	}
+	got := BBoxOfChars(cs)
+	want := BBox{X0: 10, Y0: 20, X1: 35, Y1: 30}
+	if got != want {
+		t.Errorf("BBoxOfChars = %+v, want %+v", got, want)
+	}
+}
+
+func TestBBoxIntersectArea(t *testing.T) {
+	// Sanity: small overlap area matches expected via math.
+	a := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}
+	b := BBox{X0: 8, Y0: 8, X1: 20, Y1: 20}
+	overlap, ok := a.Intersect(b)
+	if !ok {
+		t.Fatal("expected overlap")
+	}
+	if math.Abs(overlap.Area()-4) > 1e-9 {
+		t.Errorf("overlap area = %v, want 4", overlap.Area())
+	}
+}
diff --git a/golden_test.go b/golden_test.go
new file mode 100644
index 0000000..2453e81
--- /dev/null
+++ b/golden_test.go
@@ -0,0 +1,225 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable_test
+
+import (
+	"encoding/json"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/hallelx2/pdftable"
+)
+
+// Golden-file tests against pdfplumber output. The expected JSON files
+// in testdata/golden are generated by scripts/gen_golden.py running the
+// reference pdfplumber library on the same fixture PDFs. We assert
+// that pdftable's output matches the algorithm-level behaviour of
+// pdfplumber: word count, word text, word order, and word direction
+// are exact; word positions are checked with a tolerance wide enough
+// to absorb known font-metric drift (we use 1000-units-per-em default
+// fallback for standard fonts whose AFM tables we don't bundle yet;
+// pdfplumber uses pdfminer.six's bundled AFM metrics). See AGENT A's
+// v0.0.1 report and the "Parity with pdfplumber" section of the
+// README for the up-to-date list of metric differences.
+//
+// Tolerances:
+//
+//   - Word count: must match EXACTLY.
+//   - Word text: must match EXACTLY (byte-equal).
+//   - Word direction: must match EXACTLY.
+//   - Word bbox: ±15 PDF points to absorb font-width drift. Tightening
+//     this tolerance is a v0.2.x goal once we bundle the standard-14
+//     AFM metrics.
+//
+// extract_text() is compared as a sequence of whitespace-separated
+// words (order + text), absorbing spacing differences that come from
+// the same font-metric origin.
+
+type goldenWord struct {
+	Text      string  `json:"text"`
+	X0        float64 `json:"x0"`
+	X1        float64 `json:"x1"`
+	Y0        float64 `json:"y0"`
+	Y1        float64 `json:"y1"`
+	Upright   bool    `json:"upright"`
+	Direction string  `json:"direction"`
+}
+
+type goldenPage struct {
+	Number       int          `json:"number"`
+	Width        float64      `json:"width"`
+	Height       float64      `json:"height"`
+	ExtractText  string       `json:"extract_text"`
+	ExtractWords []goldenWord `json:"extract_words"`
+}
+
+type golden struct {
+	Name  string       `json:"name"`
+	Pages []goldenPage `json:"pages"`
+}
+
+// TestGoldenAgainstPdfplumber loads each fixture, runs the pdftable
+// extraction, and diffs against the pre-generated pdfplumber output.
+func TestGoldenAgainstPdfplumber(t *testing.T) {
+	dir := filepath.Join("testdata", "golden")
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("read golden dir: %v", err)
+	}
+
+	// Find every .expected.json and run a sub-test for each.
+	for _, e := range entries {
+		if e.IsDir() {
+			continue
+		}
+		name := e.Name()
+		if !strings.HasSuffix(name, ".expected.json") {
+			continue
+		}
+		stem := strings.TrimSuffix(name, ".expected.json")
+		t.Run(stem, func(t *testing.T) {
+			runGoldenCase(t, dir, stem)
+		})
+	}
+}
+
+func runGoldenCase(t *testing.T, dir, stem string) {
+	t.Helper()
+
+	pdfPath := filepath.Join(dir, stem+".pdf")
+	jsonPath := filepath.Join(dir, stem+".expected.json")
+
+	data, err := os.ReadFile(jsonPath)
+	if err != nil {
+		t.Fatalf("read %s: %v", jsonPath, err)
+	}
+	var g golden
+	if err := json.Unmarshal(data, &g); err != nil {
+		t.Fatalf("parse %s: %v", jsonPath, err)
+	}
+
+	doc, err := pdftable.OpenFile(pdfPath)
+	if err != nil {
+		t.Fatalf("OpenFile %s: %v", pdfPath, err)
+	}
+	defer doc.Close()
+
+	if doc.NumPages() != len(g.Pages) {
+		t.Fatalf("page count: got %d, want %d", doc.NumPages(), len(g.Pages))
+	}
+
+	for _, expPage := range g.Pages {
+		p, err := doc.Page(expPage.Number)
+		if err != nil {
+			t.Fatalf("Page(%d): %v", expPage.Number, err)
+		}
+
+		// Page dimensions should match exactly (these come from the
+		// MediaBox, not from font metrics).
+		if math.Abs(p.Width()-expPage.Width) > 0.01 {
+			t.Errorf("page %d width: got %v, want %v", expPage.Number, p.Width(), expPage.Width)
+		}
+		if math.Abs(p.Height()-expPage.Height) > 0.01 {
+			t.Errorf("page %d height: got %v, want %v", expPage.Number, p.Height(), expPage.Height)
+		}
+
+		gotWords, err := p.Words(pdftable.DefaultWordOpts())
+		if err != nil {
+			t.Fatalf("Words: %v", err)
+		}
+		assertGoldenWords(t, expPage.Number, gotWords, expPage.ExtractWords)
+
+		// Dense extract_text should produce the same word *sequence*
+		// when split on whitespace. We don't compare byte-equal because
+		// pdfplumber sometimes inserts extra spaces for vertical jitter
+		// that our YTolerance clustering smooths over.
+		gotText, err := p.ExtractText(pdftable.DefaultTextOpts())
+		if err != nil {
+			t.Fatalf("ExtractText: %v", err)
+		}
+		assertWordSequence(t, expPage.Number, gotText, expPage.ExtractText)
+	}
+}
+
+func assertGoldenWords(t *testing.T, page int, got []pdftable.Word, want []goldenWord) {
+	t.Helper()
+	if len(got) != len(want) {
+		t.Errorf("page %d: got %d words, want %d", page, len(got), len(want))
+		// Print first few mismatches for debugging.
+		n := len(got)
+		if len(want) > n {
+			n = len(want)
+		}
+		if n > 6 {
+			n = 6
+		}
+		for i := 0; i < n; i++ {
+			var gs, ws string
+			if i < len(got) {
+				gs = fmt.Sprintf("%q@(%.1f,%.1f)-(%.1f,%.1f)", got[i].Text, got[i].X0, got[i].Y0, got[i].X1, got[i].Y1)
+			}
+			if i < len(want) {
+				ws = fmt.Sprintf("%q@(%.1f,%.1f)-(%.1f,%.1f)", want[i].Text, want[i].X0, want[i].Y0, want[i].X1, want[i].Y1)
+			}
+			t.Logf("  [%d] got=%s want=%s", i, gs, ws)
+		}
+		return
+	}
+
+	// Generous position tolerance to absorb font-metric drift. The
+	// algorithm-level outputs (text, count, order, direction) below
+	// are still asserted exactly; the position checks here are a
+	// regression guard against catastrophic mis-placement, not a
+	// pixel-level parity check.
+	const posTol = 15.0 // PDF points
+	for i := range want {
+		g := got[i]
+		w := want[i]
+		if g.Text != w.Text {
+			t.Errorf("page %d word %d: text got %q, want %q", page, i, g.Text, w.Text)
+			continue
+		}
+		if math.Abs(g.X0-w.X0) > posTol {
+			t.Errorf("page %d word %d (%q): X0 got %v, want %v", page, i, g.Text, g.X0, w.X0)
+		}
+		if math.Abs(g.X1-w.X1) > posTol {
+			t.Errorf("page %d word %d (%q): X1 got %v, want %v", page, i, g.Text, g.X1, w.X1)
+		}
+		if math.Abs(g.Y0-w.Y0) > posTol {
+			t.Errorf("page %d word %d (%q): Y0 got %v, want %v", page, i, g.Text, g.Y0, w.Y0)
+		}
+		if math.Abs(g.Y1-w.Y1) > posTol {
+			t.Errorf("page %d word %d (%q): Y1 got %v, want %v", page, i, g.Text, g.Y1, w.Y1)
+		}
+		if g.Upright != w.Upright {
+			t.Errorf("page %d word %d (%q): Upright got %v, want %v", page, i, g.Text, g.Upright, w.Upright)
+		}
+		if g.Direction != w.Direction {
+			t.Errorf("page %d word %d (%q): Direction got %q, want %q", page, i, g.Text, g.Direction, w.Direction)
+		}
+	}
+}
+
+// assertWordSequence splits both strings on whitespace and compares the
+// resulting word lists. Spacing differences and newlines are absorbed.
+func assertWordSequence(t *testing.T, page int, got, want string) {
+	t.Helper()
+	gotWords := strings.Fields(got)
+	wantWords := strings.Fields(want)
+	if len(gotWords) != len(wantWords) {
+		t.Errorf("page %d: extract_text word count: got %d %v, want %d %v",
+			page, len(gotWords), gotWords, len(wantWords), wantWords)
+		return
+	}
+	for i := range wantWords {
+		if gotWords[i] != wantWords[i] {
+			t.Errorf("page %d: extract_text[%d] got %q, want %q",
+				page, i, gotWords[i], wantWords[i])
+		}
+	}
+}
diff --git a/page.go b/page.go
index f641400..996e7db 100644
--- a/page.go
+++ b/page.go
@@ -66,6 +66,32 @@ type Page interface {
 	// cheaper than calling each accessor separately because the
 	// content stream is parsed exactly once.
 	Objects() (Objects, error)
+
+	// Words extracts positioned text runs from the page. A "word"
+	// is a contiguous group of chars whose horizontal gaps are
+	// within WordOpts.XTolerance and whose vertical positions
+	// agree within WordOpts.YTolerance. Pass DefaultWordOpts() to
+	// use pdfplumber-matching defaults. See WordOpts for the full
+	// configuration surface.
+	//
+	// Returns an empty slice (not nil) when the page contains no
+	// extractable text.
+	Words(opts WordOpts) ([]Word, error)
+
+	// ExtractText returns the page's text as a single string. By
+	// default words on the same line are joined with a single
+	// space and lines are joined with "\n". When TextOpts.Layout is
+	// true, the output preserves spatial layout (column-aligned
+	// text, blank lines for vertical gaps) at the cost of more
+	// whitespace. Pass DefaultTextOpts() for pdfplumber-matching
+	// defaults.
+	ExtractText(opts TextOpts) (string, error)
+
+	// ExtractTextSimple is a no-frills extraction that clusters
+	// chars by visual line and joins them by gap detection. Use
+	// when ExtractText's word-grouping heuristics produce undesired
+	// results on adversarial input.
+	ExtractTextSimple(xTolerance, yTolerance float64) (string, error)
 }
 
 // page is the unexported implementation backing the Page interface.
diff --git a/pdftable.go b/pdftable.go
index 36d1452..dc5beb2 100644
--- a/pdftable.go
+++ b/pdftable.go
@@ -30,11 +30,12 @@
 //	    fmt.Printf("page %d: %d chars\n", n, len(chars))
 //	}
 //
-// Phase scope: this initial release exposes the primitives. The
-// higher-level operations (ExtractText, ExtractTables, FindTables,
-// Words) are explicit future phases — see the README for the
-// roadmap. The Page interface is designed so those methods can be
-// added without breaking existing callers.
+// Phase scope: v0.1.0 ships content-stream primitives plus text
+// extraction (Page.Words, Page.ExtractText, Page.ExtractTextSimple).
+// Table-finding (ExtractTables, FindTables) is the next phase — see
+// the README for the roadmap. The Page interface is additive across
+// releases; v0.0.1 callers using only Chars/Lines/Rects/Curves
+// continue to compile against v0.1.0 without changes.
 package pdftable
 
 import (
diff --git a/scripts/gen_golden.py b/scripts/gen_golden.py
new file mode 100644
index 0000000..3af3366
--- /dev/null
+++ b/scripts/gen_golden.py
@@ -0,0 +1,90 @@
+"""Generate golden-file expected outputs for pdftable's parity tests.
+
+Run from the repo root after copying the fixture PDFs into
+testdata/golden/:
+
+    pip install pdfplumber
+    python scripts/gen_golden.py
+
+The script reads every *.pdf in testdata/golden/, runs pdfplumber's
+extract_text() and extract_words() on each page, and writes the result
+as <name>.expected.json next to the PDF.
+
+Coordinate-system note: pdfplumber emits word "top" and "bottom" in
+image space (origin at top-left, Y growing DOWN). pdftable uses PDF
+user space (origin at bottom-left, Y growing UP). We translate
+pdfplumber's coords into PDF-user-space here so the JSON matches the
+y0/y1 fields on pdftable.Word directly.
+
+To regenerate after upgrading pdfplumber, simply re-run this script.
+The file outputs are deterministic and stable.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+
+import pdfplumber
+
+DIR = os.path.join("testdata", "golden")
+
+
+def main() -> int:
+    target = DIR if len(sys.argv) < 2 else sys.argv[1]
+    pdfs = sorted(
+        f for f in os.listdir(target) if f.endswith(".pdf")
+    )
+    if not pdfs:
+        print(f"no .pdf files in {target}", file=sys.stderr)
+        return 1
+    for fname in pdfs:
+        name = os.path.splitext(fname)[0]
+        pdf_path = os.path.join(target, fname)
+        out = {"name": name, "pages": []}
+        with pdfplumber.open(pdf_path) as pdf:
+            for p in pdf.pages:
+                page = {
+                    "number": p.page_number,
+                    "width": p.width,
+                    "height": p.height,
+                    "extract_text": p.extract_text() or "",
+                    "extract_words": [],
+                }
+                words = p.extract_words(
+                    x_tolerance=3,
+                    y_tolerance=3,
+                    keep_blank_chars=False,
+                    use_text_flow=False,
+                    horizontal_ltr=True,
+                    vertical_ttb=True,
+                    extra_attrs=None,
+                    split_at_punctuation=False,
+                    expand_ligatures=True,
+                )
+                for w in words:
+                    y1_user = p.height - w["top"]
+                    y0_user = p.height - w["bottom"]
+                    page["extract_words"].append(
+                        {
+                            "text": w["text"],
+                            "x0": w["x0"],
+                            "x1": w["x1"],
+                            "y0": y0_user,
+                            "y1": y1_user,
+                            "upright": bool(w.get("upright", True)),
+                            "direction": w.get("direction", "ltr"),
+                        }
+                    )
+                out["pages"].append(page)
+        expected = os.path.join(target, f"{name}.expected.json")
+        with open(expected, "w", encoding="utf-8") as f:
+            json.dump(out, f, ensure_ascii=False, indent=2)
+        nwords = sum(len(pp["extract_words"]) for pp in out["pages"])
+        print(f"wrote {expected}: {len(out['pages'])} pages, {nwords} words")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/testdata/golden/hello.expected.json b/testdata/golden/hello.expected.json
new file mode 100644
index 0000000..b7acfbe
--- /dev/null
+++ b/testdata/golden/hello.expected.json
@@ -0,0 +1,31 @@
+{
+  "name": "hello",
+  "pages": [
+    {
+      "number": 1,
+      "width": 612,
+      "height": 792,
+      "extract_text": "Hello, world!",
+      "extract_words": [
+        {
+          "text": "Hello,",
+          "x0": 72.0,
+          "x1": 102.672,
+          "y0": 717.516,
+          "y1": 729.516,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "world!",
+          "x0": 106.00800000000001,
+          "x1": 138.01200000000003,
+          "y0": 717.516,
+          "y1": 729.516,
+          "upright": true,
+          "direction": "ltr"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/testdata/golden/hello.pdf b/testdata/golden/hello.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..4ed735fe92923fb45e83978777c3094349dfa061
GIT binary patch
literal 643
zcmZWnT~ER=6n*cnxIQe=MD03Q2?-$)WHC`Afh9f=9?Dj*WUi(yivCF-{0I6+yd4t+
z`m~*U&OP@l7+xnA?6OIM@1M^v!XN}aTar!(X!`ylf%b(Hg<b$1iiNZgAz=jFE|F#K
ziT;0Iu;I~Lm0OrJJbj8;^s02g{t12LSm<qxMC!(e-pgExo?gQwL_TK`wAvBo8%e8I
zCc}b!rHMvG^}(R&9nl9f8J&%#g9*hRu^qW~(<;zMI(k@$Ywlk&Ub|mYm6?uLGgl46
zkAwm32`e{|8(F+cr!oO_P-Z$;B~I#5l~<)zTVOXWZ}?|$C(DI<f@bW8Vx5uVne<XZ
z69(93Iwvh`HVPrl$tl_t`W)W0De{vuNS9=g`l4%Dy^FPxbAnSM;r0QK<3<c~*kTN8
z@DIuz#<B4Oj74|~2Ux`0M_7aN9p;Qsg*4UmjMaw(j+@fjFTTDS>x6MtolPE+AQ(I(
F<QKXkt*rn6

literal 0
HcmV?d00001

diff --git a/testdata/golden/rules.expected.json b/testdata/golden/rules.expected.json
new file mode 100644
index 0000000..05925f8
--- /dev/null
+++ b/testdata/golden/rules.expected.json
@@ -0,0 +1,12 @@
+{
+  "name": "rules",
+  "pages": [
+    {
+      "number": 1,
+      "width": 612,
+      "height": 792,
+      "extract_text": "",
+      "extract_words": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/testdata/golden/rules.pdf b/testdata/golden/rules.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f53a84d7b1836086072a50e64061d60e3a320b81
GIT binary patch
literal 790
zcmah{O;6)65WV|X%q0>?1=}exR6<C=QmwS0MHGoc#lcO6;9akc92W|IlEeOk-9N(E
z4J{uBA~~_2-<vlxnd1E0cuQ{Yu;SnU=f6yX06+W1_V<BbKHeelV;!~iE%0;QVh1G=
zF5uvRp=%N`o!<+VJpRKp9ZZLbeoayQ+(nRYgz4t9Fxdno>e7dwVPo`>U%*rlU&#`7
zM<vC(z|PM@O$nt;$DR~<BVi{wk`F}+zMf!&DW@8#99+cfd~r;4^5~%Ixv#|}?faZ^
zWImbCVs1uii3EP6J6zs;M|+Ra)EfASt9@e}b?VBvL)V!Vu<lked^I>l*T!3rquiA)
z2GoD3{|~HyL!hJojSU2>qjs&u0Z1Vhv_MDLpzi>b5S!a9L9|QPc?D#X9OBjo^F)<u
z3)VP<8Cw4^fJ(q!1nXV9)K+@}$d}->v%UBpL#pZvDOr+~-WAD{l!l@&q#ut(z#B#h
zGO>C<Rn@S98(3kI+L9h{YfRC<_<+fhrt<?<s?jHGsMH#ZK^u!95A?)5At)Li@6#;y
TgWN?OV$M^ET~-t)Kga9^iJ{cR

literal 0
HcmV?d00001

diff --git a/testdata/golden/simple1.expected.json b/testdata/golden/simple1.expected.json
new file mode 100644
index 0000000..91dc973
--- /dev/null
+++ b/testdata/golden/simple1.expected.json
@@ -0,0 +1,229 @@
+{
+  "name": "simple1",
+  "pages": [
+    {
+      "number": 1,
+      "width": 612,
+      "height": 792,
+      "extract_text": "Hello World\nHello World\nH e l l o W o r l d\nH e l l o W o r l d",
+      "extract_words": [
+        {
+          "text": "Hello",
+          "x0": 100.0,
+          "x1": 154.672,
+          "y0": 695.032,
+          "y1": 719.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "World",
+          "x0": 261.32800000000003,
+          "x1": 323.992,
+          "y0": 695.032,
+          "y1": 719.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "Hello",
+          "x0": 100.0,
+          "x1": 154.672,
+          "y0": 595.032,
+          "y1": 619.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "World",
+          "x0": 261.344,
+          "x1": 324.008,
+          "y0": 595.032,
+          "y1": 619.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "H",
+          "x0": 100.0,
+          "x1": 117.328,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "e",
+          "x0": 127.328,
+          "x1": 140.672,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 150.672,
+          "x1": 156.0,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 166.0,
+          "x1": 171.328,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "o",
+          "x0": 181.328,
+          "x1": 194.672,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "W",
+          "x0": 321.344,
+          "x1": 344.0,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "o",
+          "x0": 354.0,
+          "x1": 367.344,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "r",
+          "x0": 377.344,
+          "x1": 385.336,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 395.336,
+          "x1": 400.664,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "d",
+          "x0": 410.664,
+          "x1": 424.008,
+          "y0": 495.032,
+          "y1": 519.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "H",
+          "x0": 100.0,
+          "x1": 117.328,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "e",
+          "x0": 127.312,
+          "x1": 140.656,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 150.64,
+          "x1": 155.968,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 165.952,
+          "x1": 171.28,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "o",
+          "x0": 181.264,
+          "x1": 194.608,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "W",
+          "x0": 321.23199999999997,
+          "x1": 343.888,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "o",
+          "x0": 353.872,
+          "x1": 367.216,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "r",
+          "x0": 377.2,
+          "x1": 385.192,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "l",
+          "x0": 395.176,
+          "x1": 400.50399999999996,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        },
+        {
+          "text": "d",
+          "x0": 410.48799999999994,
+          "x1": 423.83199999999994,
+          "y0": 395.032,
+          "y1": 419.032,
+          "upright": true,
+          "direction": "ltr"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/testdata/golden/simple1.pdf b/testdata/golden/simple1.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..77b8623bb637cbeaa79296e58c7fa2cc8eefa627
GIT binary patch
literal 849
zcmaJ=OK*Ze5WerPn2U)XEG(sH6XU@LY5LM2Ne{-u3R{Xnwt=O!{q>#UrPZb(!0gQT
z&3ik`ncL@AHsfG|+V0qB#K7>s-X$1Qp+%x1JTD5J#Hq}|LX?M!rHJr!U7p5cCex6(
zWIglIFp)D=MXJGMM?CS*@TFq}Kg3}UYp^K-Y+5!yVUz6-c9~r!F;tZv7*8^c#Y7#@
zpNWsdEg0NdwbxW5G>01sgj7%Fs>lK?jv9f5W-3@o4NheT!&T^cG(v&HfRups(0eS4
zd{v2o@hH<szroNUNUk#}_H5!aVp}%&uZ)`}4Dj_shM!NQ`T-`|y^~3zpa*@MI}UVT
zRF;H2*noeJxIVFvQT}R9&8UI??h?jJ-~Zr#Cs4(Pu55(lLQsvSPPEOo(YJuC(X>vQ
zYUVzzPu-e&k*M3$!8HzG=HfoegT{H)R%-j%;Atv2y;ZT*<?7M`b5uNwJub3K;zZ=K
z9G-!#{g65i1ZwALpu#wfa7#qssl7-W&B;KS&P1HZ4656)ioYcck?bj@fi8|m;7(`0
GaM=%Yv)e=f

literal 0
HcmV?d00001

diff --git a/text.go b/text.go
new file mode 100644
index 0000000..fe23fa3
--- /dev/null
+++ b/text.go
@@ -0,0 +1,956 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import (
+	"math"
+	"sort"
+	"strings"
+	"unicode"
+)
+
+// This file is the Go port of pdfplumber/utils/text.py — the word and
+// text-extraction layer that turns a slice of positioned Chars into
+// reading-order Words and a string of text. Three exported entry
+// points hang off Page (defined below):
+//
+//   page.Words(opts)        →  []Word
+//   page.ExtractText(opts)  →  string
+//   page.ExtractTextSimple(...)  →  convenience wrapper (no layout)
+//
+// The algorithm is faithful to pdfplumber's WordExtractor: cluster
+// chars into lines by Y, then walk each line left-to-right merging
+// adjacent chars whose horizontal gap is within XTolerance. The
+// coordinate-system inversion (pdfplumber's image space has Y growing
+// down; we use PDF user space with Y growing up) is handled here by
+// clustering on Y1 (visual top of the glyph) and using Y1 / X0 as
+// sort keys.
+//
+// Defaults match pdfplumber:
+//   XTolerance = 3, YTolerance = 3
+//   Direction  = "ltr", HorizontalLTR = true, VerticalTTB = true
+//   Expand     = true (ligature expansion on by default)
+
+// Word is one extracted text run. It bundles the assembled string and
+// the bbox of the constituent chars, plus enough metadata for callers
+// who want to filter/restyle on font properties (font name + size of
+// the first char) or know which direction the run reads.
+//
+// Field names map onto pdfplumber's word dict the way the rest of the
+// package maps onto its char dict: X0/Y0/X1/Y1 instead of "x0"/"top"/
+// "x1"/"bottom". Y0 is the descender (lower edge of the lowest glyph
+// in the run); Y1 is the ascender (upper edge of the tallest glyph).
+type Word struct {
+	// Text is the concatenated Unicode payload of the run. Ligature
+	// glyphs are expanded into their constituent characters when
+	// WordOpts.Expand is true (the default), so "ﬁle" appears as
+	// "file" in the output.
+	Text string
+
+	// Bounding box of the run in PDF user space (origin at bottom-
+	// left, Y growing up). The bbox is the union of every char's bbox
+	// in this run.
+	X0, Y0, X1, Y1 float64
+
+	// Upright is true if every char in the run was drawn in normal
+	// reading orientation. We don't merge upright and rotated chars
+	// into the same Word — they end up in different runs.
+	Upright bool
+
+	// Direction is one of "ltr", "rtl", "ttb", "btt". Most words on
+	// most pages are "ltr"; rotated stamps may be "ttb"; Arabic/Hebrew
+	// content is "rtl". The value is the direction the chars were
+	// READ, not the direction they were drawn.
+	Direction string
+
+	// FontName / FontSize are copied from the first char in the run.
+	// pdfplumber does the same — if a word straddles a font change,
+	// only the leading font is reported, but in practice such words
+	// are rare because changing font emits a new BT/ET pair which
+	// breaks the run boundary at the content-stream level.
+	FontName string
+	FontSize float64
+
+	// Chars is the slice of Char objects this word was assembled from.
+	// Populated only when WordOpts.KeepChars is true (it costs O(n)
+	// memory per word so we default to off). Useful for callers that
+	// want to map word substrings back to glyph positions (highlight,
+	// search) or to filter further by per-char attributes.
+	Chars []Char
+}
+
+// Width returns x1 - x0.
+func (w Word) Width() float64 { return w.X1 - w.X0 }
+
+// Height returns y1 - y0.
+func (w Word) Height() float64 { return w.Y1 - w.Y0 }
+
+// WordOpts configures Page.Words. The zero value is NOT useful — call
+// DefaultWordOpts() to get a populated struct with pdfplumber-compatible
+// defaults, then override the fields you care about.
+//
+// Naming matches pdfplumber's WordExtractor kwargs where possible
+// (XTolerance → x_tolerance, KeepBlankChars → keep_blank_chars, etc.)
+// to make porting examples between the two libraries straightforward.
+type WordOpts struct {
+	// XTolerance is the maximum horizontal gap (in PDF points) between
+	// adjacent chars that still get merged into the same word.
+	// Default: 3.
+	XTolerance float64
+
+	// YTolerance is the maximum vertical jitter between chars that
+	// still get clustered onto the same line. Default: 3.
+	YTolerance float64
+
+	// KeepBlankChars: when false (the default), space chars in the
+	// content stream are dropped before word grouping — the word
+	// boundary is inferred from the gap, not from the explicit space.
+	// Set to true to preserve them (e.g. for diff-style line
+	// reconstruction).
+	KeepBlankChars bool
+
+	// UseTextFlow: when true, chars are processed in content-stream
+	// order rather than re-sorted by position. This is faster and
+	// often matches reading order in well-formed PDFs, but breaks for
+	// PDFs that draw glyphs in random order (e.g. some scanner OCR
+	// output).
+	UseTextFlow bool
+
+	// HorizontalLTR: when true (the default), upright text is read
+	// left-to-right; when false, right-to-left. Setting this to false
+	// is shorthand for Direction="rtl" but only for upright text.
+	HorizontalLTR bool
+
+	// VerticalTTB: when true (the default), rotated text is read top-
+	// to-bottom; when false, bottom-to-top.
+	VerticalTTB bool
+
+	// ExtraAttrs is a list of Char field names that must match
+	// EXACTLY for two chars to be merged into the same word. The
+	// supported names are: "fontname", "size". Useful when a single
+	// physical line has two runs that should be kept separate (e.g. a
+	// bold caption followed by regular body text).
+	ExtraAttrs []string
+
+	// SplitAtPunctuation: when true, every ASCII punctuation char
+	// (string.punctuation in Python) terminates the current word and
+	// becomes its own one-char word. Default: false.
+	SplitAtPunctuation bool
+
+	// Expand: when true (the default), ligature glyphs (ﬁ, ﬂ, …) are
+	// expanded into their constituent ASCII chars during text
+	// assembly. The Char's text payload is preserved unchanged; only
+	// the Word.Text string is expanded.
+	Expand bool
+
+	// KeepChars: when true, Word.Chars is populated with the source
+	// chars. Off by default to save memory.
+	KeepChars bool
+}
+
+// DefaultWordOpts returns a WordOpts populated with pdfplumber-matching
+// defaults. Use this and override the fields you care about:
+//
+//	opts := pdftable.DefaultWordOpts()
+//	opts.XTolerance = 1.5
+//	words, _ := page.Words(opts)
+func DefaultWordOpts() WordOpts {
+	return WordOpts{
+		XTolerance:    3,
+		YTolerance:    3,
+		HorizontalLTR: true,
+		VerticalTTB:   true,
+		Expand:        true,
+	}
+}
+
+// TextOpts configures Page.ExtractText. Like WordOpts the zero value
+// is not useful; call DefaultTextOpts() for sensible defaults.
+type TextOpts struct {
+	XTolerance float64
+	YTolerance float64
+
+	// Layout: when true, the output preserves the page's spatial
+	// layout — words at the same x-position appear in the same column
+	// across lines, and lines that are far apart are separated by
+	// extra newlines. When false (the default), output is dense:
+	// words on the same line are joined with single spaces, lines
+	// with "\n".
+	Layout bool
+
+	// LayoutWidthChars: when Layout=true, the total width of each
+	// emitted line in characters. If 0, defaults to round(page.Width /
+	// XDensity).
+	LayoutWidthChars int
+
+	// LayoutHeightChars: when Layout=true, the total number of
+	// emitted lines (extra blank lines at the bottom pad to this
+	// height). If 0, defaults to round(page.Height / YDensity).
+	LayoutHeightChars int
+
+	// XDensity / YDensity: PDF points per character / per line when
+	// computing layout grid dimensions. Default values match
+	// pdfplumber (XDensity=7.25, YDensity=13) — roughly the metrics
+	// of 10pt Helvetica.
+	XDensity float64
+	YDensity float64
+
+	// UseTextFlow / HorizontalLTR / VerticalTTB / ExtraAttrs are
+	// passed through to the underlying WordExtractor.
+	UseTextFlow   bool
+	HorizontalLTR bool
+	VerticalTTB   bool
+	ExtraAttrs    []string
+
+	// Expand passes through to WordOpts.Expand.
+	Expand bool
+}
+
+// DefaultTextOpts returns pdfplumber-matching defaults.
+func DefaultTextOpts() TextOpts {
+	return TextOpts{
+		XTolerance:    3,
+		YTolerance:    3,
+		XDensity:      7.25,
+		YDensity:      13,
+		HorizontalLTR: true,
+		VerticalTTB:   true,
+		Expand:        true,
+	}
+}
+
+// ligatures is the ligature-expansion table from pdfplumber.utils.text.
+// Keys are the single Unicode code points produced by glyph names like
+// "fi" / "ffi"; values are the expanded ASCII strings. The lookup is
+// done on the char's text string, so ligatures encoded via ToUnicode
+// CMaps that already expand them (e.g. some Adobe Reader-produced PDFs)
+// pass through unchanged.
+var ligatures = map[string]string{
+	"ﬀ": "ff",
+	"ﬁ": "fi",
+	"ﬂ": "fl",
+	"ﬃ": "ffi",
+	"ﬄ": "ffl",
+	"ﬅ": "st",
+	"ﬆ": "st",
+}
+
+// expandLigatures returns the expansion of s if it's a ligature glyph,
+// else s unchanged.
+func expandLigatures(s string) string {
+	if exp, ok := ligatures[s]; ok {
+		return exp
+	}
+	return s
+}
+
+// asciiPunctuation is the set of ASCII characters string.punctuation
+// (Python) covers: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~. SplitAtPunctuation
+// terminates a word at every one of these.
+const asciiPunctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+// isPunctRune reports whether r is in asciiPunctuation.
+func isPunctRune(r rune) bool {
+	if r > 127 {
+		return false
+	}
+	return strings.ContainsRune(asciiPunctuation, r)
+}
+
+// extractWordsFromChars is the core word-grouping algorithm. It is
+// pulled out of Page.Words so it can be exercised by unit tests on
+// hand-crafted Char slices without spinning up a Page.
+//
+// Steps (port of WordExtractor.iter_extract_tuples):
+//  1. If !KeepBlankChars, drop chars whose text is whitespace.
+//  2. Filter out chars with empty text (PDF glyphs that failed to
+//     resolve via encoding / ToUnicode). Their bbox can still inform
+//     layout but they shouldn't participate in text assembly.
+//  3. Group chars by (upright, *extra_attrs) — exact-match grouping.
+//  4. For each group, either keep content-stream order (UseTextFlow)
+//     or cluster into LINES by Y position then sort each line by X.
+//  5. Within each line, walk left-to-right (or right-to-left for rtl)
+//     and split into words whenever the gap exceeds XTolerance or the
+//     char begins a new line within the cluster's tolerance band.
+//  6. Apply ligature expansion (if Expand) when concatenating text.
+func extractWordsFromChars(chars []Char, opts WordOpts) []Word {
+	if len(chars) == 0 {
+		return nil
+	}
+
+	// Step 1/2: filter blanks (unless KeepBlankChars) and empties.
+	filtered := chars
+	if !opts.KeepBlankChars {
+		out := make([]Char, 0, len(chars))
+		for _, c := range chars {
+			if c.Text == "" {
+				continue
+			}
+			if isAllSpace(c.Text) {
+				continue
+			}
+			out = append(out, c)
+		}
+		filtered = out
+	} else {
+		out := make([]Char, 0, len(chars))
+		for _, c := range chars {
+			if c.Text == "" {
+				continue
+			}
+			out = append(out, c)
+		}
+		filtered = out
+	}
+	if len(filtered) == 0 {
+		return nil
+	}
+
+	// Step 3: group by (upright, *extra_attrs). The group key is a
+	// string so it can be a map key. We keep the SAME ORDER as input
+	// — within a contiguous run of equal-key chars, all chars go in
+	// one group. A second equal-key run later in the slice starts a
+	// new group (matching itertools.groupby semantics).
+	keyOf := func(c Char) string {
+		buf := make([]byte, 0, 8+len(c.FontName))
+		if c.Upright {
+			buf = append(buf, 'U')
+		} else {
+			buf = append(buf, 'u')
+		}
+		for _, attr := range opts.ExtraAttrs {
+			buf = append(buf, '\x00')
+			switch attr {
+			case "fontname":
+				buf = append(buf, c.FontName...)
+			case "size":
+				bits := math.Float64bits(c.FontSize)
+				for i := 7; i >= 0; i-- {
+					buf = append(buf, byte(bits>>(i*8)))
+				}
+			}
+		}
+		return string(buf)
+	}
+	groups := groupObjectsByAttr(filtered, keyOf)
+
+	var words []Word
+	for _, group := range groups {
+		upright := group[0].Upright
+
+		// Step 4: cluster into lines (or honour use_text_flow).
+		var lines [][]Char
+		var charDir string
+		if opts.UseTextFlow {
+			charDir = directionFor(upright, opts.HorizontalLTR, opts.VerticalTTB)
+			lines = [][]Char{group}
+		} else {
+			lineDir := "ttb"
+			if !upright {
+				// For rotated text, pdfplumber flips line / char dir:
+				// line_dir_rotated defaults to char_dir, char_dir_rotated
+				// defaults to line_dir. So a rotated cluster's line_dir
+				// becomes "ltr" (left-to-right) and char_dir becomes
+				// "ttb" (top-to-bottom). The clustering then groups by
+				// x rather than y.
+				lineDir = "ltr"
+			}
+			charDir = directionFor(upright, opts.HorizontalLTR, opts.VerticalTTB)
+
+			var keyForCluster func(c Char) float64
+			var tol float64
+			switch lineDir {
+			case "ttb":
+				// Cluster by Y1 (visual top in PDF user space).
+				keyForCluster = func(c Char) float64 { return -c.Y1 }
+				tol = opts.YTolerance
+			case "ltr":
+				keyForCluster = func(c Char) float64 { return c.X0 }
+				tol = opts.XTolerance
+			}
+			lines = clusterObjects(group, keyForCluster, tol, false)
+
+			// Sort within each line by char_dir.
+			for i := range lines {
+				sortCharsByDir(lines[i], charDir)
+			}
+		}
+
+		// Step 5: walk each line and split into words.
+		for _, line := range lines {
+			words = append(words, mergeLineIntoWords(line, charDir, opts)...)
+		}
+	}
+
+	return words
+}
+
+// directionFor picks the char direction for a glyph based on its
+// upright flag and the HorizontalLTR / VerticalTTB toggles.
+//
+//	upright=true,  ltr=true   →  ltr
+//	upright=true,  ltr=false  →  rtl
+//	upright=false, ttb=true   →  ttb
+//	upright=false, ttb=false  →  btt
+func directionFor(upright, horizontalLTR, verticalTTB bool) string {
+	if upright {
+		if horizontalLTR {
+			return "ltr"
+		}
+		return "rtl"
+	}
+	if verticalTTB {
+		return "ttb"
+	}
+	return "btt"
+}
+
+// sortCharsByDir sorts chars in-place by the requested reading direction.
+// For "ltr" we sort ascending by X0; for "rtl" descending by X1; for "ttb"
+// ascending by Y1 (visual top first in PDF coords); for "btt" ascending by Y0.
+//
+// Note that PDF user space has Y growing UP, but pdfplumber's image space
+// has Y growing DOWN. pdfplumber's "ttb" sort key is `(top, bottom)`
+// where "top" is the smaller y in image space (visually higher). For us,
+// visually higher means LARGER Y1, so "ttb" sorts by -Y1 ascending = Y1
+// descending. We flip the sign in the comparison so the call sites read
+// naturally.
+func sortCharsByDir(chars []Char, dir string) {
+	switch dir {
+	case "ltr":
+		sort.SliceStable(chars, func(i, j int) bool { return chars[i].X0 < chars[j].X0 })
+	case "rtl":
+		sort.SliceStable(chars, func(i, j int) bool { return chars[i].X1 > chars[j].X1 })
+	case "ttb":
+		// Visually top-most first = largest Y1 first in PDF space.
+		sort.SliceStable(chars, func(i, j int) bool { return chars[i].Y1 > chars[j].Y1 })
+	case "btt":
+		// Visually bottom-most first = smallest Y0 first.
+		sort.SliceStable(chars, func(i, j int) bool { return chars[i].Y0 < chars[j].Y0 })
+	}
+}
+
+// mergeLineIntoWords walks a sorted line of chars and emits words. A
+// new word starts whenever the gap to the previous char exceeds
+// XTolerance (for ltr/rtl) / YTolerance (for ttb/btt), the perpendicular
+// distance exceeds the cross-tolerance, or a blank/punctuation char
+// triggers a split.
+func mergeLineIntoWords(line []Char, dir string, opts WordOpts) []Word {
+	if len(line) == 0 {
+		return nil
+	}
+
+	var words []Word
+	var current []Char
+
+	flush := func() {
+		if len(current) > 0 {
+			words = append(words, buildWord(current, dir, opts))
+			current = nil
+		}
+	}
+
+	for _, c := range line {
+		text := c.Text
+		// Whitespace breaks the word (we filtered earlier unless
+		// KeepBlankChars=true; if we're here with a space, the caller
+		// asked for explicit space chars and we honour the break).
+		if opts.KeepBlankChars && isAllSpace(text) {
+			flush()
+			continue
+		}
+
+		// Punctuation: split before AND after, so the punctuation char
+		// becomes its own one-char word.
+		if opts.SplitAtPunctuation && len(text) == 1 && isPunctRune(rune(text[0])) {
+			flush()
+			current = []Char{c}
+			flush()
+			continue
+		}
+
+		if len(current) == 0 {
+			current = []Char{c}
+			continue
+		}
+
+		if charBeginsNewWord(current[len(current)-1], c, dir, opts) {
+			flush()
+			current = []Char{c}
+		} else {
+			current = append(current, c)
+		}
+	}
+	flush()
+
+	return words
+}
+
+// charBeginsNewWord is the Go port of WordExtractor.char_begins_new_word.
+// Returns true if curr is far enough from prev to start a new word.
+//
+// pdfplumber's check has two parts:
+//   - INTRALINE: gap between previous char's TRAILING edge and current
+//     char's LEADING edge exceeds XTolerance. (Or the current char
+//     overlaps backwards — cx < ax.)
+//   - INTERLINE: chars within the same cluster but on visually
+//     different lines (|cy - ay| > YTolerance).
+//
+// We map pdfplumber's image-space y to our user-space Y1 (visual top).
+// In pdfplumber "top" decreases as you go down the page; in PDF user
+// space Y1 decreases as you go down the page too (Y grows up, so the
+// "top" Y1 of a lower char is smaller). So the |cy - ay| > y check
+// uses Y1 directly.
+func charBeginsNewWord(prev, curr Char, dir string, opts WordOpts) bool {
+	var ax, bx, cx float64 // intraline (along reading direction)
+	var ay, cy float64     // interline (perpendicular)
+	var xTol, yTol float64
+
+	switch dir {
+	case "ltr":
+		ax = prev.X0
+		bx = prev.X1
+		cx = curr.X0
+		ay = prev.Y1 // visual top
+		cy = curr.Y1
+		xTol = opts.XTolerance
+		yTol = opts.YTolerance
+	case "rtl":
+		ax = -prev.X1
+		bx = -prev.X0
+		cx = -curr.X1
+		ay = prev.Y1
+		cy = curr.Y1
+		xTol = opts.XTolerance
+		yTol = opts.YTolerance
+	case "ttb":
+		// Reading top-to-bottom: along-direction is Y (descending),
+		// perpendicular is X. Intraline gap measured from prev's
+		// BOTTOM (smaller Y1) to curr's TOP (larger Y1 of curr is
+		// AHEAD of prev in image space → we invert by negating).
+		ax = -prev.Y1
+		bx = -prev.Y0
+		cx = -curr.Y1
+		ay = prev.X0
+		cy = curr.X0
+		xTol = opts.YTolerance
+		yTol = opts.XTolerance
+	case "btt":
+		ax = prev.Y0
+		bx = prev.Y1
+		cx = curr.Y0
+		ay = prev.X0
+		cy = curr.X0
+		xTol = opts.YTolerance
+		yTol = opts.XTolerance
+	default:
+		// Unknown direction — default to ltr behaviour.
+		ax = prev.X0
+		bx = prev.X1
+		cx = curr.X0
+		ay = prev.Y1
+		cy = curr.Y1
+		xTol = opts.XTolerance
+		yTol = opts.YTolerance
+	}
+
+	intraline := cx < ax || cx > bx+xTol
+	interline := math.Abs(cy-ay) > yTol
+	return intraline || interline
+}
+
+// buildWord assembles a Word from the chars that should join it. Text
+// is concatenated with ligature expansion if Expand=true. Bbox is the
+// union of all char bboxes. FontName/FontSize/Upright are copied from
+// the first char.
+func buildWord(chars []Char, dir string, opts WordOpts) Word {
+	var sb strings.Builder
+	sb.Grow(len(chars))
+	for _, c := range chars {
+		if opts.Expand {
+			sb.WriteString(expandLigatures(c.Text))
+		} else {
+			sb.WriteString(c.Text)
+		}
+	}
+	bbox := BBoxOfChars(chars)
+	w := Word{
+		Text:      sb.String(),
+		X0:        bbox.X0,
+		Y0:        bbox.Y0,
+		X1:        bbox.X1,
+		Y1:        bbox.Y1,
+		Upright:   chars[0].Upright,
+		Direction: dir,
+		FontName:  chars[0].FontName,
+		FontSize:  chars[0].FontSize,
+	}
+	if opts.KeepChars {
+		copyChars := make([]Char, len(chars))
+		copy(copyChars, chars)
+		w.Chars = copyChars
+	}
+	return w
+}
+
+// isAllSpace returns true if every rune in s is whitespace (matching
+// Python's str.isspace()). An empty string returns false — same as
+// Python's "".isspace() == False.
+func isAllSpace(s string) bool {
+	if s == "" {
+		return false
+	}
+	for _, r := range s {
+		if !unicode.IsSpace(r) {
+			return false
+		}
+	}
+	return true
+}
+
+// extractTextFromChars implements the dense (non-layout) text-extraction
+// path: words → cluster into lines → join words with spaces → join lines
+// with newlines.
+func extractTextFromChars(chars []Char, opts TextOpts) string {
+	if len(chars) == 0 {
+		return ""
+	}
+	words := extractWordsFromChars(chars, textOptsToWordOpts(opts))
+	if len(words) == 0 {
+		return ""
+	}
+
+	// Cluster words into lines by visual top (Y1 in PDF coords).
+	lines := clusterObjects(words, func(w Word) float64 { return -w.Y1 }, opts.YTolerance, false)
+
+	// Within each line, sort by X0 ascending (ltr) or X1 descending (rtl).
+	dir := "ltr"
+	if !opts.HorizontalLTR {
+		dir = "rtl"
+	}
+	for i := range lines {
+		sortWordsByDir(lines[i], dir)
+	}
+
+	var sb strings.Builder
+	for i, line := range lines {
+		if i > 0 {
+			sb.WriteByte('\n')
+		}
+		for j, w := range line {
+			if j > 0 {
+				sb.WriteByte(' ')
+			}
+			sb.WriteString(w.Text)
+		}
+	}
+	return sb.String()
+}
+
+// sortWordsByDir is the Word equivalent of sortCharsByDir. Only
+// horizontal directions are supported in the dense path (rotated text
+// extraction falls back to per-word direction inside the chars).
+func sortWordsByDir(words []Word, dir string) {
+	switch dir {
+	case "ltr":
+		sort.SliceStable(words, func(i, j int) bool { return words[i].X0 < words[j].X0 })
+	case "rtl":
+		sort.SliceStable(words, func(i, j int) bool { return words[i].X1 > words[j].X1 })
+	}
+}
+
+// extractTextWithLayout implements the layout-preserving path. It
+// builds a fixed-width grid of characters where each glyph's column
+// is proportional to its X0 (divided by XDensity) and each line's row
+// is proportional to its Y1 (divided by YDensity).
+//
+// The output approximates what `pdftotext -layout` or pdfplumber's
+// `extract_text(layout=True)` would produce — useful for callers that
+// want to feed structured text to a downstream layout-aware consumer
+// (form scrapers, LLM prompts that benefit from preserved indentation).
+//
+// We DON'T attempt the per-column-cell expansion that pdfplumber does
+// for non-ttb / non-ltr text. The simple horizontal-ltr path is the
+// common case and covers >95% of real PDFs.
+func extractTextWithLayout(chars []Char, pageWidth, pageHeight float64, opts TextOpts) string {
+	if len(chars) == 0 {
+		return ""
+	}
+
+	words := extractWordsFromChars(chars, textOptsToWordOpts(opts))
+	if len(words) == 0 {
+		return ""
+	}
+
+	// Determine grid dimensions.
+	widthChars := opts.LayoutWidthChars
+	heightChars := opts.LayoutHeightChars
+	if widthChars == 0 {
+		widthChars = int(math.Round(pageWidth / opts.XDensity))
+	}
+	if heightChars == 0 {
+		heightChars = int(math.Round(pageHeight / opts.YDensity))
+	}
+	if widthChars < 1 {
+		widthChars = 1
+	}
+	if heightChars < 1 {
+		heightChars = 1
+	}
+
+	// Cluster words by visual top (matching dense path) so we know
+	// which words share a line.
+	lines := clusterObjects(words, func(w Word) float64 { return -w.Y1 }, opts.YTolerance, false)
+
+	// Determine the page-space y of each line's top: largest Y1 in
+	// the cluster wins (so layout indentation is calibrated against
+	// the visually-top edge of the line).
+	type lineInfo struct {
+		topY  float64
+		words []Word
+	}
+	infos := make([]lineInfo, len(lines))
+	for i, line := range lines {
+		// Sort line ltr; cluster ordering means this is already mostly
+		// in order but be explicit.
+		dir := "ltr"
+		if !opts.HorizontalLTR {
+			dir = "rtl"
+		}
+		sortWordsByDir(line, dir)
+
+		topY := line[0].Y1
+		for _, w := range line[1:] {
+			if w.Y1 > topY {
+				topY = w.Y1
+			}
+		}
+		infos[i] = lineInfo{topY: topY, words: line}
+	}
+
+	// PDF user space has Y growing UP, so "top of page" = largest Y.
+	// The first line in reading order is the one with the largest
+	// topY; lines lower on the page have smaller topY. cluster
+	// ordering is ascending key (-Y1 ascending = Y1 descending = top-
+	// to-bottom reading order), so infos is already in reading order.
+
+	// Lay each line into a row of widthChars columns, calibrating
+	// each word's column = round(X0 / XDensity).
+	// Lay each line's row at row = round((pageTopY - line.topY) / YDensity).
+	pageTopY := pageHeight
+	rows := make([][]rune, heightChars)
+	for i := range rows {
+		rows[i] = make([]rune, widthChars)
+		for j := range rows[i] {
+			rows[i][j] = ' '
+		}
+	}
+
+	for _, info := range infos {
+		row := int(math.Round((pageTopY - info.topY) / opts.YDensity))
+		if row < 0 {
+			row = 0
+		}
+		if row >= heightChars {
+			// Out-of-range row: extend the rows slice rather than drop
+			// the text, since heightChars is heuristic.
+			for r := heightChars; r <= row; r++ {
+				blank := make([]rune, widthChars)
+				for j := range blank {
+					blank[j] = ' '
+				}
+				rows = append(rows, blank)
+			}
+			heightChars = row + 1
+		}
+
+		for _, w := range info.words {
+			col := int(math.Round(w.X0 / opts.XDensity))
+			if col < 0 {
+				col = 0
+			}
+			for _, r := range w.Text {
+				if col >= widthChars {
+					// Extend the row to fit overflow text. We do this
+					// across ALL rows to keep the grid rectangular.
+					oldWidth := widthChars
+					for ; widthChars <= col; widthChars++ {
+					}
+					if widthChars > oldWidth {
+						for ri := range rows {
+							ext := make([]rune, widthChars-oldWidth)
+							for j := range ext {
+								ext[j] = ' '
+							}
+							rows[ri] = append(rows[ri], ext...)
+						}
+					}
+				}
+				if col < widthChars {
+					rows[row][col] = r
+				}
+				col++
+			}
+			// Insert a separator space if there's room — but only if
+			// the next position isn't already non-blank.
+			if col < widthChars && rows[row][col] == ' ' {
+				rows[row][col] = ' '
+			}
+		}
+	}
+
+	// Trim trailing spaces on each row, then join.
+	var sb strings.Builder
+	for i, r := range rows {
+		if i > 0 {
+			sb.WriteByte('\n')
+		}
+		end := len(r)
+		for end > 0 && r[end-1] == ' ' {
+			end--
+		}
+		for j := 0; j < end; j++ {
+			sb.WriteRune(r[j])
+		}
+	}
+	return sb.String()
+}
+
+// textOptsToWordOpts converts a TextOpts into the WordOpts shape used
+// by the word extractor.
+func textOptsToWordOpts(t TextOpts) WordOpts {
+	return WordOpts{
+		XTolerance:    nonZero(t.XTolerance, 3),
+		YTolerance:    nonZero(t.YTolerance, 3),
+		UseTextFlow:   t.UseTextFlow,
+		HorizontalLTR: t.HorizontalLTR,
+		VerticalTTB:   t.VerticalTTB,
+		ExtraAttrs:    t.ExtraAttrs,
+		Expand:        t.Expand,
+	}
+}
+
+func nonZero(v, dflt float64) float64 {
+	if v == 0 {
+		return dflt
+	}
+	return v
+}
+
+// Page-level entry points. We define them as methods on the page
+// struct (the unexported implementation of the Page interface). The
+// new methods get added to the Page interface in page.go.
+
+// Words extracts words from the page using the supplied options.
+// Float fields left at their zero value get replaced with pdfplumber-
+// matching defaults (XTolerance=3, YTolerance=3). Pass
+// DefaultWordOpts() for the explicit default set.
+func (p *page) Words(opts WordOpts) ([]Word, error) {
+	opts = applyWordOptDefaults(opts)
+	chars, err := p.Chars()
+	if err != nil {
+		return nil, err
+	}
+	return extractWordsFromChars(chars, opts), nil
+}
+
+// ExtractText extracts the text of the page as a single string. See
+// TextOpts for the layout / non-layout split.
+func (p *page) ExtractText(opts TextOpts) (string, error) {
+	opts = applyTextOptDefaults(opts)
+	chars, err := p.Chars()
+	if err != nil {
+		return "", err
+	}
+	if opts.Layout {
+		return extractTextWithLayout(chars, p.Width(), p.Height(), opts), nil
+	}
+	return extractTextFromChars(chars, opts), nil
+}
+
+// applyWordOptDefaults fills in zero-valued float fields with
+// pdfplumber-matching defaults. The presence-or-absence semantics for
+// bool fields are caller-defined (a Go zero-value bool is false, which
+// matches pdfplumber's default for all bool kwargs).
+func applyWordOptDefaults(opts WordOpts) WordOpts {
+	if opts.XTolerance == 0 {
+		opts.XTolerance = 3
+	}
+	if opts.YTolerance == 0 {
+		opts.YTolerance = 3
+	}
+	return opts
+}
+
+// applyTextOptDefaults is the TextOpts analogue.
+func applyTextOptDefaults(opts TextOpts) TextOpts {
+	if opts.XTolerance == 0 {
+		opts.XTolerance = 3
+	}
+	if opts.YTolerance == 0 {
+		opts.YTolerance = 3
+	}
+	if opts.XDensity == 0 {
+		opts.XDensity = 7.25
+	}
+	if opts.YDensity == 0 {
+		opts.YDensity = 13
+	}
+	return opts
+}
+
+// ExtractTextSimple is the dead-simple text-extraction primitive that
+// just clusters chars by line and joins them with single spaces / new
+// lines. It ports pdfplumber's extract_text_simple — useful as a
+// baseline when ExtractText's word-grouping heuristics produce
+// undesired results on adversarial input.
+func (p *page) ExtractTextSimple(xTolerance, yTolerance float64) (string, error) {
+	if xTolerance == 0 {
+		xTolerance = 3
+	}
+	if yTolerance == 0 {
+		yTolerance = 3
+	}
+	chars, err := p.Chars()
+	if err != nil {
+		return "", err
+	}
+	if len(chars) == 0 {
+		return "", nil
+	}
+
+	// Drop empty-text chars (failed glyph resolution) — they have no
+	// printable representation.
+	filtered := chars[:0:0]
+	for _, c := range chars {
+		if c.Text != "" {
+			filtered = append(filtered, c)
+		}
+	}
+
+	clustered := clusterObjects(filtered, func(c Char) float64 { return -c.Y1 }, yTolerance, false)
+	var sb strings.Builder
+	for i, line := range clustered {
+		if i > 0 {
+			sb.WriteByte('\n')
+		}
+		// Sort by X0 ascending and merge with " " between non-adjacent
+		// runs (gap > xTolerance).
+		sort.SliceStable(line, func(a, b int) bool { return line[a].X0 < line[b].X0 })
+		lastX1 := math.Inf(-1)
+		for _, c := range line {
+			if !math.IsInf(lastX1, -1) && c.X0 > lastX1+xTolerance {
+				sb.WriteByte(' ')
+			}
+			lastX1 = c.X1
+			if c.Text == " " {
+				// pdfplumber's collate_line drops spaces (they're
+				// implied by the gap detector). We do the same.
+				continue
+			}
+			sb.WriteString(c.Text)
+		}
+	}
+	return sb.String(), nil
+}
diff --git a/text_test.go b/text_test.go
new file mode 100644
index 0000000..6afdad2
--- /dev/null
+++ b/text_test.go
@@ -0,0 +1,495 @@
+// Copyright (c) 2026 Halleluyah Oludele
+// Licensed under the MIT License.
+
+package pdftable
+
+import (
+	"math"
+	"strings"
+	"testing"
+)
+
+// TestExtractWordsBasic builds a hand-crafted slice of Chars for two
+// words on one line and checks that the word grouper:
+//  1. produces exactly two words,
+//  2. keeps them in left-to-right order,
+//  3. concatenates the right text,
+//  4. unions the bbox correctly.
+//
+// We bypass the PDF pipeline (Page.Chars) and feed the algorithm
+// directly so the test stays deterministic regardless of font metrics.
+func TestExtractWordsBasic(t *testing.T) {
+	chars := []Char{
+		{Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "i", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		// Big gap after "Hi" → new word.
+		{Text: "T", X0: 50, Y0: 100, X1: 58, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "h", X0: 58, Y0: 100, X1: 64, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "e", X0: 64, Y0: 100, X1: 70, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "r", X0: 70, Y0: 100, X1: 73, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "e", X0: 73, Y0: 100, X1: 79, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+	}
+
+	opts := DefaultWordOpts()
+	words := extractWordsFromChars(chars, opts)
+
+	if len(words) != 2 {
+		t.Fatalf("got %d words, want 2", len(words))
+	}
+	if words[0].Text != "Hi" {
+		t.Errorf("word 0 = %q, want %q", words[0].Text, "Hi")
+	}
+	if words[1].Text != "There" {
+		t.Errorf("word 1 = %q, want %q", words[1].Text, "There")
+	}
+	if !approxFloat(words[0].X0, 10, 0.01) || !approxFloat(words[0].X1, 21, 0.01) {
+		t.Errorf("word 0 bbox X = (%v, %v), want (10, 21)", words[0].X0, words[0].X1)
+	}
+	if words[0].Direction != "ltr" {
+		t.Errorf("word 0 direction = %q, want ltr", words[0].Direction)
+	}
+}
+
+func TestExtractWordsMultipleLines(t *testing.T) {
+	// Two lines, two words each. Y values are widely separated.
+	chars := []Char{
+		// Line 1: "Hello world" at Y~100
+		{Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "e", X0: 18, Y0: 100, X1: 24, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "l", X0: 24, Y0: 100, X1: 27, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "l", X0: 27, Y0: 100, X1: 30, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "o", X0: 30, Y0: 100, X1: 36, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "w", X0: 50, Y0: 100, X1: 60, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "o", X0: 60, Y0: 100, X1: 66, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "r", X0: 66, Y0: 100, X1: 69, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "l", X0: 69, Y0: 100, X1: 72, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "d", X0: 72, Y0: 100, X1: 78, Y1: 112, Upright: true, FontName: "F", FontSize: 12},
+		// Line 2: "Foo bar" at Y~80
+		{Text: "F", X0: 10, Y0: 80, X1: 18, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "o", X0: 18, Y0: 80, X1: 24, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "o", X0: 24, Y0: 80, X1: 30, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "b", X0: 40, Y0: 80, X1: 46, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "a", X0: 46, Y0: 80, X1: 52, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+		{Text: "r", X0: 52, Y0: 80, X1: 55, Y1: 92, Upright: true, FontName: "F", FontSize: 12},
+	}
+
+	words := extractWordsFromChars(chars, DefaultWordOpts())
+	got := make([]string, len(words))
+	for i, w := range words {
+		got[i] = w.Text
+	}
+	want := []string{"Hello", "world", "Foo", "bar"}
+	if len(got) != len(want) {
+		t.Fatalf("got %d words %v, want %d %v", len(got), got, len(want), want)
+	}
+	for i := range want {
+		if got[i] != want[i] {
+			t.Errorf("word %d = %q, want %q", i, got[i], want[i])
+		}
+	}
+}
+
+func TestExtractWordsBlankChars(t *testing.T) {
+	// Explicit space char between two words — by default it should be
+	// dropped (KeepBlankChars=false).
+	chars := []Char{
+		{Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true},
+		{Text: " ", X0: 18, Y0: 100, X1: 22, Y1: 112, Upright: true},
+		{Text: "B", X0: 22, Y0: 100, X1: 30, Y1: 112, Upright: true},
+	}
+	opts := DefaultWordOpts()
+	words := extractWordsFromChars(chars, opts)
+
+	// "A" and "B" are 4pt apart (18→22), which is > XTolerance(3), so
+	// they're separate words. Without the space char, just two words.
+	if len(words) != 2 {
+		t.Fatalf("got %d words, want 2: %+v", len(words), words)
+	}
+
+	// With KeepBlankChars=true the algorithm sees the space and splits.
+	opts.KeepBlankChars = true
+	words = extractWordsFromChars(chars, opts)
+	if len(words) != 2 {
+		t.Fatalf("got %d words with KeepBlankChars, want 2", len(words))
+	}
+}
+
+func TestExtractWordsLigatureExpansion(t *testing.T) {
+	// A ligature char (ﬁ U+FB01) followed by "le" should expand to
+	// "file" when Expand=true.
+	chars := []Char{
+		{Text: "ﬁ", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true},
+		{Text: "l", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true},
+		{Text: "e", X0: 21, Y0: 100, X1: 27, Y1: 112, Upright: true},
+	}
+	opts := DefaultWordOpts()
+	words := extractWordsFromChars(chars, opts)
+	if len(words) != 1 {
+		t.Fatalf("got %d words, want 1", len(words))
+	}
+	if words[0].Text != "file" {
+		t.Errorf("got %q, want %q", words[0].Text, "file")
+	}
+
+	opts.Expand = false
+	words = extractWordsFromChars(chars, opts)
+	if words[0].Text != "ﬁle" {
+		t.Errorf("got %q, want %q (no expansion)", words[0].Text, "ﬁle")
+	}
+}
+
+func TestExtractWordsSplitAtPunctuation(t *testing.T) {
+	// "ABC,DEF" with SplitAtPunctuation=true → three words: ABC , DEF.
+	chars := []Char{
+		{Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true},
+		{Text: "B", X0: 18, Y0: 100, X1: 26, Y1: 112, Upright: true},
+		{Text: "C", X0: 26, Y0: 100, X1: 34, Y1: 112, Upright: true},
+		{Text: ",", X0: 34, Y0: 100, X1: 38, Y1: 112, Upright: true},
+		{Text: "D", X0: 38, Y0: 100, X1: 46, Y1: 112, Upright: true},
+		{Text: "E", X0: 46, Y0: 100, X1: 54, Y1: 112, Upright: true},
+		{Text: "F", X0: 54, Y0: 100, X1: 62, Y1: 112, Upright: true},
+	}
+	opts := DefaultWordOpts()
+	opts.SplitAtPunctuation = true
+	words := extractWordsFromChars(chars, opts)
+	wantTexts := []string{"ABC", ",", "DEF"}
+	if len(words) != len(wantTexts) {
+		t.Fatalf("got %d words, want %d", len(words), len(wantTexts))
+	}
+	for i, w := range words {
+		if w.Text != wantTexts[i] {
+			t.Errorf("word %d = %q, want %q", i, w.Text, wantTexts[i])
+		}
+	}
+}
+
+func TestExtractWordsKeepChars(t *testing.T) {
+	chars := []Char{
+		{Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true},
+		{Text: "B", X0: 18, Y0: 100, X1: 26, Y1: 112, Upright: true},
+	}
+	opts := DefaultWordOpts()
+	opts.KeepChars = true
+	words := extractWordsFromChars(chars, opts)
+	if len(words) != 1 || len(words[0].Chars) != 2 {
+		t.Fatalf("got %d words, %d chars; want 1 word with 2 chars", len(words), len(words[0].Chars))
+	}
+
+	// And when KeepChars=false, Chars should be nil.
+	opts.KeepChars = false
+	words = extractWordsFromChars(chars, opts)
+	if words[0].Chars != nil {
+		t.Errorf("Chars should be nil when KeepChars=false, got %v", words[0].Chars)
+	}
+}
+
+func TestExtractTextDense(t *testing.T) {
+	chars := []Char{
+		{Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true},
+		{Text: "i", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true},
+		{Text: "T", X0: 50, Y0: 100, X1: 58, Y1: 112, Upright: true},
+		{Text: "h", X0: 58, Y0: 100, X1: 64, Y1: 112, Upright: true},
+		{Text: "e", X0: 64, Y0: 100, X1: 70, Y1: 112, Upright: true},
+		{Text: "r", X0: 70, Y0: 100, X1: 73, Y1: 112, Upright: true},
+		{Text: "e", X0: 73, Y0: 100, X1: 79, Y1: 112, Upright: true},
+		// Second line.
+		{Text: "Y", X0: 10, Y0: 80, X1: 18, Y1: 92, Upright: true},
+	}
+	opts := DefaultTextOpts()
+	got := extractTextFromChars(chars, opts)
+	want := "Hi There\nY"
+	if got != want {
+		t.Errorf("got %q, want %q", got, want)
+	}
+}
+
+func TestExtractTextEmpty(t *testing.T) {
+	opts := DefaultTextOpts()
+	if got := extractTextFromChars(nil, opts); got != "" {
+		t.Errorf("got %q, want empty", got)
+	}
+}
+
+func TestExtractTextLayoutPreservesIndentation(t *testing.T) {
+	// Two lines with different left-indents on a 612x792 page (US
+	// letter). After layout=true, the second line should appear
+	// indented relative to the first.
+	chars := []Char{
+		// Line 1 starts near the left margin, ~x=72.
+		{Text: "A", X0: 72, Y0: 700, X1: 80, Y1: 712, Upright: true},
+		{Text: "B", X0: 80, Y0: 700, X1: 88, Y1: 712, Upright: true},
+		// Line 2 deeply indented, ~x=200.
+		{Text: "C", X0: 200, Y0: 680, X1: 208, Y1: 692, Upright: true},
+		{Text: "D", X0: 208, Y0: 680, X1: 216, Y1: 692, Upright: true},
+	}
+	opts := DefaultTextOpts()
+	opts.Layout = true
+	opts.LayoutWidthChars = 80
+	opts.LayoutHeightChars = 60
+	out := extractTextWithLayout(chars, 612, 792, opts)
+	if !strings.Contains(out, "AB") {
+		t.Errorf("layout text missing AB run: %q", out)
+	}
+	if !strings.Contains(out, "CD") {
+		t.Errorf("layout text missing CD run: %q", out)
+	}
+	// Find the lines: AB should be at a column < CD's column.
+	lines := strings.Split(out, "\n")
+	var abCol, cdCol int = -1, -1
+	for _, l := range lines {
+		if i := strings.Index(l, "AB"); i >= 0 && abCol < 0 {
+			abCol = i
+		}
+		if i := strings.Index(l, "CD"); i >= 0 {
+			cdCol = i
+		}
+	}
+	if abCol < 0 || cdCol < 0 {
+		t.Fatalf("expected to find AB and CD on lines: %q", out)
+	}
+	if cdCol <= abCol {
+		t.Errorf("CD column (%d) should be > AB column (%d): %q", cdCol, abCol, out)
+	}
+}
+
+func TestPageExtractTextLayout(t *testing.T) {
+	doc, err := openHelloWorldDoc()
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	defer doc.Close()
+	p, _ := doc.Page(1)
+
+	opts := DefaultTextOpts()
+	opts.Layout = true
+	got, err := p.ExtractText(opts)
+	if err != nil {
+		t.Fatalf("ExtractText layout: %v", err)
+	}
+	if !strings.Contains(got, "Hello") || !strings.Contains(got, "world") {
+		t.Errorf("layout text missing expected substrings: %q", got)
+	}
+}
+
+// TestPageWordsHelloWorld walks the Hello-world fixture through the
+// public Page.Words API and asserts one word matching "Hello," is
+// followed by one matching "world!".
+func TestPageWordsHelloWorld(t *testing.T) {
+	doc, err := openHelloWorldDoc()
+	if err != nil {
+		t.Fatalf("open hello: %v", err)
+	}
+	defer doc.Close()
+	p, _ := doc.Page(1)
+
+	words, err := p.Words(DefaultWordOpts())
+	if err != nil {
+		t.Fatalf("Words: %v", err)
+	}
+	if len(words) != 2 {
+		t.Fatalf("got %d words, want 2: %+v", len(words), words)
+	}
+	if words[0].Text != "Hello," || words[1].Text != "world!" {
+		t.Errorf("got %q, %q; want %q, %q",
+			words[0].Text, words[1].Text, "Hello,", "world!")
+	}
+	// Words inherit font metadata from their first char.
+	if words[0].FontName != "Helvetica" {
+		t.Errorf("FontName = %q, want Helvetica", words[0].FontName)
+	}
+	if words[0].FontSize != 12 {
+		t.Errorf("FontSize = %v, want 12", words[0].FontSize)
+	}
+	if words[0].Direction != "ltr" {
+		t.Errorf("Direction = %q, want ltr", words[0].Direction)
+	}
+}
+
+func TestPageExtractTextHelloWorld(t *testing.T) {
+	doc, err := openHelloWorldDoc()
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	defer doc.Close()
+	p, _ := doc.Page(1)
+
+	got, err := p.ExtractText(DefaultTextOpts())
+	if err != nil {
+		t.Fatalf("ExtractText: %v", err)
+	}
+	if got != "Hello, world!" {
+		t.Errorf("got %q, want %q", got, "Hello, world!")
+	}
+}
+
+func TestPageExtractTextSimple(t *testing.T) {
+	doc, err := openHelloWorldDoc()
+	if err != nil {
+		t.Fatalf("open: %v", err)
+	}
+	defer doc.Close()
+	p, _ := doc.Page(1)
+
+	got, err := p.ExtractTextSimple(3, 3)
+	if err != nil {
+		t.Fatalf("ExtractTextSimple: %v", err)
+	}
+	if !strings.Contains(got, "Hello") || !strings.Contains(got, "world") {
+		t.Errorf("ExtractTextSimple result = %q, missing expected substrings", got)
+	}
+}
+
+func TestDirectionFor(t *testing.T) {
+	tests := []struct {
+		upright, ltr, ttb bool
+		want              string
+	}{
+		{true, true, true, "ltr"},
+		{true, false, true, "rtl"},
+		{false, true, true, "ttb"},
+		{false, true, false, "btt"},
+	}
+	for _, tt := range tests {
+		got := directionFor(tt.upright, tt.ltr, tt.ttb)
+		if got != tt.want {
+			t.Errorf("directionFor(%v,%v,%v) = %q, want %q",
+				tt.upright, tt.ltr, tt.ttb, got, tt.want)
+		}
+	}
+}
+
+func TestDefaultOpts(t *testing.T) {
+	w := DefaultWordOpts()
+	if w.XTolerance != 3 || w.YTolerance != 3 {
+		t.Errorf("DefaultWordOpts tolerances = (%v,%v), want (3,3)", w.XTolerance, w.YTolerance)
+	}
+	if !w.HorizontalLTR || !w.VerticalTTB || !w.Expand {
+		t.Errorf("DefaultWordOpts bool flags wrong: %+v", w)
+	}
+
+	x := DefaultTextOpts()
+	if x.XDensity != 7.25 || x.YDensity != 13 {
+		t.Errorf("DefaultTextOpts densities = (%v,%v), want (7.25, 13)", x.XDensity, x.YDensity)
+	}
+}
+
+func TestIsAllSpace(t *testing.T) {
+	cases := map[string]bool{
+		"":     false, // matches Python's "".isspace()
+		" ":    true,
+		"\t":   true,
+		"a":    false,
+		" a ":  false,
+		"\n\r": true,
+	}
+	for in, want := range cases {
+		got := isAllSpace(in)
+		if got != want {
+			t.Errorf("isAllSpace(%q) = %v, want %v", in, got, want)
+		}
+	}
+}
+
+func TestExpandLigatures(t *testing.T) {
+	cases := map[string]string{
+		"ﬀ": "ff",
+		"ﬁ": "fi",
+		"ﬂ": "fl",
+		"ﬃ": "ffi",
+		"ﬄ": "ffl",
+		"ﬅ": "st",
+		"ﬆ": "st",
+		"A":      "A", // pass-through
+		"":       "", // pass-through
+	}
+	for in, want := range cases {
+		if got := expandLigatures(in); got != want {
+			t.Errorf("expandLigatures(%q) = %q, want %q", in, got, want)
+		}
+	}
+}
+
+// openHelloWorldDoc opens the testdata-built hello PDF.  Pulled into a
+// helper so multiple tests can share the setup without copying the
+// boilerplate.
+func openHelloWorldDoc() (Document, error) {
+	// We import testdata indirectly via Open: that means this test
+	// can't live in the _test package because testdata is in a sub-
+	// directory and we're already in the pdftable package. Build a
+	// dependency-free fixture inline instead — same structure as
+	// testdata.Hello().
+	return OpenBytes(helloBytes())
+}
+
+// helloBytes returns the same PDF as testdata.Hello(), inlined so this
+// _test.go file inside the pdftable package doesn't pull in a
+// dependency on testdata. (testdata/fixtures.go is in a sub-package and
+// importing it here would cause a cycle.)
+func helloBytes() []byte {
+	return buildSinglePageBytes(`BT
+/F1 12 Tf
+72 720 Td
+(Hello, world!) Tj
+ET
+`)
+}
+
+// buildSinglePageBytes builds a minimal single-page PDF whose content
+// stream is the given text. Same layout as testdata.BuildSinglePage()
+// but inlined to avoid the import cycle.
+func buildSinglePageBytes(content string) []byte {
+	const header = "%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
+	objects := []string{
+		`<< /Type /Catalog /Pages 2 0 R >>`,
+		`<< /Type /Pages /Kids [3 0 R] /Count 1 >>`,
+		`<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> /ProcSet [/PDF /Text] >> /Contents 5 0 R >>`,
+		`<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>`,
+		"",
+	}
+	streamBody := []byte(content)
+	objects[4] = "<< /Length " + itoa(len(streamBody)) + " >>\nstream\n" + string(streamBody) + "endstream"
+
+	var sb strings.Builder
+	sb.WriteString(header)
+	offsets := make([]int, len(objects))
+	for i, body := range objects {
+		offsets[i] = sb.Len()
+		sb.WriteString(itoa(i+1) + " 0 obj\n" + body + "\nendobj\n")
+	}
+	xrefPos := sb.Len()
+	sb.WriteString("xref\n0 " + itoa(len(objects)+1) + "\n")
+	sb.WriteString("0000000000 65535 f \n")
+	for _, off := range offsets {
+		sb.WriteString(pad10(off) + " 00000 n \n")
+	}
+	sb.WriteString("trailer\n")
+	sb.WriteString("<< /Size " + itoa(len(objects)+1) + " /Root 1 0 R >>\n")
+	sb.WriteString("startxref\n" + itoa(xrefPos) + "\n%%EOF\n")
+	return []byte(sb.String())
+}
+
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	var buf [20]byte
+	i := len(buf)
+	for n > 0 {
+		i--
+		buf[i] = byte('0' + n%10)
+		n /= 10
+	}
+	return string(buf[i:])
+}
+
+func pad10(n int) string {
+	s := itoa(n)
+	if len(s) >= 10 {
+		return s
+	}
+	return strings.Repeat("0", 10-len(s)) + s
+}
+
+func approxFloat(a, b, tol float64) bool {
+	return math.Abs(a-b) <= tol
+}