hallelx2 · hallelx2 · May 26, 2026 · May 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,51 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.0] - 2026-05-26
+
+Phase 1.3.B — words and text extraction. Direct port of pdfplumber's
+`WordExtractor`, `extract_text`, and `extract_text_simple`. The release is
+additive: existing v0.0.1 callers compile and run as-is, but any external
+implementation of the `Page` interface (e.g. a test fake) must add the new methods.
+
+### Added
+
+- `Page.Words(opts WordOpts) ([]Word, error)` — extract positioned
+  text runs. Each `Word` carries `Text`, `X0/Y0/X1/Y1` bbox,
+  `Upright`, `Direction` (ltr/rtl/ttb/btt), `FontName`, `FontSize`,
+  and an optional `Chars` slice (when `WordOpts.KeepChars=true`).
+- `Page.ExtractText(opts TextOpts) (string, error)` — page text as a
+  single string. Supports both dense (`Layout=false`, the default)
+  and layout-preserving (`Layout=true`) modes. The layout mode emits
+  a fixed-width grid mimicking `pdftotext -layout` / pdfplumber's
+  `extract_text(layout=True)`.
+- `Page.ExtractTextSimple(xTolerance, yTolerance float64) (string, error)` —
+  no-frills extraction baseline (ports pdfplumber's
+  `extract_text_simple`).
+- `WordOpts` / `TextOpts` option structs with `DefaultWordOpts()` /
+  `DefaultTextOpts()` constructors carrying pdfplumber-matching
+  defaults (XTolerance=3, YTolerance=3, Expand=true).
+- `BBox` value type with `Union`, `Intersect`, `Contains`, `Snap`,
+  `MergeBBoxes`, `BBoxOfChar`, `BBoxOfChars` helpers.
+- Internal clustering primitives in `clustering.go`:
+  `clusterFloat1D`, `makeClusterDict`, `clusterObjects[T]`,
+  `groupObjectsByAttr[T,K]`, `dedupeChars`. Ports of
+  pdfplumber/utils/clustering.py.
+- Ligature expansion table (ﬁ→fi, ﬂ→fl, ﬀ→ff, ﬃ→ffi, ﬄ→ffl, ﬅ/ﬆ→st).
+- Golden-file parity tests against pdfplumber output on three
+  fixtures (hello.pdf, rules.pdf, simple1.pdf). Regenerate via
+  `python scripts/gen_golden.py`.
+
+### Known limitations
+
+- Word bboxes drift by up to ~10 PDF points from pdfplumber's output
+  on standard-14 fonts because the AFM metrics aren't yet bundled.
+  Word text, count, and order match exactly. Bundling the AFM metrics
+  is a v0.2.x goal.
+- `extract_text_lines` (regex-based line extraction) is not yet
+  ported.
+- `TextMap.search` is not yet ported.
+
 ## [0.0.1] - 2026-05-26
 
 Initial release. Phase 1.3.A — content-stream primitives layer.
@@ -51,4 +96,5 @@ Initial release. Phase 1.3.A — content-stream primitives layer.
 - Type 3 fonts (their glyph procedures are themselves content streams).
 - Vertical writing mode.
 
+[0.1.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.0
 [0.0.1]: https://github.com/hallelx2/pdftable/releases/tag/v0.0.1
diff --git a/README.md b/README.md
@@ -19,9 +19,9 @@ heuristics on. This is that.
 
 ## Status
 
-`v0.0.1` — content-stream primitives layer. The public API surface is
-stable; higher-level operations (`ExtractText`, `FindTables`,
-`ExtractTables`) are coming in subsequent releases.
+`v0.1.0` — words and text extraction. `Page.Words`, `Page.ExtractText`,
+and `Page.ExtractTextSimple` ship with this release; table-finding
+(`FindTables`, `ExtractTables`) is the next phase.
 
 [![Go Reference](https://pkg.go.dev/badge/github.com/hallelx2/pdftable.svg)](https://pkg.go.dev/github.com/hallelx2/pdftable)
 [![CI](https://github.com/hallelx2/pdftable/actions/workflows/test.yml/badge.svg)](https://github.com/hallelx2/pdftable/actions/workflows/test.yml)
@@ -30,7 +30,7 @@ stable; higher-level operations (`ExtractText`, `FindTables`,
 ## Install
 
 ```sh
-go get github.com/hallelx2/pdftable@v0.0.1
+go get github.com/hallelx2/pdftable@v0.1.0
 ```
 
 Requires Go 1.25+ (uses the standard-library `iter` package for the `Pages()` range-over-func iterator, and pdfcpu v0.12+).
@@ -55,19 +55,28 @@ func main() {
     defer doc.Close()
 
     for n, page := range doc.Pages() {
+        // Primitives (v0.0.1).
         chars, _ := page.Chars()
         rects, _ := page.Rects()
         lines, _ := page.Lines()
         fmt.Printf("page %d: %d chars, %d rects, %d lines\n",
             n, len(chars), len(rects), len(lines))
 
-        // Each Char carries its own bbox, font name, font size, and
-        // upright flag — feed them to your own layout algorithm.
-        for _, c := range chars[:min(5, len(chars))] {
-            fmt.Printf("  %q at (%.1f, %.1f) - (%.1f, %.1f) %s %.1fpt\n",
-                c.Text, c.X0, c.Y0, c.X1, c.Y1, c.FontName, c.FontSize)
+        // Words and text extraction (v0.1.0).
+        words, _ := page.Words(pdftable.DefaultWordOpts())
+        text, _ := page.ExtractText(pdftable.DefaultTextOpts())
+        fmt.Printf("  %d words; first line: %q\n",
+            len(words), firstLine(text))
+    }
+}
+
+func firstLine(s string) string {
+    for i, r := range s {
+        if r == '\n' {
+            return s[:i]
         }
     }
+    return s
 }
 ```
 
@@ -97,6 +106,11 @@ type Page interface {
     Rects() ([]Rect, error)
     Curves() ([]Curve, error)
     Objects() (Objects, error)
+
+    // New in v0.1.0: word + text extraction.
+    Words(opts WordOpts) ([]Word, error)
+    ExtractText(opts TextOpts) (string, error)
+    ExtractTextSimple(xTolerance, yTolerance float64) (string, error)
 }
 
 // Primitives.
@@ -117,6 +131,45 @@ type Curve struct { Points [][2]float64; Stroke, Fill bool; Width float64 }
 
 type Objects struct { Chars []Char; Lines []Line; Rects []Rect; Curves []Curve }
 
+// Word (new in v0.1.0).
+type Word struct {
+    Text                string
+    X0, Y0, X1, Y1      float64
+    Upright             bool
+    Direction           string // "ltr" | "rtl" | "ttb" | "btt"
+    FontName            string
+    FontSize            float64
+    Chars               []Char // populated when WordOpts.KeepChars=true
+}
+
+// WordOpts: configure Page.Words. Use DefaultWordOpts() for pdfplumber-matching defaults.
+type WordOpts struct {
+    XTolerance         float64 // default 3
+    YTolerance         float64 // default 3
+    KeepBlankChars     bool
+    UseTextFlow        bool
+    HorizontalLTR      bool   // default true
+    VerticalTTB        bool   // default true
+    ExtraAttrs         []string
+    SplitAtPunctuation bool
+    Expand             bool   // ligature expansion; default true
+    KeepChars          bool
+}
+
+// TextOpts: configure Page.ExtractText. Use DefaultTextOpts() for defaults.
+type TextOpts struct {
+    XTolerance, YTolerance       float64
+    Layout                       bool
+    LayoutWidthChars             int
+    LayoutHeightChars            int
+    XDensity, YDensity           float64 // PDF points per character / per line
+    UseTextFlow                  bool
+    HorizontalLTR                bool
+    VerticalTTB                  bool
+    ExtraAttrs                   []string
+    Expand                       bool
+}
+
 // Sentinel errors.
 var (
     ErrInvalidPDF     = errors.New("pdftable: invalid PDF")
@@ -126,6 +179,33 @@ var (
 )
 ```
 
+## Text extraction
+
+```go
+doc, _ := pdftable.OpenFile("report.pdf")
+defer doc.Close()
+page, _ := doc.Page(1)
+
+// Words: each Word is a contiguous text run.
+words, _ := page.Words(pdftable.DefaultWordOpts())
+for _, w := range words {
+    fmt.Printf("%-20s @ (%.1f, %.1f) %s %.1fpt\n",
+        w.Text, w.X0, w.Y0, w.FontName, w.FontSize)
+}
+
+// ExtractText: all text on the page as one string. The default dense mode
+// (Layout=false) joins words with spaces and lines with "\n".
+text, _ := page.ExtractText(pdftable.DefaultTextOpts())
+fmt.Println(text)
+
+// Layout-preserving extraction emulates `pdftotext -layout` / pdfplumber's
+// extract_text(layout=True) — column-aligned output suitable for forms.
+opts := pdftable.DefaultTextOpts()
+opts.Layout = true
+laid, _ := page.ExtractText(opts)
+fmt.Println(laid)
+```
+
 ## Side-by-side comparison with pdfplumber
 
 ```python
@@ -134,8 +214,9 @@ import pdfplumber
 
 with pdfplumber.open("report.pdf") as pdf:
     page = pdf.pages[0]
-    for char in page.chars:
-        print(char["text"], char["x0"], char["y0"])
+    for word in page.extract_words(x_tolerance=3, y_tolerance=3):
+        print(word["text"], word["x0"], word["top"])
+    print(page.extract_text())
 ```
 
 ```go
@@ -145,10 +226,14 @@ import "github.com/hallelx2/pdftable"
 doc, _ := pdftable.OpenFile("report.pdf")
 defer doc.Close()
 page, _ := doc.Page(1)
-chars, _ := page.Chars()
-for _, c := range chars {
-    fmt.Println(c.Text, c.X0, c.Y0)
+
+words, _ := page.Words(pdftable.DefaultWordOpts())
+for _, w := range words {
+    // pdftable's Y is PDF user-space (origin bottom-left); pdfplumber's "top" is page.Height() - w.Y1.
+    fmt.Println(w.Text, w.X0, page.Height()-w.Y1)
 }
+text, _ := page.ExtractText(pdftable.DefaultTextOpts())
+fmt.Println(text)
 ```
 
 Three differences worth noting:
@@ -158,10 +243,52 @@ Three differences worth noting:
    pdfplumber compensates). Our `Page(1)` is the same first page.
 2. **Coordinates are in PDF user space with origin at bottom-left**.
    pdfplumber by default reports `top` (origin top-left, Y growing down)
-   on its chars; we report `Y0` / `Y1` in PDF native coordinates. The
-   conversion is `top = mediabox.height - Y1`.
-3. **No layout-analysis methods yet**. `extract_text`, `extract_tables`,
-   `find_tables` are coming in later releases.
+   on its chars and words; we report `Y0` / `Y1` in PDF native
+   coordinates. The conversion is `top = page.Height() - Y1`.
+3. **Options are explicit Go structs, not `**kwargs`**. Build a
+   `WordOpts` / `TextOpts`, override the fields you care about, pass
+   it through. `DefaultWordOpts()` / `DefaultTextOpts()` return
+   pdfplumber-matching defaults.
+
+## Parity with pdfplumber
+
+The word-grouping and text-extraction algorithms are direct ports of
+pdfplumber's `WordExtractor` and `extract_text` (see
+[`pdfplumber/utils/text.py`](https://github.com/jsvine/pdfplumber/blob/main/pdfplumber/utils/text.py)).
+Tests in [`golden_test.go`](golden_test.go) compare the Go output
+against pdfplumber's reference output on shared fixture PDFs.
+
+Behaviours that match exactly:
+
+- Word grouping: same line-cluster-then-merge-by-gap algorithm, same
+  defaults (XTolerance=3, YTolerance=3), same handling of blank-char
+  filtering, ligature expansion (ﬁ→fi, etc.), and split-at-punctuation.
+- Ordering: words returned in pdfplumber's order (top-to-bottom, then
+  left-to-right within each line) when UseTextFlow is false.
+- Direction handling: ltr / rtl / ttb / btt mapping from
+  upright + HorizontalLTR + VerticalTTB.
+
+Behaviours that currently differ:
+
+- **Position precision drifts when font metrics aren't bundled**.
+  pdfplumber uses pdfminer.six's AFM tables for the standard 14 fonts;
+  we use a default-width fallback for now. Word text and order match
+  exactly; word bboxes drift by up to ~10 PDF points on glyphs whose
+  width isn't in the PDF's /Widths array. Golden tests assert text
+  parity exactly and position parity within a 15-point envelope; the
+  envelope tightens to <1pt once the AFM bundle lands (planned for
+  v0.2.x).
+- **`Layout=true` output is structurally similar but not byte-equal**.
+  The layout algorithm in pdfplumber drifts from version to version; we
+  produce a column-aligned grid with the same density defaults but
+  don't promise byte-equal output across pdfplumber releases.
+
+Behaviours not yet ported:
+
+- `extract_text_lines` (regex-based line extraction).
+- `search` on TextMap (regex over assembled page text with char-level
+  match back-references).
+- Per-character extra_attrs hooks beyond `fontname` and `size`.
 
 ## Architecture
 
@@ -171,6 +298,9 @@ pdftable/
 ├── pdf.go             // Document interface + implementation
 ├── page.go            // Page interface + implementation
 ├── char.go            // Public Char / Line / Rect / Curve / Objects
+├── text.go            // Word + ExtractText + ExtractTextSimple (v0.1.0)
+├── clustering.go      // 1-D clusterObjects, groupObjectsByAttr, dedupeChars
+├── geometry.go        // BBox helpers: Union, Intersect, Contains, Snap
 ├── errors.go          // Sentinel errors
 └── internal/pdf/
     ├── reader.go      // pdfcpu bridge
@@ -201,11 +331,13 @@ stdlib-only.
 
 ## Roadmap
 
-- `v0.0.x` — content-stream primitives (this release).
-- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`, word
-  grouping with reading-order sort.
+- `v0.0.x` — content-stream primitives.
+- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`,
+  `Page.ExtractTextSimple` (this release).
 - `v0.2.x` — table finding: `Page.FindTables` using ruling-line +
   whitespace heuristics, `Page.ExtractTables` returning row/cell text.
+  This phase also bundles the standard-14 AFM metrics so word bboxes
+  match pdfplumber to within 1 PDF point.
 - `v0.3.x` — performance pass: parser benchmarking against pdfminer.six
   and pdfplumber on a representative document corpus.