From a431cf4222135dea17a4feb94edc290c9b5e491e Mon Sep 17 00:00:00 2001 From: Halleluyah Oludele Date: Tue, 26 May 2026 23:51:18 +0100 Subject: [PATCH] v0.1.0: Words + ExtractText (Phase 1.3.B) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port pdfplumber's WordExtractor and extract_text into Go. Three new methods on the Page interface: - Page.Words(WordOpts) → []Word, error - Page.ExtractText(TextOpts) → string, error - Page.ExtractTextSimple(xt, yt) → string, error Each Word carries its bbox, font name/size, upright flag, and direction (ltr/rtl/ttb/btt), with an optional Chars slice when KeepChars=true. Supporting infrastructure: - geometry.go — BBox value type with Union/Intersect/Contains/Snap and MergeBBoxes helpers. - clustering.go — 1-D agglomerative clustering primitives (clusterFloat1D, clusterObjects[T], groupObjectsByAttr[T,K], dedupeChars). Ports of pdfplumber/utils/clustering.py. - text.go — Word + WordExtractor algorithm, dense and layout- preserving ExtractText paths, ligature expansion table. The Page interface is additive: v0.0.1 callers that only use Chars/Lines/Rects/Curves continue to compile and work unchanged. Tests: - geometry_test.go, clustering_test.go, text_test.go — table- driven unit tests for each primitive and each public entry point. - golden_test.go — parity tests against pdfplumber output on three fixture PDFs (hello, rules, simple1). Expected outputs in testdata/golden/*.expected.json, regenerable via scripts/gen_golden.py. Parity notes: - Word text, count, order, and direction match pdfplumber exactly. - Word bbox positions drift by up to ~10 PDF points on standard-14 fonts because the AFM metrics aren't yet bundled (planned for v0.2.x). The golden test tolerance is 15 points to absorb this. 
--- CHANGELOG.md | 46 ++ README.md | 174 ++++- clustering.go | 286 ++++++++ clustering_test.go | 207 ++++++ geometry.go | 158 +++++ geometry_test.go | 236 +++++++ golden_test.go | 225 ++++++ page.go | 26 + pdftable.go | 11 +- scripts/gen_golden.py | 90 +++ testdata/golden/hello.expected.json | 31 + testdata/golden/hello.pdf | Bin 0 -> 643 bytes testdata/golden/rules.expected.json | 12 + testdata/golden/rules.pdf | Bin 0 -> 790 bytes testdata/golden/simple1.expected.json | 229 ++++++ testdata/golden/simple1.pdf | Bin 0 -> 849 bytes text.go | 956 ++++++++++++++++++++++++++ text_test.go | 495 +++++++++++++ 18 files changed, 3156 insertions(+), 26 deletions(-) create mode 100644 clustering.go create mode 100644 clustering_test.go create mode 100644 geometry.go create mode 100644 geometry_test.go create mode 100644 golden_test.go create mode 100644 scripts/gen_golden.py create mode 100644 testdata/golden/hello.expected.json create mode 100644 testdata/golden/hello.pdf create mode 100644 testdata/golden/rules.expected.json create mode 100644 testdata/golden/rules.pdf create mode 100644 testdata/golden/simple1.expected.json create mode 100644 testdata/golden/simple1.pdf create mode 100644 text.go create mode 100644 text_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 98dce6b..0a50bcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,51 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.0] - 2026-05-26 + +Phase 1.3.B — words and text extraction. Direct port of pdfplumber's +`WordExtractor`, `extract_text`, `extract_text_simple`. The v0.0.1 +public API surface is unchanged; v0.1.0 only adds methods to the +`Page` interface, so existing callers compile and run as-is. + +### Added + +- `Page.Words(opts WordOpts) ([]Word, error)` — extract positioned + text runs. 
Each `Word` carries `Text`, `X0/Y0/X1/Y1` bbox, + `Upright`, `Direction` (ltr/rtl/ttb/btt), `FontName`, `FontSize`, + and an optional `Chars` slice (when `WordOpts.KeepChars=true`). +- `Page.ExtractText(opts TextOpts) (string, error)` — page text as a + single string. Supports both dense (`Layout=false`, the default) + and layout-preserving (`Layout=true`) modes. The layout mode emits + a fixed-width grid mimicking `pdftotext -layout` / pdfplumber's + `extract_text(layout=True)`. +- `Page.ExtractTextSimple(xTolerance, yTolerance float64) (string, error)` — + no-frills extraction baseline (ports pdfplumber's + `extract_text_simple`). +- `WordOpts` / `TextOpts` option structs with `DefaultWordOpts()` / + `DefaultTextOpts()` constructors carrying pdfplumber-matching + defaults (XTolerance=3, YTolerance=3, Expand=true). +- `BBox` value type with `Union`, `Intersect`, `Contains`, `Snap`, + `MergeBBoxes`, `BBoxOfChar`, `BBoxOfChars` helpers. +- Internal clustering primitives in `clustering.go`: + `clusterFloat1D`, `makeClusterDict`, `clusterObjects[T]`, + `groupObjectsByAttr[T,K]`, `dedupeChars`. Ports of + pdfplumber/utils/clustering.py. +- Ligature expansion table for U+FB00–U+FB06 (ﬁ, ﬂ, ﬀ, ﬃ, ﬄ, ﬅ, ﬆ → fi/fl/ff/ffi/ffl/st). +- Golden-file parity tests against pdfplumber output on three + fixtures (hello.pdf, rules.pdf, simple1.pdf). Regenerate via + `python scripts/gen_golden.py`. + +### Known limitations + +- Word bboxes drift by up to ~10 PDF points from pdfplumber's output + on standard-14 fonts because the AFM metrics aren't yet bundled. + Word text + count + order match exactly. The AFM bundle is a v0.2.x + goal. +- `extract_text_lines` (regex-based line extraction) is not yet + ported. +- `TextMap.search` is not yet ported. + ## [0.0.1] - 2026-05-26 Initial release. Phase 1.3.A — content-stream primitives layer. @@ -51,4 +96,5 @@ Initial release. Phase 1.3.A — content-stream primitives layer. - Type 3 fonts (their glyph procedures are themselves content streams). 
- Vertical writing mode. +[0.1.0]: https://github.com/hallelx2/pdftable/releases/tag/v0.1.0 [0.0.1]: https://github.com/hallelx2/pdftable/releases/tag/v0.0.1 diff --git a/README.md b/README.md index 44842cd..64642dd 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ heuristics on. This is that. ## Status -`v0.0.1` — content-stream primitives layer. The public API surface is -stable; higher-level operations (`ExtractText`, `FindTables`, -`ExtractTables`) are coming in subsequent releases. +`v0.1.0` — words and text extraction. `Page.Words`, `Page.ExtractText`, +and `Page.ExtractTextSimple` ship with this release; table-finding +(`FindTables`, `ExtractTables`) is the next phase. [![Go Reference](https://pkg.go.dev/badge/github.com/hallelx2/pdftable.svg)](https://pkg.go.dev/github.com/hallelx2/pdftable) [![CI](https://github.com/hallelx2/pdftable/actions/workflows/test.yml/badge.svg)](https://github.com/hallelx2/pdftable/actions/workflows/test.yml) @@ -30,7 +30,7 @@ stable; higher-level operations (`ExtractText`, `FindTables`, ## Install ```sh -go get github.com/hallelx2/pdftable@v0.0.1 +go get github.com/hallelx2/pdftable@v0.1.0 ``` Requires Go 1.25+ (uses the standard-library `iter` package for the `Pages()` range-over-func iterator, and pdfcpu v0.12+). @@ -55,19 +55,28 @@ func main() { defer doc.Close() for n, page := range doc.Pages() { + // Primitives (v0.0.1). chars, _ := page.Chars() rects, _ := page.Rects() lines, _ := page.Lines() fmt.Printf("page %d: %d chars, %d rects, %d lines\n", n, len(chars), len(rects), len(lines)) - // Each Char carries its own bbox, font name, font size, and - // upright flag — feed them to your own layout algorithm. - for _, c := range chars[:min(5, len(chars))] { - fmt.Printf(" %q at (%.1f, %.1f) - (%.1f, %.1f) %s %.1fpt\n", - c.Text, c.X0, c.Y0, c.X1, c.Y1, c.FontName, c.FontSize) + // Words and text extraction (v0.1.0). 
+ words, _ := page.Words(pdftable.DefaultWordOpts()) + text, _ := page.ExtractText(pdftable.DefaultTextOpts()) + fmt.Printf(" %d words; first line: %q\n", + len(words), firstLine(text)) + } +} + +func firstLine(s string) string { + for i, r := range s { + if r == '\n' { + return s[:i] } } + return s } ``` @@ -97,6 +106,11 @@ type Page interface { Rects() ([]Rect, error) Curves() ([]Curve, error) Objects() (Objects, error) + + // New in v0.1.0: word + text extraction. + Words(opts WordOpts) ([]Word, error) + ExtractText(opts TextOpts) (string, error) + ExtractTextSimple(xTolerance, yTolerance float64) (string, error) } // Primitives. @@ -117,6 +131,45 @@ type Curve struct { Points [][2]float64; Stroke, Fill bool; Width float64 } type Objects struct { Chars []Char; Lines []Line; Rects []Rect; Curves []Curve } +// Word (new in v0.1.0). +type Word struct { + Text string + X0, Y0, X1, Y1 float64 + Upright bool + Direction string // "ltr" | "rtl" | "ttb" | "btt" + FontName string + FontSize float64 + Chars []Char // populated when WordOpts.KeepChars=true +} + +// WordOpts: configure Page.Words. Use DefaultWordOpts() for pdfplumber-matching defaults. +type WordOpts struct { + XTolerance float64 // default 3 + YTolerance float64 // default 3 + KeepBlankChars bool + UseTextFlow bool + HorizontalLTR bool // default true + VerticalTTB bool // default true + ExtraAttrs []string + SplitAtPunctuation bool + Expand bool // ligature expansion; default true + KeepChars bool +} + +// TextOpts: configure Page.ExtractText. Use DefaultTextOpts() for defaults. +type TextOpts struct { + XTolerance, YTolerance float64 + Layout bool + LayoutWidthChars int + LayoutHeightChars int + XDensity, YDensity float64 // PDF points per character / per line + UseTextFlow bool + HorizontalLTR bool + VerticalTTB bool + ExtraAttrs []string + Expand bool +} + // Sentinel errors. 
var ( ErrInvalidPDF = errors.New("pdftable: invalid PDF") @@ -126,6 +179,33 @@ var ( ) ``` +## Text extraction + +```go +doc, _ := pdftable.OpenFile("report.pdf") +defer doc.Close() +page, _ := doc.Page(1) + +// Words: each Word is a contiguous text run. +words, _ := page.Words(pdftable.DefaultWordOpts()) +for _, w := range words { + fmt.Printf("%-20s @ (%.1f, %.1f) %s %.1fpt\n", + w.Text, w.X0, w.Y0, w.FontName, w.FontSize) +} + +// ExtractText: all text on the page as one string. Dense (no layout) +// joins words with spaces and lines with "\n". +text, _ := page.ExtractText(pdftable.DefaultTextOpts()) +fmt.Println(text) + +// Layout-preserving extraction emulates `pdftotext -layout` / pdfplumber's +// extract_text(layout=True) — column-aligned output suitable for forms. +opts := pdftable.DefaultTextOpts() +opts.Layout = true +laid, _ := page.ExtractText(opts) +fmt.Println(laid) +``` + ## Side-by-side comparison with pdfplumber ```python @@ -134,8 +214,9 @@ import pdfplumber with pdfplumber.open("report.pdf") as pdf: page = pdf.pages[0] - for char in page.chars: - print(char["text"], char["x0"], char["y0"]) + for word in page.extract_words(x_tolerance=3, y_tolerance=3): + print(word["text"], word["x0"], word["top"]) + print(page.extract_text()) ``` ```go @@ -145,10 +226,14 @@ import "github.com/hallelx2/pdftable" doc, _ := pdftable.OpenFile("report.pdf") defer doc.Close() page, _ := doc.Page(1) -chars, _ := page.Chars() -for _, c := range chars { - fmt.Println(c.Text, c.X0, c.Y0) + +words, _ := page.Words(pdftable.DefaultWordOpts()) +for _, w := range words { + // pdftable's Y is PDF user-space (origin bottom-left). The + // pdfplumber-equivalent "top" is page.Height() - w.Y1. + fmt.Println(w.Text, w.X0, page.Height()-w.Y1) } +fmt.Println(must(page.ExtractText(pdftable.DefaultTextOpts()))) ``` Three differences worth noting: @@ -158,10 +243,52 @@ Three differences worth noting: pdfplumber compensates). Our `Page(1)` is the same first page. 2. 
**Coordinates are in PDF user space with origin at bottom-left**. pdfplumber by default reports `top` (origin top-left, Y growing down) - on its chars; we report `Y0` / `Y1` in PDF native coordinates. The - conversion is `top = mediabox.height - Y1`. -3. **No layout-analysis methods yet**. `extract_text`, `extract_tables`, - `find_tables` are coming in later releases. + on its chars and words; we report `Y0` / `Y1` in PDF native + coordinates. The conversion is `top = page.Height() - Y1`. +3. **Options are explicit Go structs, not `**kwargs`**. Build a + `WordOpts` / `TextOpts`, override the fields you care about, pass + it through. `DefaultWordOpts()` / `DefaultTextOpts()` return + pdfplumber-matching defaults. + +## Parity with pdfplumber + +The word-grouping and text-extraction algorithms are direct ports of +pdfplumber's `WordExtractor` and `extract_text` (see +[`pdfplumber/utils/text.py`](https://github.com/jsvine/pdfplumber/blob/main/pdfplumber/utils/text.py)). +Tests in [`golden_test.go`](golden_test.go) compare the Go output +against pdfplumber's reference output on shared fixture PDFs. + +Behaviours that match exactly: + +- Word grouping: same line-cluster-then-merge-by-gap algorithm, same + defaults (XTolerance=3, YTolerance=3), same handling of blank-char + filtering, ligature expansion (the U+FB01 ligature ﬁ→fi, etc.), and split-at-punctuation. +- Ordering: words returned in pdfplumber's order (top-to-bottom, then + left-to-right within each line) when UseTextFlow is false. +- Direction handling: ltr / rtl / ttb / btt mapping from + upright + HorizontalLTR + VerticalTTB. + +Behaviours that intentionally differ: + +- **Position precision drifts when font metrics aren't bundled**. + pdfplumber uses pdfminer.six's AFM tables for the standard 14 fonts; + we use a default-width fallback for now. Word text and order match + exactly; word bboxes drift by up to ~10 PDF points on glyphs whose + width isn't in the PDF's /Widths array. 
Golden tests assert text + parity exactly and position parity within a 15-point envelope; the + envelope tightens to <1pt once the AFM bundle lands (planned for + v0.2.x). +- **`Layout=true` output is structurally similar but not byte-equal**. + Pdfplumber's layout algorithm has version-to-version drift; we + produce a column-aligned grid with the same density defaults but + don't promise byte-equal output across pdfplumber releases. + +Behaviours not yet ported: + +- `extract_text_lines` (regex-based line extraction). +- `search` on TextMap (regex over assembled page text with char-level + match back-references). +- Per-character extra_attrs hooks beyond `fontname` and `size`. ## Architecture @@ -171,6 +298,9 @@ pdftable/ ├── pdf.go // Document interface + implementation ├── page.go // Page interface + implementation ├── char.go // Public Char / Line / Rect / Curve / Objects +├── text.go // Word + ExtractText + ExtractTextSimple (v0.1.0) +├── clustering.go // 1-D clusterObjects, groupObjectsByAttr, dedupeChars +├── geometry.go // BBox helpers: Union, Intersect, Contains, Snap ├── errors.go // Sentinel errors └── internal/pdf/ ├── reader.go // pdfcpu bridge @@ -201,11 +331,13 @@ stdlib-only. ## Roadmap -- `v0.0.x` — content-stream primitives (this release). -- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`, word - grouping with reading-order sort. +- `v0.0.x` — content-stream primitives. +- `v0.1.x` — text extraction: `Page.ExtractText`, `Page.Words`, + `Page.ExtractTextSimple` (this release). - `v0.2.x` — table finding: `Page.FindTables` using ruling-line + whitespace heuristics, `Page.ExtractTables` returning row/cell text. + Bundle the standard-14 AFM metrics so word bboxes match pdfplumber + to within 1 PDF point. - `v0.3.x` — performance pass: parser benchmarking against pdfminer.six and pdfplumber on a representative document corpus. 
diff --git a/clustering.go b/clustering.go new file mode 100644 index 0000000..0431bd8 --- /dev/null +++ b/clustering.go @@ -0,0 +1,286 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable + +import ( + "math" + "sort" +) + +// This file is the Go port of pdfplumber/utils/clustering.py. The shape +// is the same — a 1-D agglomerative clusterer over a key extracted from +// each input — but the API uses generics so callers don't have to wrap +// every value in a dict-of-strings. +// +// The clustering primitives here are the load-bearing piece of the +// text-extraction pipeline: words are formed by clustering chars whose +// Y position is "close enough" (within YTolerance), and similarly for +// the layout grid that maps chars onto a fixed-width column. + +// clusterFloat1D buckets sorted, deduped floats into clusters where +// each consecutive pair differs by <= tolerance. The output groups +// preserve the SORTED order of input — callers that need the original +// input order should use clusterObjects with preserveOrder=true. +// +// This is the workhorse pdfplumber calls cluster_list. +func clusterFloat1D(xs []float64, tolerance float64) [][]float64 { + if len(xs) == 0 { + return nil + } + sorted := make([]float64, len(xs)) + copy(sorted, xs) + sort.Float64s(sorted) + + if tolerance == 0 { + // Special-case to match pdfplumber: each value gets its own + // singleton cluster. Without this branch we'd still cluster + // equal values together, which differs from pdfplumber's + // "tolerance==0 → no clustering" semantics. 
+ out := make([][]float64, len(sorted)) + for i, v := range sorted { + out[i] = []float64{v} + } + return out + } + + var groups [][]float64 + current := []float64{sorted[0]} + last := sorted[0] + for _, v := range sorted[1:] { + if v <= last+tolerance { + current = append(current, v) + } else { + groups = append(groups, current) + current = []float64{v} + } + last = v + } + groups = append(groups, current) + return groups +} + +// makeClusterDict returns a map from each unique input value to the +// integer cluster id it lands in. Used internally by clusterObjects to +// translate per-object keys into group indices. +// +// We dedupe the input on the way in (pdfplumber does the same with +// `set(values)`) so that callers passing 10k chars don't pay 10k× +// sort-and-compare cost when many chars share the same key. +func makeClusterDict(values []float64, tolerance float64) map[float64]int { + if len(values) == 0 { + return map[float64]int{} + } + seen := make(map[float64]struct{}, len(values)) + uniq := make([]float64, 0, len(values)) + for _, v := range values { + if _, ok := seen[v]; ok { + continue + } + seen[v] = struct{}{} + uniq = append(uniq, v) + } + clusters := clusterFloat1D(uniq, tolerance) + out := make(map[float64]int, len(uniq)) + for i, c := range clusters { + for _, v := range c { + out[v] = i + } + } + return out +} + +// clusterObjects groups xs by the float key returned from keyFn, +// bucketing keys that differ by <= tolerance into the same cluster. +// +// preserveOrder=false (the default in pdfplumber) emits clusters in +// ascending-key order; xs WITHIN a cluster keep their relative input +// order. preserveOrder=true keeps the original input order at both +// levels — which is what UseTextFlow text extraction wants, since the +// content stream's order conveys reading order in many PDFs. 
With +// preserveOrder=true, a NEW group is started every time consecutive +// items have different cluster ids (matching pdfplumber's +// itertools.groupby on un-sorted cluster_tuples). +// +// The function is generic over T so callers can cluster Chars, Words, +// or any other domain type without an interface{} cast on every +// element. +func clusterObjects[T any](xs []T, keyFn func(T) float64, tolerance float64, preserveOrder bool) [][]T { + if len(xs) == 0 { + return nil + } + + keys := make([]float64, len(xs)) + for i, x := range xs { + keys[i] = keyFn(x) + } + dict := makeClusterDict(keys, tolerance) + + type indexed struct { + x T + cid int + } + buf := make([]indexed, len(xs)) + for i, x := range xs { + buf[i] = indexed{x: x, cid: dict[keys[i]]} + } + + if !preserveOrder { + // Sort by cluster id; SliceStable so ties keep input order. + sort.SliceStable(buf, func(i, j int) bool { + return buf[i].cid < buf[j].cid + }) + } + + // Emit a new group whenever the cluster id changes from the + // previous entry. With preserveOrder=true this matches Python's + // itertools.groupby on un-sorted cluster_tuples. + var out [][]T + current := []T{buf[0].x} + currentID := buf[0].cid + for _, e := range buf[1:] { + if e.cid == currentID { + current = append(current, e.x) + } else { + out = append(out, current) + current = []T{e.x} + currentID = e.cid + } + } + out = append(out, current) + return out +} + +// groupObjectsByAttr splits xs into runs of CONSECUTIVE elements that +// share the exact same value for the comparable key returned by keyFn. +// Input order is preserved throughout; a key that reappears after a +// different key starts a NEW group. This is the Go port of pdfplumber's itertools.groupby +// on (upright, *extra_attrs) — the outer grouping step in +// iter_extract_tuples. +// +// Unlike clusterObjects, this is an EXACT match on the key, not a +// tolerance-based clustering. The two are intentionally different +// operations and we name them differently to avoid confusion. 
+func groupObjectsByAttr[T any, K comparable](xs []T, keyFn func(T) K) [][]T { + if len(xs) == 0 { + return nil + } + var out [][]T + current := []T{xs[0]} + currentKey := keyFn(xs[0]) + for _, x := range xs[1:] { + k := keyFn(x) + if k == currentKey { + current = append(current, x) + } else { + out = append(out, current) + current = []T{x} + currentKey = k + } + } + out = append(out, current) + return out +} + +// dedupeChars removes near-duplicate chars (same text + position within +// tolerance). The classic case is a PDF that draws each glyph twice +// for an emboss/shadow effect — text extraction should report one +// glyph, not two. Mirrors pdfplumber.utils.text.dedupe_chars, with +// the same extra_attrs hook for letting callers tighten/loosen the +// "what counts as a duplicate" predicate. +// +// Supported extra_attrs: "fontname", "size". Other values are +// ignored (pdfplumber accepts any attribute name and indexes into the +// char dict; our Char struct doesn't have arbitrary keys, so we +// surface the two attrs callers actually use). +// +// The output is in the SAME ORDER as the input — the first occurrence +// of each cluster is kept and subsequent duplicates are dropped. This +// preserves content-stream order, which downstream code may rely on. +func dedupeChars(chars []Char, tolerance float64, extraAttrs []string) []Char { + if len(chars) == 0 { + return nil + } + + // Group key: (upright, text, *extra_attrs). We collapse to a + // string because (a) the key needs to be hashable for the equality + // check, and (b) the extra_attrs slice is variable. + keyOf := func(c Char) string { + buf := make([]byte, 0, 32+len(c.Text)+len(c.FontName)) + if c.Upright { + buf = append(buf, 'U') + } else { + buf = append(buf, 'u') + } + buf = append(buf, '\x00') + buf = append(buf, c.Text...) + for _, attr := range extraAttrs { + buf = append(buf, '\x00') + switch attr { + case "fontname": + buf = append(buf, c.FontName...) 
+ case "size": + bits := math.Float64bits(c.FontSize) + for i := 7; i >= 0; i-- { + buf = append(buf, byte(bits>>(i*8))) + } + } + } + return string(buf) + } + + type indexed struct { + c Char + idx int + } + sorted := make([]indexed, len(chars)) + for i, c := range chars { + sorted[i] = indexed{c: c, idx: i} + } + sort.SliceStable(sorted, func(i, j int) bool { + return keyOf(sorted[i].c) < keyOf(sorted[j].c) + }) + + keepIdx := make(map[int]struct{}, len(chars)) + + // Walk equal-key runs; within each run cluster by Y0 then X0. + for i := 0; i < len(sorted); { + j := i + 1 + k := keyOf(sorted[i].c) + for j < len(sorted) && keyOf(sorted[j].c) == k { + j++ + } + runChars := sorted[i:j] + yClusters := clusterObjects(runChars, func(e indexed) float64 { return e.c.Y0 }, tolerance, false) + for _, yc := range yClusters { + xClusters := clusterObjects(yc, func(e indexed) float64 { return e.c.X0 }, tolerance, false) + for _, xc := range xClusters { + // Keep the char with the smallest original index in + // the position bucket — preserves "first occurrence + // wins" semantics relative to content-stream order. + minIdx := xc[0].idx + for _, e := range xc[1:] { + if e.idx < minIdx { + minIdx = e.idx + } + } + keepIdx[minIdx] = struct{}{} + } + } + i = j + } + + out := make([]Char, 0, len(keepIdx)) + for i, c := range chars { + if _, ok := keepIdx[i]; ok { + out = append(out, c) + } + } + return out +} + +// float64Bits wraps math.Float64bits so other files and tests can get +// a float's IEEE-754 bit pattern without importing "math" themselves. +// NOTE: nothing in this file calls it yet — dedupeChars's key +// construction calls math.Float64bits directly. +func float64Bits(f float64) uint64 { return math.Float64bits(f) } diff --git a/clustering_test.go b/clustering_test.go new file mode 100644 index 0000000..a3b41ab --- /dev/null +++ b/clustering_test.go @@ -0,0 +1,207 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. 
+ +package pdftable + +import ( + "reflect" + "testing" +) + +func TestClusterFloat1D(t *testing.T) { + tests := []struct { + name string + in []float64 + tolerance float64 + want [][]float64 + }{ + { + name: "empty", + in: nil, + want: nil, + }, + { + name: "single", + in: []float64{42}, + tolerance: 5, + want: [][]float64{{42}}, + }, + { + name: "all within tolerance", + in: []float64{1, 2, 3, 4, 5}, + tolerance: 1.5, + want: [][]float64{{1, 2, 3, 4, 5}}, + }, + { + name: "out of order input gets sorted", + in: []float64{5, 1, 3, 4, 2}, + tolerance: 1.5, + want: [][]float64{{1, 2, 3, 4, 5}}, + }, + { + name: "two clusters", + in: []float64{1, 2, 10, 11, 12}, + tolerance: 2, + want: [][]float64{{1, 2}, {10, 11, 12}}, + }, + { + name: "tolerance 0 → singletons", + in: []float64{1, 1, 2, 2, 3}, + tolerance: 0, + want: [][]float64{{1}, {1}, {2}, {2}, {3}}, + }, + { + name: "chain growth (each in range of last)", + in: []float64{1, 1.4, 1.7, 2.0, 2.3, 2.6, 100}, + tolerance: 0.5, + want: [][]float64{{1, 1.4, 1.7, 2.0, 2.3, 2.6}, {100}}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := clusterFloat1D(tt.in, tt.tolerance) + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("got %v, want %v", got, tt.want) + } + }) + } +} + +func TestMakeClusterDict(t *testing.T) { + // Duplicates should be deduped before clustering, but the result + // map should contain an entry for every unique input value. + got := makeClusterDict([]float64{1, 1, 2, 10, 10, 11}, 2) + // Cluster 0 = {1, 2}, cluster 1 = {10, 11}. + if got[1] != got[2] { + t.Errorf("1 and 2 should be in same cluster, got %d vs %d", got[1], got[2]) + } + if got[10] != got[11] { + t.Errorf("10 and 11 should be in same cluster, got %d vs %d", got[10], got[11]) + } + if got[1] == got[10] { + t.Errorf("1 and 10 should be in different clusters, both got %d", got[1]) + } +} + +// TestClusterObjects exercises both preserveOrder modes on a tiny set +// of struct-valued inputs. 
+func TestClusterObjects(t *testing.T) { + type pt struct { + x float64 + tag string + } + xs := []pt{ + {x: 1, tag: "a"}, + {x: 10, tag: "b"}, + {x: 2, tag: "c"}, + {x: 11, tag: "d"}, + } + + // preserveOrder=false: clusters sorted by key, items sorted within + // by input order (a,c then b,d). + got := clusterObjects(xs, func(p pt) float64 { return p.x }, 5, false) + want := [][]pt{ + {{1, "a"}, {2, "c"}}, + {{10, "b"}, {11, "d"}}, + } + if !reflect.DeepEqual(got, want) { + t.Errorf("preserveOrder=false: got %v, want %v", got, want) + } + + // preserveOrder=true: groups in input order; new group whenever + // cluster id changes from previous entry. + // Input cluster ids: a=0, b=1, c=0, d=1 → groups: [a], [b], [c], [d]. + got = clusterObjects(xs, func(p pt) float64 { return p.x }, 5, true) + want = [][]pt{ + {{1, "a"}}, + {{10, "b"}}, + {{2, "c"}}, + {{11, "d"}}, + } + if !reflect.DeepEqual(got, want) { + t.Errorf("preserveOrder=true: got %v, want %v", got, want) + } +} + +func TestGroupObjectsByAttr(t *testing.T) { + type item struct { + key string + val int + } + xs := []item{ + {"a", 1}, + {"a", 2}, + {"b", 3}, + {"a", 4}, // restart "a" group — exact-match groupby preserves order + } + got := groupObjectsByAttr(xs, func(i item) string { return i.key }) + want := [][]item{ + {{"a", 1}, {"a", 2}}, + {{"b", 3}}, + {{"a", 4}}, + } + if !reflect.DeepEqual(got, want) { + t.Errorf("got %v, want %v", got, want) + } +} + +func TestDedupeCharsSimple(t *testing.T) { + // Two pairs of overlapping glyphs (shadow effect) drawn at near- + // identical positions. After dedupe we expect one of each pair. 
+ chars := []Char{ + {Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true}, + {Text: "A", X0: 0.2, Y0: 0.3, X1: 10.2, Y1: 10.3, FontName: "F", FontSize: 12, Upright: true}, + {Text: "B", X0: 11, Y0: 0, X1: 20, Y1: 10, FontName: "F", FontSize: 12, Upright: true}, + {Text: "B", X0: 11.1, Y0: 0.2, X1: 20.1, Y1: 10.2, FontName: "F", FontSize: 12, Upright: true}, + } + got := dedupeChars(chars, 1, []string{"fontname", "size"}) + if len(got) != 2 { + t.Fatalf("got %d chars after dedupe, want 2", len(got)) + } + // First occurrence should be kept. + if got[0].Text != "A" || got[0].X0 != 0 { + t.Errorf("first kept char = %+v, want first 'A'", got[0]) + } + if got[1].Text != "B" || got[1].X0 != 11 { + t.Errorf("second kept char = %+v, want first 'B'", got[1]) + } +} + +func TestDedupeCharsKeepsDifferentText(t *testing.T) { + // Two glyphs at identical positions but different text should NOT + // be deduped. + chars := []Char{ + {Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true}, + {Text: "B", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "F", FontSize: 12, Upright: true}, + } + got := dedupeChars(chars, 1, []string{"fontname", "size"}) + if len(got) != 2 { + t.Fatalf("got %d chars, want 2 (different text → keep both)", len(got)) + } +} + +func TestDedupeCharsKeepsDifferentFont(t *testing.T) { + // Same text at same position but different fontname → keep both + // when fontname is in extra_attrs. + chars := []Char{ + {Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "Helvetica", FontSize: 12, Upright: true}, + {Text: "A", X0: 0, Y0: 0, X1: 10, Y1: 10, FontName: "Times", FontSize: 12, Upright: true}, + } + got := dedupeChars(chars, 1, []string{"fontname"}) + if len(got) != 2 { + t.Fatalf("got %d chars, want 2 (different fontname → keep both)", len(got)) + } + + // Without fontname in extra_attrs → drop the duplicate. 
+ got = dedupeChars(chars, 1, nil) + if len(got) != 1 { + t.Fatalf("got %d chars, want 1 (no extra_attrs → dedupe by text+pos)", len(got)) + } +} + +func TestDedupeCharsEmpty(t *testing.T) { + if got := dedupeChars(nil, 1, nil); got != nil { + t.Errorf("dedupeChars(nil) = %v, want nil", got) + } +} diff --git a/geometry.go b/geometry.go new file mode 100644 index 0000000..7c06a33 --- /dev/null +++ b/geometry.go @@ -0,0 +1,158 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable + +import "math" + +// BBox is the canonical four-tuple bounding-box helper that the layout +// algorithms (clustering, word grouping, text extraction) operate on. +// Field naming follows the Char/Line/Rect convention used throughout +// the package: x0,y0 is the lower-left corner and x1,y1 is the upper- +// right corner in PDF user space (origin at bottom-left, Y growing up). +// +// We expose BBox as a value type — small, stack-allocatable, trivially +// copyable. Algorithms that need to pass a bbox around without poking +// at the larger Char/Rect/Line wrappers can construct one with NewBBox +// or pull one out with the BBoxOf helpers below. +// +// The Go API intentionally chooses (X0,Y0,X1,Y1) over pdfplumber's +// dict-of-strings ({"x0","top","x1","bottom"}). The two flavours differ +// because pdfplumber operates in image space (Y growing down, "top" = +// small Y, "bottom" = large Y) and we operate in PDF user space (Y +// growing up). Comments call out the mapping wherever it matters. +type BBox struct { + X0, Y0, X1, Y1 float64 +} + +// NewBBox builds a BBox and normalises it so X0<=X1 and Y0<=Y1. +// Algorithms downstream rely on the normal form, so we never let an +// inverted bbox leak past this constructor. +func NewBBox(x0, y0, x1, y1 float64) BBox { + if x1 < x0 { + x0, x1 = x1, x0 + } + if y1 < y0 { + y0, y1 = y1, y0 + } + return BBox{X0: x0, Y0: y0, X1: x1, Y1: y1} +} + +// Width returns the bbox's horizontal extent. 
+func (b BBox) Width() float64 { return b.X1 - b.X0 } + +// Height returns the bbox's vertical extent. +func (b BBox) Height() float64 { return b.Y1 - b.Y0 } + +// Area returns Width * Height. +func (b BBox) Area() float64 { return b.Width() * b.Height() } + +// IsZero reports whether the bbox is the zero value (all four fields +// equal to zero). Useful for "did I forget to populate this" checks. +func (b BBox) IsZero() bool { return b == BBox{} } + +// Union returns the smallest bbox enclosing both b and other. +// +// We DON'T treat a zero-value BBox as "empty" here — a caller that +// passes BBox{} to Union genuinely means "enclose the origin point". +// Use MergeBBoxes when you have a slice and want it to be a no-op on +// the empty slice. +func (b BBox) Union(other BBox) BBox { + return BBox{ + X0: math.Min(b.X0, other.X0), + Y0: math.Min(b.Y0, other.Y0), + X1: math.Max(b.X1, other.X1), + Y1: math.Max(b.Y1, other.Y1), + } +} + +// Intersect returns the overlapping rectangle of b and other, and a +// boolean reporting whether the intersection has non-empty area (i.e. +// the two bboxes actually overlap). This mirrors pdfplumber's +// get_bbox_overlap, which returns None when the boxes don't touch and +// the overlapping bbox otherwise. +// +// We treat touching-but-not-overlapping (shared edge, zero area) as +// non-overlap, matching pdfplumber's `o_height + o_width > 0` check — +// a single-line ruler that grazes a word's bbox should not be reported +// as "intersecting" the word. +func (b BBox) Intersect(other BBox) (BBox, bool) { + oLeft := math.Max(b.X0, other.X0) + oRight := math.Min(b.X1, other.X1) + oBottom := math.Max(b.Y0, other.Y0) + oTop := math.Min(b.Y1, other.Y1) + + w := oRight - oLeft + h := oTop - oBottom + // pdfplumber requires width>=0, height>=0, AND width+height>0 + // (so zero-area overlaps don't count). Matching that exactly. 
+ if w < 0 || h < 0 || w+h <= 0 { + return BBox{}, false + } + return BBox{X0: oLeft, Y0: oBottom, X1: oRight, Y1: oTop}, true +} + +// Contains reports whether b fully encloses other. Edges are +// considered inside (>= on the low side, <= on the high side), so a +// bbox contains itself. +func (b BBox) Contains(other BBox) bool { + return other.X0 >= b.X0 && + other.Y0 >= b.Y0 && + other.X1 <= b.X1 && + other.Y1 <= b.Y1 +} + +// ContainsPoint reports whether (x,y) lies inside b (inclusive on +// edges). +func (b BBox) ContainsPoint(x, y float64) bool { + return x >= b.X0 && x <= b.X1 && y >= b.Y0 && y <= b.Y1 +} + +// Snap rounds each of b's four coordinates to the nearest multiple of +// step. Used by layout-analysis code to coalesce near-equal positions +// (e.g. ruling lines drawn at 99.9, 100.0, 100.1) before clustering. +// A step of 0 returns the original bbox unchanged. +func (b BBox) Snap(step float64) BBox { + if step == 0 { + return b + } + return BBox{ + X0: math.Round(b.X0/step) * step, + Y0: math.Round(b.Y0/step) * step, + X1: math.Round(b.X1/step) * step, + Y1: math.Round(b.Y1/step) * step, + } +} + +// MergeBBoxes returns the smallest bbox enclosing every input. Empty +// input returns a zero BBox. This mirrors pdfplumber's merge_bboxes +// and objects_to_bbox helpers — the typical caller has a slice of +// Chars and wants the combined bounding box for the resulting Word. +func MergeBBoxes(bboxes []BBox) BBox { + if len(bboxes) == 0 { + return BBox{} + } + out := bboxes[0] + for _, bb := range bboxes[1:] { + out = out.Union(bb) + } + return out +} + +// BBoxOfChar returns the bounding box of a Char. +func BBoxOfChar(c Char) BBox { + return BBox{X0: c.X0, Y0: c.Y0, X1: c.X1, Y1: c.Y1} +} + +// BBoxOfChars returns the smallest bbox enclosing every char in cs. +// Returns the zero BBox for an empty slice. 
+func BBoxOfChars(cs []Char) BBox { + if len(cs) == 0 { + return BBox{} + } + out := BBoxOfChar(cs[0]) + for _, c := range cs[1:] { + out = out.Union(BBoxOfChar(c)) + } + return out +} diff --git a/geometry_test.go b/geometry_test.go new file mode 100644 index 0000000..6149a1d --- /dev/null +++ b/geometry_test.go @@ -0,0 +1,236 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable + +import ( + "math" + "testing" +) + +func TestBBoxNormalisation(t *testing.T) { + // NewBBox should always normalise so the first corner is the + // lower-left and the second corner is the upper-right. + got := NewBBox(10, 20, 5, 5) + want := BBox{X0: 5, Y0: 5, X1: 10, Y1: 20} + if got != want { + t.Errorf("NewBBox normalisation: got %+v, want %+v", got, want) + } +} + +func TestBBoxWidthHeight(t *testing.T) { + b := BBox{X0: 10, Y0: 20, X1: 30, Y1: 50} + if b.Width() != 20 { + t.Errorf("Width = %v, want 20", b.Width()) + } + if b.Height() != 30 { + t.Errorf("Height = %v, want 30", b.Height()) + } + if b.Area() != 600 { + t.Errorf("Area = %v, want 600", b.Area()) + } +} + +func TestBBoxIsZero(t *testing.T) { + if !(BBox{}).IsZero() { + t.Error("BBox{} should be zero") + } + if (BBox{X0: 1}).IsZero() { + t.Error("non-zero BBox should not be zero") + } +} + +func TestBBoxUnion(t *testing.T) { + a := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10} + b := BBox{X0: 5, Y0: 5, X1: 20, Y1: 30} + got := a.Union(b) + want := BBox{X0: 0, Y0: 0, X1: 20, Y1: 30} + if got != want { + t.Errorf("Union = %+v, want %+v", got, want) + } + + // Disjoint bboxes: union spans both. 
+ c := BBox{X0: 100, Y0: 100, X1: 110, Y1: 110} + got = a.Union(c) + want = BBox{X0: 0, Y0: 0, X1: 110, Y1: 110} + if got != want { + t.Errorf("Union disjoint = %+v, want %+v", got, want) + } +} + +func TestBBoxIntersect(t *testing.T) { + tests := []struct { + name string + a, b BBox + want BBox + overlap bool + }{ + { + name: "fully overlapping", + a: BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}, + b: BBox{X0: 5, Y0: 5, X1: 20, Y1: 20}, + want: BBox{X0: 5, Y0: 5, X1: 10, Y1: 10}, + overlap: true, + }, + { + name: "disjoint", + a: BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}, + b: BBox{X0: 20, Y0: 20, X1: 30, Y1: 30}, + overlap: false, + }, + { + name: "share a single corner point (zero w + zero h)", + a: BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}, + b: BBox{X0: 10, Y0: 10, X1: 20, Y1: 20}, + overlap: false, // pdfplumber: w+h must be > 0; point touch has 0+0 + }, + { + name: "share horizontal edge (zero h, non-zero w)", + a: BBox{X0: 0, Y0: 0, X1: 10, Y1: 10}, + b: BBox{X0: 0, Y0: 10, X1: 10, Y1: 20}, + want: BBox{X0: 0, Y0: 10, X1: 10, Y1: 10}, + overlap: true, // pdfplumber: 10+0 > 0 → counted as overlap + }, + { + name: "b fully inside a", + a: BBox{X0: 0, Y0: 0, X1: 100, Y1: 100}, + b: BBox{X0: 10, Y0: 10, X1: 20, Y1: 20}, + want: BBox{X0: 10, Y0: 10, X1: 20, Y1: 20}, + overlap: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, ok := tt.a.Intersect(tt.b) + if ok != tt.overlap { + t.Fatalf("overlap = %v, want %v", ok, tt.overlap) + } + if !ok { + return + } + if got != tt.want { + t.Errorf("Intersect = %+v, want %+v", got, tt.want) + } + }) + } +} + +func TestBBoxContains(t *testing.T) { + outer := BBox{X0: 0, Y0: 0, X1: 100, Y1: 100} + inner := BBox{X0: 10, Y0: 10, X1: 20, Y1: 20} + crossing := BBox{X0: 50, Y0: 50, X1: 150, Y1: 150} + + if !outer.Contains(inner) { + t.Error("outer should contain inner") + } + if !outer.Contains(outer) { + t.Error("bbox should contain itself") + } + if outer.Contains(crossing) { + t.Error("outer should not contain 
crossing") + } +} + +func TestBBoxContainsPoint(t *testing.T) { + b := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10} + if !b.ContainsPoint(5, 5) { + t.Error("centre point should be contained") + } + if !b.ContainsPoint(0, 0) { + t.Error("corner point should be contained (inclusive)") + } + if !b.ContainsPoint(10, 10) { + t.Error("opposite corner should be contained (inclusive)") + } + if b.ContainsPoint(11, 5) { + t.Error("outside point should not be contained") + } +} + +func TestBBoxSnap(t *testing.T) { + b := BBox{X0: 99.7, Y0: 100.2, X1: 199.4, Y1: 200.8} + got := b.Snap(1) + want := BBox{X0: 100, Y0: 100, X1: 199, Y1: 201} + if got != want { + t.Errorf("Snap(1) = %+v, want %+v", got, want) + } + + // Snap to 0.5 multiples. + got = b.Snap(0.5) + want = BBox{X0: 99.5, Y0: 100, X1: 199.5, Y1: 201} + if got != want { + t.Errorf("Snap(0.5) = %+v, want %+v", got, want) + } + + // Snap with step 0 is a no-op. + got = b.Snap(0) + if got != b { + t.Errorf("Snap(0) should be no-op, got %+v want %+v", got, b) + } +} + +func TestMergeBBoxes(t *testing.T) { + // Empty: zero BBox. + if got := MergeBBoxes(nil); !got.IsZero() { + t.Errorf("MergeBBoxes(nil) = %+v, want zero", got) + } + + // Single: same bbox. + one := BBox{X0: 1, Y0: 2, X1: 3, Y1: 4} + if got := MergeBBoxes([]BBox{one}); got != one { + t.Errorf("MergeBBoxes([one]) = %+v, want %+v", got, one) + } + + // Multiple. + bs := []BBox{ + {X0: 10, Y0: 20, X1: 30, Y1: 40}, + {X0: 5, Y0: 25, X1: 25, Y1: 35}, + {X0: 15, Y0: 10, X1: 35, Y1: 30}, + } + got := MergeBBoxes(bs) + want := BBox{X0: 5, Y0: 10, X1: 35, Y1: 40} + if got != want { + t.Errorf("MergeBBoxes = %+v, want %+v", got, want) + } +} + +func TestBBoxOfChar(t *testing.T) { + c := Char{Text: "x", X0: 1, Y0: 2, X1: 3, Y1: 4} + got := BBoxOfChar(c) + want := BBox{X0: 1, Y0: 2, X1: 3, Y1: 4} + if got != want { + t.Errorf("BBoxOfChar = %+v, want %+v", got, want) + } +} + +func TestBBoxOfChars(t *testing.T) { + // Empty. 
+ if got := BBoxOfChars(nil); !got.IsZero() { + t.Errorf("BBoxOfChars(nil) = %+v, want zero", got) + } + + cs := []Char{ + {X0: 10, Y0: 20, X1: 15, Y1: 30}, + {X0: 15, Y0: 20, X1: 25, Y1: 30}, + {X0: 25, Y0: 20, X1: 35, Y1: 30}, + } + got := BBoxOfChars(cs) + want := BBox{X0: 10, Y0: 20, X1: 35, Y1: 30} + if got != want { + t.Errorf("BBoxOfChars = %+v, want %+v", got, want) + } +} + +func TestBBoxIntersectArea(t *testing.T) { + // Sanity: small overlap area matches expected via math. + a := BBox{X0: 0, Y0: 0, X1: 10, Y1: 10} + b := BBox{X0: 8, Y0: 8, X1: 20, Y1: 20} + overlap, ok := a.Intersect(b) + if !ok { + t.Fatal("expected overlap") + } + if math.Abs(overlap.Area()-4) > 1e-9 { + t.Errorf("overlap area = %v, want 4", overlap.Area()) + } +} diff --git a/golden_test.go b/golden_test.go new file mode 100644 index 0000000..2453e81 --- /dev/null +++ b/golden_test.go @@ -0,0 +1,225 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable_test + +import ( + "encoding/json" + "fmt" + "math" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/hallelx2/pdftable" +) + +// Golden-file tests against pdfplumber output. The expected JSON files +// in testdata/golden are generated by scripts/gen_golden.py running the +// reference pdfplumber library on the same fixture PDFs. We assert +// that pdftable's output matches the algorithm-level behaviour of +// pdfplumber: word count, word text, word order, and word direction +// are exact; word positions are checked with a tolerance wide enough +// to absorb known font-metric drift (we use 1000-units-per-em default +// fallback for standard fonts whose AFM tables we don't bundle yet; +// pdfplumber uses pdfminer.six's bundled AFM metrics). See AGENT A's +// v0.0.1 report and the "Parity with pdfplumber" section of the +// README for the up-to-date list of metric differences. +// +// Tolerances: +// +// - Word count: must match EXACTLY. 
+// - Word text: must match EXACTLY (byte-equal). +// - Word direction: must match EXACTLY. +// - Word bbox: ±15 PDF points to absorb font-width drift. Tightening +// this tolerance is a v0.2.x goal once we bundle the standard-14 +// AFM metrics. +// +// extract_text() is compared as a sequence of whitespace-separated +// words (order + text), absorbing spacing differences that come from +// the same font-metric origin. + +type goldenWord struct { + Text string `json:"text"` + X0 float64 `json:"x0"` + X1 float64 `json:"x1"` + Y0 float64 `json:"y0"` + Y1 float64 `json:"y1"` + Upright bool `json:"upright"` + Direction string `json:"direction"` +} + +type goldenPage struct { + Number int `json:"number"` + Width float64 `json:"width"` + Height float64 `json:"height"` + ExtractText string `json:"extract_text"` + ExtractWords []goldenWord `json:"extract_words"` +} + +type golden struct { + Name string `json:"name"` + Pages []goldenPage `json:"pages"` +} + +// TestGoldenAgainstPdfplumber loads each fixture, runs the pdftable +// extraction, and diffs against the pre-generated pdfplumber output. +func TestGoldenAgainstPdfplumber(t *testing.T) { + dir := filepath.Join("testdata", "golden") + entries, err := os.ReadDir(dir) + if err != nil { + t.Fatalf("read golden dir: %v", err) + } + + // Find every .expected.json and run a sub-test for each. 
+ for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasSuffix(name, ".expected.json") { + continue + } + stem := strings.TrimSuffix(name, ".expected.json") + t.Run(stem, func(t *testing.T) { + runGoldenCase(t, dir, stem) + }) + } +} + +func runGoldenCase(t *testing.T, dir, stem string) { + t.Helper() + + pdfPath := filepath.Join(dir, stem+".pdf") + jsonPath := filepath.Join(dir, stem+".expected.json") + + data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read %s: %v", jsonPath, err) + } + var g golden + if err := json.Unmarshal(data, &g); err != nil { + t.Fatalf("parse %s: %v", jsonPath, err) + } + + doc, err := pdftable.OpenFile(pdfPath) + if err != nil { + t.Fatalf("OpenFile %s: %v", pdfPath, err) + } + defer doc.Close() + + if doc.NumPages() != len(g.Pages) { + t.Fatalf("page count: got %d, want %d", doc.NumPages(), len(g.Pages)) + } + + for _, expPage := range g.Pages { + p, err := doc.Page(expPage.Number) + if err != nil { + t.Fatalf("Page(%d): %v", expPage.Number, err) + } + + // Page dimensions should match exactly (these come from the + // MediaBox, not from font metrics). + if math.Abs(p.Width()-expPage.Width) > 0.01 { + t.Errorf("page %d width: got %v, want %v", expPage.Number, p.Width(), expPage.Width) + } + if math.Abs(p.Height()-expPage.Height) > 0.01 { + t.Errorf("page %d height: got %v, want %v", expPage.Number, p.Height(), expPage.Height) + } + + gotWords, err := p.Words(pdftable.DefaultWordOpts()) + if err != nil { + t.Fatalf("Words: %v", err) + } + assertGoldenWords(t, expPage.Number, gotWords, expPage.ExtractWords) + + // Dense extract_text should produce the same word *sequence* + // when split on whitespace. We don't compare byte-equal because + // pdfplumber sometimes inserts extra spaces for vertical jitter + // that our YTolerance clustering smooths over. 
+ gotText, err := p.ExtractText(pdftable.DefaultTextOpts()) + if err != nil { + t.Fatalf("ExtractText: %v", err) + } + assertWordSequence(t, expPage.Number, gotText, expPage.ExtractText) + } +} + +func assertGoldenWords(t *testing.T, page int, got []pdftable.Word, want []goldenWord) { + t.Helper() + if len(got) != len(want) { + t.Errorf("page %d: got %d words, want %d", page, len(got), len(want)) + // Print first few mismatches for debugging. + n := len(got) + if len(want) > n { + n = len(want) + } + if n > 6 { + n = 6 + } + for i := 0; i < n; i++ { + var gs, ws string + if i < len(got) { + gs = fmt.Sprintf("%q@(%.1f,%.1f)-(%.1f,%.1f)", got[i].Text, got[i].X0, got[i].Y0, got[i].X1, got[i].Y1) + } + if i < len(want) { + ws = fmt.Sprintf("%q@(%.1f,%.1f)-(%.1f,%.1f)", want[i].Text, want[i].X0, want[i].Y0, want[i].X1, want[i].Y1) + } + t.Logf(" [%d] got=%s want=%s", i, gs, ws) + } + return + } + + // Generous position tolerance to absorb font-metric drift. The + // algorithm-level outputs (text, count, order, direction) below + // are still asserted exactly; the position checks here are a + // regression guard against catastrophic mis-placement, not a + // pixel-level parity check. 
+ const posTol = 15.0 // PDF points + for i := range want { + g := got[i] + w := want[i] + if g.Text != w.Text { + t.Errorf("page %d word %d: text got %q, want %q", page, i, g.Text, w.Text) + continue + } + if math.Abs(g.X0-w.X0) > posTol { + t.Errorf("page %d word %d (%q): X0 got %v, want %v", page, i, g.Text, g.X0, w.X0) + } + if math.Abs(g.X1-w.X1) > posTol { + t.Errorf("page %d word %d (%q): X1 got %v, want %v", page, i, g.Text, g.X1, w.X1) + } + if math.Abs(g.Y0-w.Y0) > posTol { + t.Errorf("page %d word %d (%q): Y0 got %v, want %v", page, i, g.Text, g.Y0, w.Y0) + } + if math.Abs(g.Y1-w.Y1) > posTol { + t.Errorf("page %d word %d (%q): Y1 got %v, want %v", page, i, g.Text, g.Y1, w.Y1) + } + if g.Upright != w.Upright { + t.Errorf("page %d word %d (%q): Upright got %v, want %v", page, i, g.Text, g.Upright, w.Upright) + } + if g.Direction != w.Direction { + t.Errorf("page %d word %d (%q): Direction got %q, want %q", page, i, g.Text, g.Direction, w.Direction) + } + } +} + +// assertWordSequence splits both strings on whitespace and compares the +// resulting word lists. Spacing differences and newlines are absorbed. +func assertWordSequence(t *testing.T, page int, got, want string) { + t.Helper() + gotWords := strings.Fields(got) + wantWords := strings.Fields(want) + if len(gotWords) != len(wantWords) { + t.Errorf("page %d: extract_text word count: got %d %v, want %d %v", + page, len(gotWords), gotWords, len(wantWords), wantWords) + return + } + for i := range wantWords { + if gotWords[i] != wantWords[i] { + t.Errorf("page %d: extract_text[%d] got %q, want %q", + page, i, gotWords[i], wantWords[i]) + } + } +} diff --git a/page.go b/page.go index f641400..996e7db 100644 --- a/page.go +++ b/page.go @@ -66,6 +66,32 @@ type Page interface { // cheaper than calling each accessor separately because the // content stream is parsed exactly once. Objects() (Objects, error) + + // Words extracts positioned text runs from the page. 
A "word" + // is a contiguous group of chars whose horizontal gaps are + // within WordOpts.XTolerance and whose vertical positions + // agree within WordOpts.YTolerance. Pass DefaultWordOpts() to + // use pdfplumber-matching defaults. See WordOpts for the full + // configuration surface. + // + // Returns an empty slice (not nil) when the page contains no + // extractable text. + Words(opts WordOpts) ([]Word, error) + + // ExtractText returns the page's text as a single string. By + // default words on the same line are joined with a single + // space and lines are joined with "\n". When TextOpts.Layout is + // true, the output preserves spatial layout (column-aligned + // text, blank lines for vertical gaps) at the cost of more + // whitespace. Pass DefaultTextOpts() for pdfplumber-matching + // defaults. + ExtractText(opts TextOpts) (string, error) + + // ExtractTextSimple is a no-frills extraction that clusters + // chars by visual line and joins them by gap detection. Use + // when ExtractText's word-grouping heuristics produce undesired + // results on adversarial input. + ExtractTextSimple(xTolerance, yTolerance float64) (string, error) } // page is the unexported implementation backing the Page interface. diff --git a/pdftable.go b/pdftable.go index 36d1452..dc5beb2 100644 --- a/pdftable.go +++ b/pdftable.go @@ -30,11 +30,12 @@ // fmt.Printf("page %d: %d chars\n", n, len(chars)) // } // -// Phase scope: this initial release exposes the primitives. The -// higher-level operations (ExtractText, ExtractTables, FindTables, -// Words) are explicit future phases — see the README for the -// roadmap. The Page interface is designed so those methods can be -// added without breaking existing callers. +// Phase scope: v0.1.0 ships content-stream primitives plus text +// extraction (Page.Words, Page.ExtractText, Page.ExtractTextSimple). +// Table-finding (ExtractTables, FindTables) is the next phase — see +// the README for the roadmap. 
The Page interface is additive across +// releases; v0.0.1 callers using only Chars/Lines/Rects/Curves +// continue to compile against v0.1.0 without changes. package pdftable import ( diff --git a/scripts/gen_golden.py b/scripts/gen_golden.py new file mode 100644 index 0000000..3af3366 --- /dev/null +++ b/scripts/gen_golden.py @@ -0,0 +1,90 @@ +"""Generate golden-file expected outputs for pdftable's parity tests. + +Run from the repo root after copying the fixture PDFs into +testdata/golden/: + + pip install pdfplumber + python scripts/gen_golden.py + +The script reads every *.pdf in testdata/golden/, runs pdfplumber's +extract_text() and extract_words() on each page, and writes the result +as .expected.json next to the PDF. + +Coordinate-system note: pdfplumber emits word "top" and "bottom" in +image space (origin at top-left, Y growing DOWN). pdftable uses PDF +user space (origin at bottom-left, Y growing UP). We translate +pdfplumber's coords into PDF-user-space here so the JSON matches the +y0/y1 fields on pdftable.Word directly. + +To regenerate after upgrading pdfplumber, simply re-run this script. +The file outputs are deterministic and stable. 
+""" + +from __future__ import annotations + +import json +import os +import sys + +import pdfplumber + +DIR = os.path.join("testdata", "golden") + + +def main() -> int: + target = DIR if len(sys.argv) < 2 else sys.argv[1] + pdfs = sorted( + f for f in os.listdir(target) if f.endswith(".pdf") + ) + if not pdfs: + print(f"no .pdf files in {target}", file=sys.stderr) + return 1 + for fname in pdfs: + name = os.path.splitext(fname)[0] + pdf_path = os.path.join(target, fname) + out = {"name": name, "pages": []} + with pdfplumber.open(pdf_path) as pdf: + for p in pdf.pages: + page = { + "number": p.page_number, + "width": p.width, + "height": p.height, + "extract_text": p.extract_text() or "", + "extract_words": [], + } + words = p.extract_words( + x_tolerance=3, + y_tolerance=3, + keep_blank_chars=False, + use_text_flow=False, + horizontal_ltr=True, + vertical_ttb=True, + extra_attrs=None, + split_at_punctuation=False, + expand_ligatures=True, + ) + for w in words: + y1_user = p.height - w["top"] + y0_user = p.height - w["bottom"] + page["extract_words"].append( + { + "text": w["text"], + "x0": w["x0"], + "x1": w["x1"], + "y0": y0_user, + "y1": y1_user, + "upright": bool(w.get("upright", True)), + "direction": w.get("direction", "ltr"), + } + ) + out["pages"].append(page) + expected = os.path.join(target, f"{name}.expected.json") + with open(expected, "w", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=2) + nwords = sum(len(pp["extract_words"]) for pp in out["pages"]) + print(f"wrote {expected}: {len(out['pages'])} pages, {nwords} words") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/testdata/golden/hello.expected.json b/testdata/golden/hello.expected.json new file mode 100644 index 0000000..b7acfbe --- /dev/null +++ b/testdata/golden/hello.expected.json @@ -0,0 +1,31 @@ +{ + "name": "hello", + "pages": [ + { + "number": 1, + "width": 612, + "height": 792, + "extract_text": "Hello, world!", + "extract_words": [ + { + 
"text": "Hello,", + "x0": 72.0, + "x1": 102.672, + "y0": 717.516, + "y1": 729.516, + "upright": true, + "direction": "ltr" + }, + { + "text": "world!", + "x0": 106.00800000000001, + "x1": 138.01200000000003, + "y0": 717.516, + "y1": 729.516, + "upright": true, + "direction": "ltr" + } + ] + } + ] +} \ No newline at end of file diff --git a/testdata/golden/hello.pdf b/testdata/golden/hello.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4ed735fe92923fb45e83978777c3094349dfa061 GIT binary patch literal 643 zcmZWnT~ER=6n*cnxIQe=MD03Q2?-$)WHC`Afh9f=9?Dj*WUi(yivCF-{0I6+yd4t+ z`m~*U&OP@l7+xnA?6OIM@1M^v!XN}aTar!(X!`ylf%b(Hgx6MtolPE+AQ(I( F?1=}exR6e7dwVPo`>U%*rlU&#`7 zMl*T!3rquiA) z2GoD3{|~HyL!hJojSU2>qjs&u0Z1Vhv_MDLpzi>b5S!a9L9|QPc?D#X9OBjo^F)v%UBpL#pZvDOr+~-WAD{l!l@&q#ut(z#B#h zGO>C#UrPZb(!0gQT z&3ik`ncL@AHsfG|+V0qB#K7>s-X$1Qp+%x1JTD5J#Hq}|LX?M!rHJr!U7p5cCex6( zWIglIFp)D=MXJGMM?CS*@TFq}Kg3}UYp^K-Y+5!yVUz6-c9~r!F;tZv7*8^c#Y7#@ zpNWsdEg0NdwbxW5G>01sgj7%Fs>lK?jv9f5W-3@o4NheT!&T^cG(v&HfRups(0eS4 zd{v2o@hHvQ zYUVzzPu-e&k*M3$!8HzG=HfoegT{H)R%-j%;Atv2y;ZT*?@[\]^_`{|}~. SplitAtPunctuation +// terminates a word at every one of these. +const asciiPunctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" + +// isPunctRune reports whether r is in asciiPunctuation. +func isPunctRune(r rune) bool { + if r > 127 { + return false + } + return strings.ContainsRune(asciiPunctuation, r) +} + +// extractWordsFromChars is the core word-grouping algorithm. It is +// pulled out of Page.Words so it can be exercised by unit tests on +// hand-crafted Char slices without spinning up a Page. +// +// Steps (port of WordExtractor.iter_extract_tuples): +// 1. If !KeepBlankChars, drop chars whose text is whitespace. +// 2. Filter out chars with empty text (PDF glyphs that failed to +// resolve via encoding / ToUnicode). Their bbox can still inform +// layout but they shouldn't participate in text assembly. +// 3. Group chars by (upright, *extra_attrs) — exact-match grouping. +// 4. 
For each group, either keep content-stream order (UseTextFlow) +// or cluster into LINES by Y position then sort each line by X. +// 5. Within each line, walk left-to-right (or right-to-left for rtl) +// and split into words whenever the gap exceeds XTolerance or the +// char begins a new line within the cluster's tolerance band. +// 6. Apply ligature expansion (if Expand) when concatenating text. +func extractWordsFromChars(chars []Char, opts WordOpts) []Word { + if len(chars) == 0 { + return nil + } + + // Step 1/2: filter blanks (unless KeepBlankChars) and empties. + filtered := chars + if !opts.KeepBlankChars { + out := make([]Char, 0, len(chars)) + for _, c := range chars { + if c.Text == "" { + continue + } + if isAllSpace(c.Text) { + continue + } + out = append(out, c) + } + filtered = out + } else { + out := make([]Char, 0, len(chars)) + for _, c := range chars { + if c.Text == "" { + continue + } + out = append(out, c) + } + filtered = out + } + if len(filtered) == 0 { + return nil + } + + // Step 3: group by (upright, *extra_attrs). The group key is a + // string so it can be a map key. We keep the SAME ORDER as input + // — within a contiguous run of equal-key chars, all chars go in + // one group. A second equal-key run later in the slice starts a + // new group (matching itertools.groupby semantics). + keyOf := func(c Char) string { + buf := make([]byte, 0, 8+len(c.FontName)) + if c.Upright { + buf = append(buf, 'U') + } else { + buf = append(buf, 'u') + } + for _, attr := range opts.ExtraAttrs { + buf = append(buf, '\x00') + switch attr { + case "fontname": + buf = append(buf, c.FontName...) + case "size": + bits := math.Float64bits(c.FontSize) + for i := 7; i >= 0; i-- { + buf = append(buf, byte(bits>>(i*8))) + } + } + } + return string(buf) + } + groups := groupObjectsByAttr(filtered, keyOf) + + var words []Word + for _, group := range groups { + upright := group[0].Upright + + // Step 4: cluster into lines (or honour use_text_flow). 
+ var lines [][]Char + var charDir string + if opts.UseTextFlow { + charDir = directionFor(upright, opts.HorizontalLTR, opts.VerticalTTB) + lines = [][]Char{group} + } else { + lineDir := "ttb" + if !upright { + // For rotated text, pdfplumber flips line / char dir: + // line_dir_rotated defaults to char_dir, char_dir_rotated + // defaults to line_dir. So a rotated cluster's line_dir + // becomes "ltr" (left-to-right) and char_dir becomes + // "ttb" (top-to-bottom). The clustering then groups by + // x rather than y. + lineDir = "ltr" + } + charDir = directionFor(upright, opts.HorizontalLTR, opts.VerticalTTB) + + var keyForCluster func(c Char) float64 + var tol float64 + switch lineDir { + case "ttb": + // Cluster by Y1 (visual top in PDF user space). + keyForCluster = func(c Char) float64 { return -c.Y1 } + tol = opts.YTolerance + case "ltr": + keyForCluster = func(c Char) float64 { return c.X0 } + tol = opts.XTolerance + } + lines = clusterObjects(group, keyForCluster, tol, false) + + // Sort within each line by char_dir. + for i := range lines { + sortCharsByDir(lines[i], charDir) + } + } + + // Step 5: walk each line and split into words. + for _, line := range lines { + words = append(words, mergeLineIntoWords(line, charDir, opts)...) + } + } + + return words +} + +// directionFor picks the char direction for a glyph based on its +// upright flag and the HorizontalLTR / VerticalTTB toggles. +// +// upright=true, ltr=true → ltr +// upright=true, ltr=false → rtl +// upright=false, ttb=true → ttb +// upright=false, ttb=false → btt +func directionFor(upright, horizontalLTR, verticalTTB bool) string { + if upright { + if horizontalLTR { + return "ltr" + } + return "rtl" + } + if verticalTTB { + return "ttb" + } + return "btt" +} + +// sortCharsByDir sorts chars in-place by the requested reading direction. +// For "ltr" we sort ascending by X0; for "rtl" descending by X1; for "ttb" +// ascending by Y1 (visual top first in PDF coords); for "btt" ascending by Y0. 
+// +// Note that PDF user space has Y growing UP, but pdfplumber's image space +// has Y growing DOWN. pdfplumber's "ttb" sort key is `(top, bottom)` +// where "top" is the smaller y in image space (visually higher). For us, +// visually higher means LARGER Y1, so "ttb" sorts by -Y1 ascending = Y1 +// descending. We flip the sign in the comparison so the call sites read +// naturally. +func sortCharsByDir(chars []Char, dir string) { + switch dir { + case "ltr": + sort.SliceStable(chars, func(i, j int) bool { return chars[i].X0 < chars[j].X0 }) + case "rtl": + sort.SliceStable(chars, func(i, j int) bool { return chars[i].X1 > chars[j].X1 }) + case "ttb": + // Visually top-most first = largest Y1 first in PDF space. + sort.SliceStable(chars, func(i, j int) bool { return chars[i].Y1 > chars[j].Y1 }) + case "btt": + // Visually bottom-most first = smallest Y0 first. + sort.SliceStable(chars, func(i, j int) bool { return chars[i].Y0 < chars[j].Y0 }) + } +} + +// mergeLineIntoWords walks a sorted line of chars and emits words. A +// new word starts whenever the gap to the previous char exceeds +// XTolerance (for ltr/rtl) / YTolerance (for ttb/btt), the perpendicular +// distance exceeds the cross-tolerance, or a blank/punctuation char +// triggers a split. +func mergeLineIntoWords(line []Char, dir string, opts WordOpts) []Word { + if len(line) == 0 { + return nil + } + + var words []Word + var current []Char + + flush := func() { + if len(current) > 0 { + words = append(words, buildWord(current, dir, opts)) + current = nil + } + } + + for _, c := range line { + text := c.Text + // Whitespace breaks the word (we filtered earlier unless + // KeepBlankChars=true; if we're here with a space, the caller + // asked for explicit space chars and we honour the break). + if opts.KeepBlankChars && isAllSpace(text) { + flush() + continue + } + + // Punctuation: split before AND after, so the punctuation char + // becomes its own one-char word. 
+ if opts.SplitAtPunctuation && len(text) == 1 && isPunctRune(rune(text[0])) { + flush() + current = []Char{c} + flush() + continue + } + + if len(current) == 0 { + current = []Char{c} + continue + } + + if charBeginsNewWord(current[len(current)-1], c, dir, opts) { + flush() + current = []Char{c} + } else { + current = append(current, c) + } + } + flush() + + return words +} + +// charBeginsNewWord is the Go port of WordExtractor.char_begins_new_word. +// Returns true if curr is far enough from prev to start a new word. +// +// pdfplumber's check has two parts: +// - INTRALINE: gap between previous char's TRAILING edge and current +// char's LEADING edge exceeds XTolerance. (Or the current char +// overlaps backwards — cx < ax.) +// - INTERLINE: chars within the same cluster but on visually +// different lines (|cy - ay| > YTolerance). +// +// We map pdfplumber's image-space y to our user-space Y1 (visual top). +// In pdfplumber "top" decreases as you go down the page; in PDF user +// space Y1 decreases as you go down the page too (Y grows up, so the +// "top" Y1 of a lower char is smaller). So the |cy - ay| > y check +// uses Y1 directly. +func charBeginsNewWord(prev, curr Char, dir string, opts WordOpts) bool { + var ax, bx, cx float64 // intraline (along reading direction) + var ay, cy float64 // interline (perpendicular) + var xTol, yTol float64 + + switch dir { + case "ltr": + ax = prev.X0 + bx = prev.X1 + cx = curr.X0 + ay = prev.Y1 // visual top + cy = curr.Y1 + xTol = opts.XTolerance + yTol = opts.YTolerance + case "rtl": + ax = -prev.X1 + bx = -prev.X0 + cx = -curr.X1 + ay = prev.Y1 + cy = curr.Y1 + xTol = opts.XTolerance + yTol = opts.YTolerance + case "ttb": + // Reading top-to-bottom: along-direction is Y (descending), + // perpendicular is X. Intraline gap measured from prev's + // BOTTOM (smaller Y1) to curr's TOP (larger Y1 of curr is + // AHEAD of prev in image space → we invert by negating). 
+ ax = -prev.Y1 + bx = -prev.Y0 + cx = -curr.Y1 + ay = prev.X0 + cy = curr.X0 + xTol = opts.YTolerance + yTol = opts.XTolerance + case "btt": + ax = prev.Y0 + bx = prev.Y1 + cx = curr.Y0 + ay = prev.X0 + cy = curr.X0 + xTol = opts.YTolerance + yTol = opts.XTolerance + default: + // Unknown direction — default to ltr behaviour. + ax = prev.X0 + bx = prev.X1 + cx = curr.X0 + ay = prev.Y1 + cy = curr.Y1 + xTol = opts.XTolerance + yTol = opts.YTolerance + } + + intraline := cx < ax || cx > bx+xTol + interline := math.Abs(cy-ay) > yTol + return intraline || interline +} + +// buildWord assembles a Word from the chars that should join it. Text +// is concatenated with ligature expansion if Expand=true. Bbox is the +// union of all char bboxes. FontName/FontSize/Upright are copied from +// the first char. +func buildWord(chars []Char, dir string, opts WordOpts) Word { + var sb strings.Builder + sb.Grow(len(chars)) + for _, c := range chars { + if opts.Expand { + sb.WriteString(expandLigatures(c.Text)) + } else { + sb.WriteString(c.Text) + } + } + bbox := BBoxOfChars(chars) + w := Word{ + Text: sb.String(), + X0: bbox.X0, + Y0: bbox.Y0, + X1: bbox.X1, + Y1: bbox.Y1, + Upright: chars[0].Upright, + Direction: dir, + FontName: chars[0].FontName, + FontSize: chars[0].FontSize, + } + if opts.KeepChars { + copyChars := make([]Char, len(chars)) + copy(copyChars, chars) + w.Chars = copyChars + } + return w +} + +// isAllSpace returns true if every rune in s is whitespace (matching +// Python's str.isspace()). An empty string returns false — same as +// Python's "".isspace() == False. +func isAllSpace(s string) bool { + if s == "" { + return false + } + for _, r := range s { + if !unicode.IsSpace(r) { + return false + } + } + return true +} + +// extractTextFromChars implements the dense (non-layout) text-extraction +// path: words → cluster into lines → join words with spaces → join lines +// with newlines. 
+func extractTextFromChars(chars []Char, opts TextOpts) string { + if len(chars) == 0 { + return "" + } + words := extractWordsFromChars(chars, textOptsToWordOpts(opts)) + if len(words) == 0 { + return "" + } + + // Cluster words into lines by visual top (Y1 in PDF coords). + lines := clusterObjects(words, func(w Word) float64 { return -w.Y1 }, opts.YTolerance, false) + + // Within each line, sort by X0 ascending (ltr) or X1 descending (rtl). + dir := "ltr" + if !opts.HorizontalLTR { + dir = "rtl" + } + for i := range lines { + sortWordsByDir(lines[i], dir) + } + + var sb strings.Builder + for i, line := range lines { + if i > 0 { + sb.WriteByte('\n') + } + for j, w := range line { + if j > 0 { + sb.WriteByte(' ') + } + sb.WriteString(w.Text) + } + } + return sb.String() +} + +// sortWordsByDir is the Word equivalent of sortCharsByDir. Only +// horizontal directions are supported in the dense path (rotated text +// extraction falls back to per-word direction inside the chars). +func sortWordsByDir(words []Word, dir string) { + switch dir { + case "ltr": + sort.SliceStable(words, func(i, j int) bool { return words[i].X0 < words[j].X0 }) + case "rtl": + sort.SliceStable(words, func(i, j int) bool { return words[i].X1 > words[j].X1 }) + } +} + +// extractTextWithLayout implements the layout-preserving path. It +// builds a fixed-width grid of characters where each glyph's column +// is proportional to its X0 (divided by XDensity) and each line's row +// is proportional to its Y1 (divided by YDensity). +// +// The output approximates what `pdftotext -layout` or pdfplumber's +// `extract_text(layout=True)` would produce — useful for callers that +// want to feed structured text to a downstream layout-aware consumer +// (form scrapers, LLM prompts that benefit from preserved indentation). +// +// We DON'T attempt the per-column-cell expansion that pdfplumber does +// for non-ttb / non-ltr text. 
The simple horizontal-ltr path is the +// common case and covers >95% of real PDFs. +func extractTextWithLayout(chars []Char, pageWidth, pageHeight float64, opts TextOpts) string { + if len(chars) == 0 { + return "" + } + + words := extractWordsFromChars(chars, textOptsToWordOpts(opts)) + if len(words) == 0 { + return "" + } + + // Determine grid dimensions. + widthChars := opts.LayoutWidthChars + heightChars := opts.LayoutHeightChars + if widthChars == 0 { + widthChars = int(math.Round(pageWidth / opts.XDensity)) + } + if heightChars == 0 { + heightChars = int(math.Round(pageHeight / opts.YDensity)) + } + if widthChars < 1 { + widthChars = 1 + } + if heightChars < 1 { + heightChars = 1 + } + + // Cluster words by visual top (matching dense path) so we know + // which words share a line. + lines := clusterObjects(words, func(w Word) float64 { return -w.Y1 }, opts.YTolerance, false) + + // Determine the page-space y of each line's top: largest Y1 in + // the cluster wins (so layout indentation is calibrated against + // the visually-top edge of the line). + type lineInfo struct { + topY float64 + words []Word + } + infos := make([]lineInfo, len(lines)) + for i, line := range lines { + // Sort line ltr; cluster ordering means this is already mostly + // in order but be explicit. + dir := "ltr" + if !opts.HorizontalLTR { + dir = "rtl" + } + sortWordsByDir(line, dir) + + topY := line[0].Y1 + for _, w := range line[1:] { + if w.Y1 > topY { + topY = w.Y1 + } + } + infos[i] = lineInfo{topY: topY, words: line} + } + + // PDF user space has Y growing UP, so "top of page" = largest Y. + // The first line in reading order is the one with the largest + // topY; lines lower on the page have smaller topY. cluster + // ordering is ascending key (-Y1 ascending = Y1 descending = top- + // to-bottom reading order), so infos is already in reading order. + + // Lay each line into a row of widthChars columns, calibrating + // each word's column = round(X0 / XDensity). 
+ // Lay each line's row at row = round((pageTopY - line.topY) / YDensity). + pageTopY := pageHeight + rows := make([][]rune, heightChars) + for i := range rows { + rows[i] = make([]rune, widthChars) + for j := range rows[i] { + rows[i][j] = ' ' + } + } + + for _, info := range infos { + row := int(math.Round((pageTopY - info.topY) / opts.YDensity)) + if row < 0 { + row = 0 + } + if row >= heightChars { + // Out-of-range row: extend the rows slice rather than drop + // the text, since heightChars is heuristic. + for r := heightChars; r <= row; r++ { + blank := make([]rune, widthChars) + for j := range blank { + blank[j] = ' ' + } + rows = append(rows, blank) + } + heightChars = row + 1 + } + + for _, w := range info.words { + col := int(math.Round(w.X0 / opts.XDensity)) + if col < 0 { + col = 0 + } + for _, r := range w.Text { + if col >= widthChars { + // Extend the row to fit overflow text. We do this + // across ALL rows to keep the grid rectangular. + oldWidth := widthChars + for ; widthChars <= col; widthChars++ { + } + if widthChars > oldWidth { + for ri := range rows { + ext := make([]rune, widthChars-oldWidth) + for j := range ext { + ext[j] = ' ' + } + rows[ri] = append(rows[ri], ext...) + } + } + } + if col < widthChars { + rows[row][col] = r + } + col++ + } + // Insert a separator space if there's room — but only if + // the next position isn't already non-blank. + if col < widthChars && rows[row][col] == ' ' { + rows[row][col] = ' ' + } + } + } + + // Trim trailing spaces on each row, then join. + var sb strings.Builder + for i, r := range rows { + if i > 0 { + sb.WriteByte('\n') + } + end := len(r) + for end > 0 && r[end-1] == ' ' { + end-- + } + for j := 0; j < end; j++ { + sb.WriteRune(r[j]) + } + } + return sb.String() +} + +// textOptsToWordOpts converts a TextOpts into the WordOpts shape used +// by the word extractor. 
+func textOptsToWordOpts(t TextOpts) WordOpts { + return WordOpts{ + XTolerance: nonZero(t.XTolerance, 3), + YTolerance: nonZero(t.YTolerance, 3), + UseTextFlow: t.UseTextFlow, + HorizontalLTR: t.HorizontalLTR, + VerticalTTB: t.VerticalTTB, + ExtraAttrs: t.ExtraAttrs, + Expand: t.Expand, + } +} + +func nonZero(v, dflt float64) float64 { + if v == 0 { + return dflt + } + return v +} + +// Page-level entry points. We define them as methods on the page +// struct (the unexported implementation of the Page interface). The +// new methods get added to the Page interface in page.go. + +// Words extracts words from the page using the supplied options. +// Float fields left at their zero value get replaced with pdfplumber- +// matching defaults (XTolerance=3, YTolerance=3). Pass +// DefaultWordOpts() for the explicit default set. +func (p *page) Words(opts WordOpts) ([]Word, error) { + opts = applyWordOptDefaults(opts) + chars, err := p.Chars() + if err != nil { + return nil, err + } + return extractWordsFromChars(chars, opts), nil +} + +// ExtractText extracts the text of the page as a single string. See +// TextOpts for the layout / non-layout split. +func (p *page) ExtractText(opts TextOpts) (string, error) { + opts = applyTextOptDefaults(opts) + chars, err := p.Chars() + if err != nil { + return "", err + } + if opts.Layout { + return extractTextWithLayout(chars, p.Width(), p.Height(), opts), nil + } + return extractTextFromChars(chars, opts), nil +} + +// applyWordOptDefaults fills in zero-valued float fields with +// pdfplumber-matching defaults. The presence-or-absence semantics for +// bool fields are caller-defined (a Go zero-value bool is false, which +// matches pdfplumber's default for all bool kwargs). +func applyWordOptDefaults(opts WordOpts) WordOpts { + if opts.XTolerance == 0 { + opts.XTolerance = 3 + } + if opts.YTolerance == 0 { + opts.YTolerance = 3 + } + return opts +} + +// applyTextOptDefaults is the TextOpts analogue. 
+func applyTextOptDefaults(opts TextOpts) TextOpts { + if opts.XTolerance == 0 { + opts.XTolerance = 3 + } + if opts.YTolerance == 0 { + opts.YTolerance = 3 + } + if opts.XDensity == 0 { + opts.XDensity = 7.25 + } + if opts.YDensity == 0 { + opts.YDensity = 13 + } + return opts +} + +// ExtractTextSimple is the dead-simple text-extraction primitive that +// just clusters chars by line and joins them with single spaces / new +// lines. It ports pdfplumber's extract_text_simple — useful as a +// baseline when ExtractText's word-grouping heuristics produce +// undesired results on adversarial input. +func (p *page) ExtractTextSimple(xTolerance, yTolerance float64) (string, error) { + if xTolerance == 0 { + xTolerance = 3 + } + if yTolerance == 0 { + yTolerance = 3 + } + chars, err := p.Chars() + if err != nil { + return "", err + } + if len(chars) == 0 { + return "", nil + } + + // Drop empty-text chars (failed glyph resolution) — they have no + // printable representation. + filtered := chars[:0:0] + for _, c := range chars { + if c.Text != "" { + filtered = append(filtered, c) + } + } + + clustered := clusterObjects(filtered, func(c Char) float64 { return -c.Y1 }, yTolerance, false) + var sb strings.Builder + for i, line := range clustered { + if i > 0 { + sb.WriteByte('\n') + } + // Sort by X0 ascending and merge with " " between non-adjacent + // runs (gap > xTolerance). + sort.SliceStable(line, func(a, b int) bool { return line[a].X0 < line[b].X0 }) + lastX1 := math.Inf(-1) + for _, c := range line { + if !math.IsInf(lastX1, -1) && c.X0 > lastX1+xTolerance { + sb.WriteByte(' ') + } + lastX1 = c.X1 + if c.Text == " " { + // pdfplumber's collate_line drops spaces (they're + // implied by the gap detector). We do the same. 
+ continue + } + sb.WriteString(c.Text) + } + } + return sb.String(), nil +} diff --git a/text_test.go b/text_test.go new file mode 100644 index 0000000..6afdad2 --- /dev/null +++ b/text_test.go @@ -0,0 +1,495 @@ +// Copyright (c) 2026 Halleluyah Oludele +// Licensed under the MIT License. + +package pdftable + +import ( + "math" + "strings" + "testing" +) + +// TestExtractWordsBasic builds a hand-crafted slice of Chars for two +// words on one line and checks that the word grouper: +// 1. produces exactly two words, +// 2. keeps them in left-to-right order, +// 3. concatenates the right text, +// 4. unions the bbox correctly. +// +// We bypass the PDF pipeline (Page.Chars) and feed the algorithm +// directly so the test stays deterministic regardless of font metrics. +func TestExtractWordsBasic(t *testing.T) { + chars := []Char{ + {Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "i", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + // Big gap after "Hi" → new word. 
+ {Text: "T", X0: 50, Y0: 100, X1: 58, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "h", X0: 58, Y0: 100, X1: 64, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "e", X0: 64, Y0: 100, X1: 70, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "r", X0: 70, Y0: 100, X1: 73, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "e", X0: 73, Y0: 100, X1: 79, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + } + + opts := DefaultWordOpts() + words := extractWordsFromChars(chars, opts) + + if len(words) != 2 { + t.Fatalf("got %d words, want 2", len(words)) + } + if words[0].Text != "Hi" { + t.Errorf("word 0 = %q, want %q", words[0].Text, "Hi") + } + if words[1].Text != "There" { + t.Errorf("word 1 = %q, want %q", words[1].Text, "There") + } + if !approxFloat(words[0].X0, 10, 0.01) || !approxFloat(words[0].X1, 21, 0.01) { + t.Errorf("word 0 bbox X = (%v, %v), want (10, 21)", words[0].X0, words[0].X1) + } + if words[0].Direction != "ltr" { + t.Errorf("word 0 direction = %q, want ltr", words[0].Direction) + } +} + +func TestExtractWordsMultipleLines(t *testing.T) { + // Two lines, two words each. Y values are widely separated. 
+ chars := []Char{ + // Line 1: "Hello world" at Y~100 + {Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "e", X0: 18, Y0: 100, X1: 24, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "l", X0: 24, Y0: 100, X1: 27, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "l", X0: 27, Y0: 100, X1: 30, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "o", X0: 30, Y0: 100, X1: 36, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "w", X0: 50, Y0: 100, X1: 60, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "o", X0: 60, Y0: 100, X1: 66, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "r", X0: 66, Y0: 100, X1: 69, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "l", X0: 69, Y0: 100, X1: 72, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + {Text: "d", X0: 72, Y0: 100, X1: 78, Y1: 112, Upright: true, FontName: "F", FontSize: 12}, + // Line 2: "Foo bar" at Y~80 + {Text: "F", X0: 10, Y0: 80, X1: 18, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + {Text: "o", X0: 18, Y0: 80, X1: 24, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + {Text: "o", X0: 24, Y0: 80, X1: 30, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + {Text: "b", X0: 40, Y0: 80, X1: 46, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + {Text: "a", X0: 46, Y0: 80, X1: 52, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + {Text: "r", X0: 52, Y0: 80, X1: 55, Y1: 92, Upright: true, FontName: "F", FontSize: 12}, + } + + words := extractWordsFromChars(chars, DefaultWordOpts()) + got := make([]string, len(words)) + for i, w := range words { + got[i] = w.Text + } + want := []string{"Hello", "world", "Foo", "bar"} + if len(got) != len(want) { + t.Fatalf("got %d words %v, want %d %v", len(got), got, len(want), want) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("word %d = %q, want %q", i, got[i], want[i]) + } + } +} + +func 
TestExtractWordsBlankChars(t *testing.T) { + // Explicit space char between two words — by default it should be + // dropped (KeepBlankChars=false). + chars := []Char{ + {Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true}, + {Text: " ", X0: 18, Y0: 100, X1: 22, Y1: 112, Upright: true}, + {Text: "B", X0: 22, Y0: 100, X1: 30, Y1: 112, Upright: true}, + } + opts := DefaultWordOpts() + words := extractWordsFromChars(chars, opts) + + // "A" and "B" are 4pt apart (18→22), which is > XTolerance(3), so + // they're separate words. Without the space char, just two words. + if len(words) != 2 { + t.Fatalf("got %d words, want 2: %+v", len(words), words) + } + + // With KeepBlankChars=true the algorithm sees the space and splits. + opts.KeepBlankChars = true + words = extractWordsFromChars(chars, opts) + if len(words) != 2 { + t.Fatalf("got %d words with KeepBlankChars, want 2", len(words)) + } +} + +func TestExtractWordsLigatureExpansion(t *testing.T) { + // A ligature char (fi U+FB01) followed by "le" should expand to + // "file" when Expand=true. + chars := []Char{ + {Text: "fi", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true}, + {Text: "l", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true}, + {Text: "e", X0: 21, Y0: 100, X1: 27, Y1: 112, Upright: true}, + } + opts := DefaultWordOpts() + words := extractWordsFromChars(chars, opts) + if len(words) != 1 { + t.Fatalf("got %d words, want 1", len(words)) + } + if words[0].Text != "file" { + t.Errorf("got %q, want %q", words[0].Text, "file") + } + + opts.Expand = false + words = extractWordsFromChars(chars, opts) + if words[0].Text != "file" { + t.Errorf("got %q, want %q (no expansion)", words[0].Text, "file") + } +} + +func TestExtractWordsSplitAtPunctuation(t *testing.T) { + // "ABC,DEF" with SplitAtPunctuation=true → three words: ABC , DEF. 
+ chars := []Char{ + {Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true}, + {Text: "B", X0: 18, Y0: 100, X1: 26, Y1: 112, Upright: true}, + {Text: "C", X0: 26, Y0: 100, X1: 34, Y1: 112, Upright: true}, + {Text: ",", X0: 34, Y0: 100, X1: 38, Y1: 112, Upright: true}, + {Text: "D", X0: 38, Y0: 100, X1: 46, Y1: 112, Upright: true}, + {Text: "E", X0: 46, Y0: 100, X1: 54, Y1: 112, Upright: true}, + {Text: "F", X0: 54, Y0: 100, X1: 62, Y1: 112, Upright: true}, + } + opts := DefaultWordOpts() + opts.SplitAtPunctuation = true + words := extractWordsFromChars(chars, opts) + wantTexts := []string{"ABC", ",", "DEF"} + if len(words) != len(wantTexts) { + t.Fatalf("got %d words, want %d", len(words), len(wantTexts)) + } + for i, w := range words { + if w.Text != wantTexts[i] { + t.Errorf("word %d = %q, want %q", i, w.Text, wantTexts[i]) + } + } +} + +func TestExtractWordsKeepChars(t *testing.T) { + chars := []Char{ + {Text: "A", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true}, + {Text: "B", X0: 18, Y0: 100, X1: 26, Y1: 112, Upright: true}, + } + opts := DefaultWordOpts() + opts.KeepChars = true + words := extractWordsFromChars(chars, opts) + if len(words) != 1 || len(words[0].Chars) != 2 { + t.Fatalf("got %d words, %d chars; want 1 word with 2 chars", len(words), len(words[0].Chars)) + } + + // And when KeepChars=false, Chars should be nil. 
+ opts.KeepChars = false + words = extractWordsFromChars(chars, opts) + if words[0].Chars != nil { + t.Errorf("Chars should be nil when KeepChars=false, got %v", words[0].Chars) + } +} + +func TestExtractTextDense(t *testing.T) { + chars := []Char{ + {Text: "H", X0: 10, Y0: 100, X1: 18, Y1: 112, Upright: true}, + {Text: "i", X0: 18, Y0: 100, X1: 21, Y1: 112, Upright: true}, + {Text: "T", X0: 50, Y0: 100, X1: 58, Y1: 112, Upright: true}, + {Text: "h", X0: 58, Y0: 100, X1: 64, Y1: 112, Upright: true}, + {Text: "e", X0: 64, Y0: 100, X1: 70, Y1: 112, Upright: true}, + {Text: "r", X0: 70, Y0: 100, X1: 73, Y1: 112, Upright: true}, + {Text: "e", X0: 73, Y0: 100, X1: 79, Y1: 112, Upright: true}, + // Second line. + {Text: "Y", X0: 10, Y0: 80, X1: 18, Y1: 92, Upright: true}, + } + opts := DefaultTextOpts() + got := extractTextFromChars(chars, opts) + want := "Hi There\nY" + if got != want { + t.Errorf("got %q, want %q", got, want) + } +} + +func TestExtractTextEmpty(t *testing.T) { + opts := DefaultTextOpts() + if got := extractTextFromChars(nil, opts); got != "" { + t.Errorf("got %q, want empty", got) + } +} + +func TestExtractTextLayoutPreservesIndentation(t *testing.T) { + // Two lines with different left-indents on a 612x792 page (US + // letter). After layout=true, the second line should appear + // indented relative to the first. + chars := []Char{ + // Line 1 starts near the left margin, ~x=72. + {Text: "A", X0: 72, Y0: 700, X1: 80, Y1: 712, Upright: true}, + {Text: "B", X0: 80, Y0: 700, X1: 88, Y1: 712, Upright: true}, + // Line 2 deeply indented, ~x=200. 
+ {Text: "C", X0: 200, Y0: 680, X1: 208, Y1: 692, Upright: true}, + {Text: "D", X0: 208, Y0: 680, X1: 216, Y1: 692, Upright: true}, + } + opts := DefaultTextOpts() + opts.Layout = true + opts.LayoutWidthChars = 80 + opts.LayoutHeightChars = 60 + out := extractTextWithLayout(chars, 612, 792, opts) + if !strings.Contains(out, "AB") { + t.Errorf("layout text missing AB run: %q", out) + } + if !strings.Contains(out, "CD") { + t.Errorf("layout text missing CD run: %q", out) + } + // Find the lines: AB should be at a column < CD's column. + lines := strings.Split(out, "\n") + var abCol, cdCol int = -1, -1 + for _, l := range lines { + if i := strings.Index(l, "AB"); i >= 0 && abCol < 0 { + abCol = i + } + if i := strings.Index(l, "CD"); i >= 0 { + cdCol = i + } + } + if abCol < 0 || cdCol < 0 { + t.Fatalf("expected to find AB and CD on lines: %q", out) + } + if cdCol <= abCol { + t.Errorf("CD column (%d) should be > AB column (%d): %q", cdCol, abCol, out) + } +} + +func TestPageExtractTextLayout(t *testing.T) { + doc, err := openHelloWorldDoc() + if err != nil { + t.Fatalf("open: %v", err) + } + defer doc.Close() + p, _ := doc.Page(1) + + opts := DefaultTextOpts() + opts.Layout = true + got, err := p.ExtractText(opts) + if err != nil { + t.Fatalf("ExtractText layout: %v", err) + } + if !strings.Contains(got, "Hello") || !strings.Contains(got, "world") { + t.Errorf("layout text missing expected substrings: %q", got) + } +} + +// TestPageWordsHelloWorld walks the Hello-world fixture through the +// public Page.Words API and asserts one word matching "Hello," is +// followed by one matching "world!". 
+func TestPageWordsHelloWorld(t *testing.T) { + doc, err := openHelloWorldDoc() + if err != nil { + t.Fatalf("open hello: %v", err) + } + defer doc.Close() + p, _ := doc.Page(1) + + words, err := p.Words(DefaultWordOpts()) + if err != nil { + t.Fatalf("Words: %v", err) + } + if len(words) != 2 { + t.Fatalf("got %d words, want 2: %+v", len(words), words) + } + if words[0].Text != "Hello," || words[1].Text != "world!" { + t.Errorf("got %q, %q; want %q, %q", + words[0].Text, words[1].Text, "Hello,", "world!") + } + // Words inherit font metadata from their first char. + if words[0].FontName != "Helvetica" { + t.Errorf("FontName = %q, want Helvetica", words[0].FontName) + } + if words[0].FontSize != 12 { + t.Errorf("FontSize = %v, want 12", words[0].FontSize) + } + if words[0].Direction != "ltr" { + t.Errorf("Direction = %q, want ltr", words[0].Direction) + } +} + +func TestPageExtractTextHelloWorld(t *testing.T) { + doc, err := openHelloWorldDoc() + if err != nil { + t.Fatalf("open: %v", err) + } + defer doc.Close() + p, _ := doc.Page(1) + + got, err := p.ExtractText(DefaultTextOpts()) + if err != nil { + t.Fatalf("ExtractText: %v", err) + } + if got != "Hello, world!" 
{ + t.Errorf("got %q, want %q", got, "Hello, world!") + } +} + +func TestPageExtractTextSimple(t *testing.T) { + doc, err := openHelloWorldDoc() + if err != nil { + t.Fatalf("open: %v", err) + } + defer doc.Close() + p, _ := doc.Page(1) + + got, err := p.ExtractTextSimple(3, 3) + if err != nil { + t.Fatalf("ExtractTextSimple: %v", err) + } + if !strings.Contains(got, "Hello") || !strings.Contains(got, "world") { + t.Errorf("ExtractTextSimple result = %q, missing expected substrings", got) + } +} + +func TestDirectionFor(t *testing.T) { + tests := []struct { + upright, ltr, ttb bool + want string + }{ + {true, true, true, "ltr"}, + {true, false, true, "rtl"}, + {false, true, true, "ttb"}, + {false, true, false, "btt"}, + } + for _, tt := range tests { + got := directionFor(tt.upright, tt.ltr, tt.ttb) + if got != tt.want { + t.Errorf("directionFor(%v,%v,%v) = %q, want %q", + tt.upright, tt.ltr, tt.ttb, got, tt.want) + } + } +} + +func TestDefaultOpts(t *testing.T) { + w := DefaultWordOpts() + if w.XTolerance != 3 || w.YTolerance != 3 { + t.Errorf("DefaultWordOpts tolerances = (%v,%v), want (3,3)", w.XTolerance, w.YTolerance) + } + if !w.HorizontalLTR || !w.VerticalTTB || !w.Expand { + t.Errorf("DefaultWordOpts bool flags wrong: %+v", w) + } + + x := DefaultTextOpts() + if x.XDensity != 7.25 || x.YDensity != 13 { + t.Errorf("DefaultTextOpts densities = (%v,%v), want (7.25, 13)", x.XDensity, x.YDensity) + } +} + +func TestIsAllSpace(t *testing.T) { + cases := map[string]bool{ + "": false, // matches Python's "".isspace() + " ": true, + "\t": true, + "a": false, + " a ": false, + "\n\r": true, + } + for in, want := range cases { + got := isAllSpace(in) + if got != want { + t.Errorf("isAllSpace(%q) = %v, want %v", in, got, want) + } + } +} + +func TestExpandLigatures(t *testing.T) { + cases := map[string]string{ + "ff": "ff", + "fi": "fi", + "fl": "fl", + "ffi": "ffi", + "ffl": "ffl", + "ſt": "st", + "st": "st", + "A": "A", // pass-through + "": "", // pass-through + } + 
for in, want := range cases { + if got := expandLigatures(in); got != want { + t.Errorf("expandLigatures(%q) = %q, want %q", in, got, want) + } + } +} + +// openHelloWorldDoc opens the testdata-built hello PDF. Pulled into a +// helper so multiple tests can share the setup without copying the +// boilerplate. +func openHelloWorldDoc() (Document, error) { + // We import testdata indirectly via Open: that means this test + // can't live in the _test package because testdata is in a sub- + // directory and we're already in the pdftable package. Build a + // dependency-free fixture inline instead — same structure as + // testdata.Hello(). + return OpenBytes(helloBytes()) +} + +// helloBytes returns the same PDF as testdata.Hello(), inlined so this +// _test.go file inside the pdftable package doesn't pull in a +// dependency on testdata. (testdata/fixtures.go is in a sub-package and +// importing it here would cause a cycle.) +func helloBytes() []byte { + return buildSinglePageBytes(`BT +/F1 12 Tf +72 720 Td +(Hello, world!) Tj +ET +`) +} + +// buildSinglePageBytes builds a minimal single-page PDF whose content +// stream is the given text. Same layout as testdata.BuildSinglePage() +// but inlined to avoid the import cycle. 
//
// Five objects are emitted (catalog, page tree, page, Helvetica font,
// content stream), followed by an xref table whose entries are the byte
// offsets recorded while the objects were written, and a trailer whose
// startxref value is the offset of the xref table.
func buildSinglePageBytes(content string) []byte {
	const header = "%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
	objects := []string{
		`<< /Type /Catalog /Pages 2 0 R >>`,
		`<< /Type /Pages /Kids [3 0 R] /Count 1 >>`,
		`<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> /ProcSet [/PDF /Text] >> /Contents 5 0 R >>`,
		`<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>`,
		// /Length is the byte length of the stream body.
		"<< /Length " + itoa(len(content)) + " >>\nstream\n" + content + "endstream",
	}

	var sb strings.Builder
	sb.WriteString(header)

	// Record where each object starts; object numbers are 1-based.
	offsets := make([]int, len(objects))
	for i, body := range objects {
		offsets[i] = sb.Len()
		sb.WriteString(itoa(i+1) + " 0 obj\n" + body + "\nendobj\n")
	}

	// The xref section has one entry per object plus the free entry 0.
	size := itoa(len(objects) + 1)
	xrefPos := sb.Len()
	sb.WriteString("xref\n0 " + size + "\n")
	sb.WriteString("0000000000 65535 f \n")
	for _, off := range offsets {
		sb.WriteString(pad10(off) + " 00000 n \n")
	}
	sb.WriteString("trailer\n")
	sb.WriteString("<< /Size " + size + " /Root 1 0 R >>\n")
	sb.WriteString("startxref\n" + itoa(xrefPos) + "\n%%EOF\n")
	return []byte(sb.String())
}

// itoa formats n in base 10. Negative values get a leading '-'.
func itoa(n int) string {
	if n == 0 {
		return "0"
	}
	// Work on the magnitude as an unsigned value so the most negative
	// int does not overflow when negated.
	mag := uint(n)
	if n < 0 {
		mag = -mag
	}
	var buf [20]byte // fits the 19 digits of a 64-bit int plus a sign
	i := len(buf)
	for mag > 0 {
		i--
		buf[i] = byte('0' + mag%10)
		mag /= 10
	}
	if n < 0 {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}

// pad10 formats n as decimal, left-padded with zeros to 10 characters,
// the width of an xref offset field. Values that already need 10 or
// more characters are returned unpadded. n is expected to be
// non-negative.
func pad10(n int) string {
	s := itoa(n)
	if len(s) >= 10 {
		return s
	}
	return strings.Repeat("0", 10-len(s)) + s
}

// approxFloat reports whether a and b differ by at most tol.
func approxFloat(a, b, tol float64) bool {
	return math.Abs(a-b) <= tol
}