From ee7c58f6999c562b82146a2d7d285aca421865b6 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 30 May 2026 14:33:51 -0700
Subject: [PATCH 1/6] dedup_key: add tableShareKey for post-embed smallTable
 identity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tableShareKey, a stable identifier for a smallTable's "share
group": the set of smallTable values whose slice headers point at
the same backing arrays. After smallTable is embedded into faState
by value, two states that hold copies of one source smallTable still
share the underlying steps/ceilings/epsilons backing arrays, and a
key built from SliceData(steps) + len(steps) captures that
equivalence (only the steps slice is inspected). The next commit
switches the dedup key from *smallTable pointer identity to this key.

No behavioral change yet; this commit only adds the helper and unit
tests.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dedup_key.go      | 26 +++++++++++++++++++++
 dedup_key_test.go | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 dedup_key.go
 create mode 100644 dedup_key_test.go

diff --git a/dedup_key.go b/dedup_key.go
new file mode 100644
index 0000000..aaa830e
--- /dev/null
+++ b/dedup_key.go
@@ -0,0 +1,26 @@
+package quamina
+
+import "unsafe"
+
+// tableShareKey returns a stable identifier for a smallTable's "share group".
+// Two states whose smallTables hold slice-headers pointing at the same `steps`
+// backing array (which is what happens when one smallTable struct value is
+// copied into multiple faStates during construction) will produce equal
+// keys. This replaces *smallTable-pointer identity as the dedup key in
+// epsilon-closure computation after smallTable is embedded into faState
+// by value.
+//
+// A zero key (nil pointer, len 0) means "no share group". Every table with
+// no byte transitions (nil steps) yields it, so such tables compare equal;
+// callers that must not conflate them should skip the zero key.
+type tableShareKey struct {
+	stepsData unsafe.Pointer
+	stepsLen  int
+}
+
+func newTableShareKey(t *smallTable) tableShareKey {
+	return tableShareKey{
+		stepsData: unsafe.Pointer(unsafe.SliceData(t.steps)),
+		stepsLen:  len(t.steps),
+	}
+}
diff --git a/dedup_key_test.go b/dedup_key_test.go
new file mode 100644
index 0000000..7ba95b5
--- /dev/null
+++ b/dedup_key_test.go
@@ -0,0 +1,57 @@
+package quamina
+
+import (
+	"testing"
+)
+
+func TestTableShareKey_SharedBackings(t *testing.T) {
+	// Construct one smallTable, value-copy it (simulating post-embed share).
+	src := smallTable{
+		ceilings: []byte{'a', 'b', byte(byteCeiling)},
+		steps:    []*faState{nil, nil, nil},
+	}
+	copy1 := src
+	copy2 := src
+	if newTableShareKey(&copy1) != newTableShareKey(&copy2) {
+		t.Errorf("value-copied tables should share key; got %v vs %v",
+			newTableShareKey(&copy1), newTableShareKey(&copy2))
+	}
+}
+
+func TestTableShareKey_DistinctBackings(t *testing.T) {
+	t1 := smallTable{
+		ceilings: []byte{'a', byte(byteCeiling)},
+		steps:    []*faState{nil, nil},
+	}
+	t2 := smallTable{
+		ceilings: []byte{'a', byte(byteCeiling)},
+		steps:    []*faState{nil, nil},
+	}
+	if newTableShareKey(&t1) == newTableShareKey(&t2) {
+		t.Errorf("independently-built tables should not share key")
+	}
+}
+
+// TestTableShareKey_AppendBreaksShare verifies that when a value-copy
+// is mutated via append in a way that reallocates the backing array,
+// the keys diverge. We force reallocation by starting at cap=1 and
+// appending many entries.
+func TestTableShareKey_AppendBreaksShare(t *testing.T) {
+	src := smallTable{
+		ceilings: make([]byte, 0, 1),
+		steps:    make([]*faState, 0, 1),
+	}
+	src.ceilings = append(src.ceilings, byte(byteCeiling))
+	src.steps = append(src.steps, nil)
+	copy1 := src
+	// Appending 8 entries to a slice with cap=1 guarantees at least one
+	// realloc of the steps backing.
+	for i := 0; i < 8; i++ {
+		copy1.steps = append(copy1.steps, nil)
+		copy1.ceilings = append(copy1.ceilings, byte(i))
+	}
+	if newTableShareKey(&src) == newTableShareKey(&copy1) {
+		t.Errorf("expected keys to diverge after append-with-realloc; got equal: %v",
+			newTableShareKey(&src))
+	}
+}

From c6d4ef15f453d6d77770b5f794c238020be33d56 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 30 May 2026 12:56:50 -0700
Subject: [PATCH 2/6] epsi_closure: dedup via tableShareKey instead of
 *smallTable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

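Switches closureBuffers.tables from map[*smallTable]*tableMark to
map[tableShareKey]*tableMark. With *smallTable pointer identity
still intact (pre-embed), the new key is intended to be equivalent:
two states that share a *smallTable trivially share their steps
backing. (The converse relies on distinct tables never reusing one
steps slice and never having nil steps, since the key inspects only
SliceData(steps) and len(steps).)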

No behavioral change. Sets up the next commit, which embeds
smallTable in faState by value — at which point pointer-identity
goes away but slice-backing identity remains.
---
 epsi_closure.go | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/epsi_closure.go b/epsi_closure.go
index 7e923b3..6f1ace2 100644
--- a/epsi_closure.go
+++ b/epsi_closure.go
@@ -20,24 +20,25 @@ type closureBuffers struct {
 	gen           uint32
 	closureSetGen uint32
 	closureList   []*faState
-	tables        map[*smallTable]*tableMark
+	tables        map[tableShareKey]*tableMark
 	states        map[*faState]uint32
 }
 
 func newClosureBuffers() *closureBuffers {
 	return &closureBuffers{
 		gen:    1,
-		tables: make(map[*smallTable]*tableMark),
+		tables: make(map[tableShareKey]*tableMark),
 		states: make(map[*faState]uint32),
 	}
 }
 
 // tableMarkOf returns the tableMark for t, creating one on first access.
 func (b *closureBuffers) tableMarkOf(t *smallTable) *tableMark {
-	m, ok := b.tables[t]
+	key := newTableShareKey(t)
+	m, ok := b.tables[key]
 	if !ok {
 		m = &tableMark{}
-		b.tables[t] = m
+		b.tables[key] = m
 	}
 	return m
 }

From 25593150d3e6777a2392363e4ec4c4e0d5fec37b Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 30 May 2026 13:48:46 -0700
Subject: [PATCH 3/6] nfa: embed smallTable into faState by value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

faState.table changes from *smallTable to smallTable (inline). This
shrinks per-state memory:
  - faState 64B + smallTable 80B-class = 144B per state pair
  - embedded faState fits 128B size class = 128B per state
  - saves 16B/state on workloads without table-pointer sharing

vmFields.startTable becomes startState *faState (the start state
owns the start table inline). traverseNFA, epsilonClosure, and
related helpers take *faState instead of *smallTable.

Epsilon-closure dedup continues to work via tableShareKey (introduced
and wired into closureBuffers by the previous two commits): value
copies of one source smallTable still share their slice backings, so
SliceData(steps) together with len(steps) is the new identity.

Size-assertion tests are not yet recalibrated; that follows in the
next commit.
---
 anything_but.go         |  8 ++--
 core_matcher_test.go    |  4 +-
 epsi_closure.go         | 25 +++++++------
 epsi_closure_test.go    | 67 ++++++++++++++-------------------
 memory_cost.go          | 16 ++++----
 memory_cost_test.go     |  6 +--
 monocase.go             | 24 ++++++------
 nfa.go                  | 75 +++++++++++++++----------------------
 nfa_test.go             | 34 +++++++++--------
 prettyprinter.go        | 34 ++++++++++++++---
 prettyprinter_test.go   | 26 ++++++-------
 quantified_atom.go      |  8 ++--
 regexp_nfa.go           | 83 +++++++++++++++++++----------------------
 regexp_nfa_test.go      | 26 ++++++-------
 regexp_validity_test.go |  8 ++--
 rune_range.go           | 30 +++++++--------
 rune_range_test.go      |  8 ++--
 shell_style.go          | 14 +++----
 shell_style_test.go     |  2 +-
 small_table.go          |  8 ++--
 small_table_test.go     | 12 +++---
 stats.go                |  8 ++--
 value_matcher.go        | 79 ++++++++++++++++++++++-----------------
 value_matcher_test.go   | 42 ++++++++++-----------
 wildcard.go             | 14 +++----
 25 files changed, 334 insertions(+), 327 deletions(-)

diff --git a/anything_but.go b/anything_but.go
index 8253f5b..0faf426 100644
--- a/anything_but.go
+++ b/anything_but.go
@@ -70,12 +70,12 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
 // Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not
 // to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So
 // in principle we could intersect automata.
-func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
+func makeMultiAnythingButFA(vals [][]byte) (*faState, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	success := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
 
-	ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField
-	return ret, nextField
+	startTable := makeOneMultiAnythingButStep(vals, 0, success)
+	return &faState{table: startTable}, nextField
 }
 
 // makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
@@ -84,7 +84,7 @@ func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
 // yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
 // to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
 // strings.
-func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faState) *smallTable {
+func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faState) smallTable {
 	// this will be the default transition in all the anything-but tables.
 	var u unpackedTable
 	for i := range u {
diff --git a/core_matcher_test.go b/core_matcher_test.go
index dc83979..51370f2 100644
--- a/core_matcher_test.go
+++ b/core_matcher_test.go
@@ -330,8 +330,8 @@ func TestSimpleaddPattern(t *testing.T) {
 // which means the finite automata are hidden deep inside the coreMatcher instance
 // and hard to get at.  This helper routine fetches the value-matcher automaton
 // corresponding to the "path" argument
-func fetchFAForPath(t *testing.T, cm *coreMatcher, path string) *smallTable {
+func fetchFAForPath(t *testing.T, cm *coreMatcher, path string) *faState {
 	t.Helper()
 	vm := cm.fields().state.fields().transitions[path]
-	return vm.fields().startTable
+	return vm.fields().startState
 }
diff --git a/epsi_closure.go b/epsi_closure.go
index 6f1ace2..196deda 100644
--- a/epsi_closure.go
+++ b/epsi_closure.go
@@ -43,29 +43,30 @@ func (b *closureBuffers) tableMarkOf(t *smallTable) *tableMark {
 	return m
 }
 
-// epsilonClosure walks the automaton starting from the given table
+// epsilonClosure walks the automaton starting from the given state
 // and precomputes the epsilon closure for every reachable faState.
-func epsilonClosure(table *smallTable) {
+func epsilonClosure(start *faState) {
 	bufs := newClosureBuffers()
-	closureForNfa(table, bufs)
+	closureForState(start, bufs)
+	closureForNfa(start, bufs)
 }
 
-func closureForNfa(table *smallTable, bufs *closureBuffers) {
-	mark := bufs.tableMarkOf(table)
+func closureForNfa(state *faState, bufs *closureBuffers) {
+	mark := bufs.tableMarkOf(&state.table)
 	if mark.lastVisitedGen == bufs.gen {
 		return
 	}
 	mark.lastVisitedGen = bufs.gen
 
-	for _, state := range table.steps {
-		if state != nil {
-			closureForState(state, bufs)
-			closureForNfa(state.table, bufs)
+	for _, s := range state.table.steps {
+		if s != nil {
+			closureForState(s, bufs)
+			closureForNfa(s, bufs)
 		}
 	}
-	for _, eps := range table.epsilons {
+	for _, eps := range state.table.epsilons {
 		closureForState(eps, bufs)
-		closureForNfa(eps.table, bufs)
+		closureForNfa(eps, bufs)
 	}
 }
 
@@ -106,7 +107,7 @@ func closureForState(state *faState, bufs *closureBuffers) {
 	dedupGen := bufs.gen
 	closure := make([]*faState, 0, len(bufs.closureList))
 	for _, s := range bufs.closureList {
-		mark := bufs.tableMarkOf(s.table)
+		mark := bufs.tableMarkOf(&s.table)
 		if mark.closureGen == dedupGen {
 			if sameFieldTransitions(mark.closureRep, s) {
 				continue
diff --git a/epsi_closure_test.go b/epsi_closure_test.go
index 58313cf..8c0f522 100644
--- a/epsi_closure_test.go
+++ b/epsi_closure_test.go
@@ -6,23 +6,19 @@ import (
 )
 
 func TestEpsilonClosure(t *testing.T) {
-	var st *smallTable
-
 	pp := newPrettyPrinter(4589)
 
-	st = newSmallTable()
-	aSa := &faState{table: st}
-	pp.labelTable(aSa.table, "aSa")
+	aSa := &faState{table: newSmallTable()}
+	pp.labelTable(&aSa.table, "aSa")
 	aSstar := &faState{}
 	aSc := &faState{}
-	st.addByteStep('b', aSstar)
-	st = newSmallTable()
-	st.epsilons = []*faState{aSstar}
-	st.addByteStep('c', aSc)
-	aSstar.table = st
-	pp.labelTable(aSstar.table, "aSstar")
+	aSa.table.addByteStep('b', aSstar)
+	aSstar.table = newSmallTable()
+	aSstar.table.epsilons = []*faState{aSstar}
+	aSstar.table.addByteStep('c', aSc)
+	pp.labelTable(&aSstar.table, "aSstar")
 	aSc.table = newSmallTable()
-	pp.labelTable(aSc.table, "aSc")
+	pp.labelTable(&aSc.table, "aSc")
 	aFM := newFieldMatcher()
 	aSc.fieldTransitions = []*fieldMatcher{aFM}
 
@@ -41,31 +37,28 @@ func TestEpsilonClosure(t *testing.T) {
 
 	// (b) ab|*x
 	var bSa, bSb, bSsplice, bSstar, bSx *faState
-	st = newSmallTable()
 
-	bSa = &faState{table: st}
+	bSa = &faState{table: newSmallTable()}
 	bFM1 := newFieldMatcher()
 	bSb = &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{bFM1}}
 	bSa.table.addByteStep('b', bSb)
 	bFM2 := newFieldMatcher()
 	bSx = &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{bFM2}}
 
-	st = newSmallTable()
-	bSstar = &faState{table: st}
-	st.epsilons = []*faState{bSstar}
-	st.addByteStep('x', bSx)
-	st.epsilons = []*faState{bSstar}
+	bSstar = &faState{table: newSmallTable()}
+	bSstar.table.epsilons = []*faState{bSstar}
+	bSstar.table.addByteStep('x', bSx)
+	bSstar.table.epsilons = []*faState{bSstar}
 
-	st = newSmallTable()
-	st.epsilons = []*faState{bSa, bSstar}
-	bSsplice = &faState{table: st}
+	bSsplice = &faState{table: newSmallTable()}
+	bSsplice.table.epsilons = []*faState{bSa, bSstar}
 
 	// 	var bSa, bSb, bSsplice, bSstar, bSx *faState
-	pp.labelTable(bSa.table, "bSa")
-	pp.labelTable(bSb.table, "bSb")
-	pp.labelTable(bSstar.table, "bSstar")
-	pp.labelTable(bSx.table, "bSx")
-	pp.labelTable(bSsplice.table, "bSsplice")
+	pp.labelTable(&bSa.table, "bSa")
+	pp.labelTable(&bSb.table, "bSb")
+	pp.labelTable(&bSstar.table, "bSstar")
+	pp.labelTable(&bSx.table, "bSx")
+	pp.labelTable(&bSsplice.table, "bSsplice")
 
 	bEcShouldBeOne := []*faState{bSa, bSb, bSx, bSstar}
 	zNames := []string{"bSa", "bSb", "bSx", "bSstar"}
@@ -102,8 +95,7 @@ func TestEpsilonClosure(t *testing.T) {
 	names := []string{"cStart", "cSa", "cSb", "cSc", "cSz"}
 	states := []*faState{cStart, cSa, cSb, cSc, cSz}
 	for i, name := range names {
-		st = states[i].table
-		pp.labelTable(st, name)
+		pp.labelTable(&states[i].table, name)
 	}
 
 	closureForStateNoBufs(cStart)
@@ -158,16 +150,15 @@ func TestTablePointerDedupPreservesFieldTransitions(t *testing.T) {
 	xState := &faState{table: xTable}
 
 	// quoteState transitions on 'x' to xState
-	quoteTable := newSmallTable()
-	quoteTable.addByteStep('x', xState)
-	quoteState := &faState{table: quoteTable}
+	quoteState := &faState{table: newSmallTable()}
+	quoteState.table.addByteStep('x', xState)
 
 	// start transitions on '"' to quoteState
-	startTable := newSmallTable()
-	startTable.addByteStep('"', quoteState)
+	start := &faState{table: newSmallTable()}
+	start.table.addByteStep('"', quoteState)
 
 	// Compute epsilon closures for the whole automaton
-	epsilonClosure(startTable)
+	epsilonClosure(start)
 
 	// Verify: xState's closure must include both stateA and stateB
 	if !containsState(t, xState.epsilonClosure, stateA) {
@@ -181,7 +172,7 @@ func TestTablePointerDedupPreservesFieldTransitions(t *testing.T) {
 	bufs := newNfaBuffers()
 	tm := bufs.getTransmap()
 	tm.push()
-	nfaResult := traverseNFA(startTable, []byte(`"x"`), nil, bufs)
+	nfaResult := traverseNFA(start, []byte(`"x"`), nil, bufs)
 	tm.pop()
 
 	if !slices.Contains(nfaResult, fmA) {
@@ -192,8 +183,8 @@ func TestTablePointerDedupPreservesFieldTransitions(t *testing.T) {
 	}
 
 	// Verify via DFA conversion: both field matchers must survive
-	dfa := nfa2Dfa(startTable)
-	dfaResult := traverseDFA(dfa.table, []byte(`"x"`), nil)
+	dfa := nfa2Dfa(start)
+	dfaResult := traverseDFA(dfa, []byte(`"x"`), nil)
 
 	if !slices.Contains(dfaResult, fmA) {
 		t.Error("DFA traversal missing fmA")
diff --git a/memory_cost.go b/memory_cost.go
index 794d5c6..1211704 100644
--- a/memory_cost.go
+++ b/memory_cost.go
@@ -23,11 +23,11 @@ func cmFieldMatcherStats(fm *fieldMatcher, stats *matcherStats, pp printer) {
 		if singleton != nil {
 			stats.bytes += int64(len(singleton))
 		}
-		table := vm.fields().startTable
-		if table == nil {
+		start := vm.fields().startState
+		if start == nil {
 			continue
 		}
-		cmStateStats(&faState{table: table}, stats, pp)
+		cmStateStats(start, stats, pp)
 	}
 }
 
@@ -49,7 +49,9 @@ func cmStateStats(state *faState, stats *matcherStats, pp printer) {
 		}
 	}
 	for _, eps := range state.table.epsilons {
-		cmStateStats(eps, stats, pp)
+		if eps != nil {
+			cmStateStats(eps, stats, pp)
+		}
 	}
 	for _, trans := range state.fieldTransitions {
 		cmFieldMatcherStats(trans, stats, pp)
@@ -66,9 +68,9 @@ func mcSmallTable(st *smallTable) int64 {
 
 func mcFaState(state *faState) int64 {
 	cost := mcFaStateBase
-	if state.table != nil {
-		cost += mcSmallTable(state.table)
-	}
+	cost += int64(cap(state.table.ceilings))
+	cost += mcPointer * int64(cap(state.table.steps))
+	cost += mcPointer * int64(cap(state.table.epsilons))
 	cost += mcPointer * int64(cap(state.fieldTransitions))
 	cost += mcPointer * int64(cap(state.epsilonClosure))
 	return cost
diff --git a/memory_cost_test.go b/memory_cost_test.go
index aba59df..7a6f7f7 100644
--- a/memory_cost_test.go
+++ b/memory_cost_test.go
@@ -10,9 +10,9 @@ func TestMcBasicSizes(t *testing.T) {
 	table := newSmallTable()
 	// NewSmallTable output: base + 1 byte of ceiling + 1 pointer of steps (8b) +
 	want := tableBase + 1 + mcPointer
-	tableGot := mcSmallTable(table)
+	tableGot := mcSmallTable(&table)
 	if want != tableGot {
-		t.Errorf("Table wanted %d got %d", want, mcSmallTable(table))
+		t.Errorf("Table wanted %d got %d", want, mcSmallTable(&table))
 	}
 	stateBase := int64(unsafe.Sizeof(faState{}))
 	state := faState{table: table}
@@ -69,7 +69,7 @@ func TestMcNfaSizes(t *testing.T) {
 	stats := &matcherStats{
 		seenStates: make(map[*faState]bool),
 	}
-	cmStateStats(&faState{table: fa1}, stats, pp)
+	cmStateStats(fa1, stats, pp)
 	wantedBytes := int64(1321) // laboriously hand-calculated
 	wantedFanout := int64(5)
 	wantedMaxFanout := int64(2)
diff --git a/monocase.go b/monocase.go
index c662b4a..68a41bf 100644
--- a/monocase.go
+++ b/monocase.go
@@ -40,11 +40,11 @@ func readMonocaseSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typedV
 // multi-byte and in fact not even the same number of bytes. So in that case you need two paths forward that step
 // through the bytes of each form and then rejoin to arrive at a state. Also note
 // that in many cases the upper/lower case versions of a rune have leading bytes in common
-func makeMonocaseFA(val []byte, pp printer) (*smallTable, *fieldMatcher) {
+func makeMonocaseFA(val []byte, pp printer) (*faState, *fieldMatcher) {
 	fm := newFieldMatcher()
 	index := 0
-	table := newSmallTable() // start state
-	startTable := table
+	startState := &faState{table: newSmallTable()} // start state
+	currentTable := &startState.table
 	var nextStep *faState
 	for index < len(val) {
 		var orig, alt []byte
@@ -56,32 +56,32 @@ func makeMonocaseFA(val []byte, pp printer) (*smallTable, *fieldMatcher) {
 			utf8.EncodeRune(alt, altRune)
 		}
 		nextStep = &faState{table: newSmallTable()}
-		pp.labelTable(nextStep.table, fmt.Sprintf("On %d, alt=%v", val[index], alt))
+		pp.labelTable(&nextStep.table, fmt.Sprintf("On %d, alt=%v", val[index], alt))
 		if alt == nil {
 			// easy case, no casefolding issues.  We should maybe try to coalesce these
 			// no-casefolding sections and only call makeFAFragment once for all of them
 			origFA := makeFAFragment(orig, nextStep, pp)
-			table.addByteStep(orig[0], origFA)
+			currentTable.addByteStep(orig[0], origFA)
 		} else {
 			// two paths to next state
 			// but they might have a common prefix
 			var commonPrefix int
 			for commonPrefix = 0; orig[commonPrefix] == alt[commonPrefix]; commonPrefix++ {
 				prefixStep := &faState{table: newSmallTable()}
-				table.addByteStep(orig[commonPrefix], prefixStep)
-				table = prefixStep.table
-				pp.labelTable(table, fmt.Sprintf("common prologue on %x", orig[commonPrefix]))
+				currentTable.addByteStep(orig[commonPrefix], prefixStep)
+				currentTable = &prefixStep.table
+				pp.labelTable(currentTable, fmt.Sprintf("common prologue on %x", orig[commonPrefix]))
 			}
 			// now build automata for the orig and alt versions of the char
 			origFA := makeFAFragment(orig[commonPrefix:], nextStep, pp)
 			altFA := makeFAFragment(alt[commonPrefix:], nextStep, pp)
-			table.addByteStep(orig[commonPrefix], origFA)
-			table.addByteStep(alt[commonPrefix], altFA)
+			currentTable.addByteStep(orig[commonPrefix], origFA)
+			currentTable.addByteStep(alt[commonPrefix], altFA)
 		}
-		table = nextStep.table
+		currentTable = &nextStep.table
 		index += width
 	}
 	lastState := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{fm}}
 	nextStep.table.addByteStep(valueTerminator, lastState)
-	return startTable, fm
+	return startState, fm
 }
diff --git a/nfa.go b/nfa.go
index b6b9d89..989cdae 100644
--- a/nfa.go
+++ b/nfa.go
@@ -11,7 +11,7 @@ import (
 // automaton requires a smallTable and for some of them, taking the step means you've matched a value and can
 // transition to a new fieldMatcher, in which case the fieldTransitions slice will be non-nil
 type faState struct {
-	table            *smallTable
+	table            smallTable
 	fieldTransitions []*fieldMatcher
 	epsilonClosure   []*faState // precomputed epsilon closure including self
 	isSpinner        bool
@@ -93,8 +93,6 @@ type nfaBuffers struct {
 	resultBuf      []X
 	transmap       *transmap
 	fieldSet       map[*fieldMatcher]bool
-	startState     *faState
-	startClosure   []*faState
 	qNumBuf        [MaxBytesInEncoding]byte
 }
 
@@ -140,25 +138,13 @@ func (nb *nfaBuffers) getFieldSet() map[*fieldMatcher]bool {
 	return nb.fieldSet
 }
 
-func (nb *nfaBuffers) getStartState(table *smallTable) *faState {
-	if nb.startState == nil {
-		nb.startState = &faState{}
-		nb.startClosure = make([]*faState, 1)
-	}
-	nb.startState.table = table
-	nb.startClosure[0] = nb.startState
-	nb.startState.epsilonClosure = nb.startClosure
-	return nb.startState
-}
-
 // nfa2Dfa does what the name says, but as of 2026/01 is not used.
-func nfa2Dfa(nfaTable *smallTable) *faState {
+func nfa2Dfa(nfaStart *faState) *faState {
 	// The start state always has a trivial epsilon closure (just itself) because
 	// all Quamina automata begin by matching the opening quote (0x22). The start
 	// table therefore has a single transition on `"` and never has epsilons.
-	startState := &faState{table: nfaTable}
-	startState.epsilonClosure = []*faState{startState}
-	startNfa := []*faState{startState}
+	nfaStart.epsilonClosure = []*faState{nfaStart}
+	startNfa := []*faState{nfaStart}
 	return n2dNode(startNfa, newStateLists())
 }
 
@@ -189,14 +175,14 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 	// to simplify, let's unpack all the ingredients
 	nUnpacked := make([]*unpackedTable, len(ingredients))
 	for i, nState := range ingredients {
-		nUnpacked[i] = unpackTable(nState.table)
+		nUnpacked[i] = unpackTable(&nState.table)
 	}
 
 	// Unpack the DFA table once, set all byte transitions, then pack once —
 	// the old code called addByteStep per byte which unpacked and repacked
 	// for each of up to 256 values. rawStates is allocated once and reset
 	// with [:0] each iteration to avoid per-byte-value slice allocation.
-	dfaUnpacked := unpackTable(dfaState.table)
+	dfaUnpacked := unpackTable(&dfaState.table)
 	rawStates := make([]*faState, 0, len(ingredients))
 	for utf8byte := 0; utf8byte < byteCeiling; utf8byte++ {
 		rawStates = rawStates[:0]
@@ -238,7 +224,8 @@ func n2dNode(rawNStates []*faState, sList *stateLists) *faState {
 // NFA-capable data structure, we can traverse it deterministically if we know in advance that every
 // combination of an faState with a byte will transition to at most one other faState.
 
-func traverseDFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*fieldMatcher {
+func traverseDFA(start *faState, val []byte, transitions []*fieldMatcher) []*fieldMatcher {
+	table := &start.table
 	for index := 0; index <= len(val); index++ {
 		var utf8Byte byte
 		if index < len(val) {
@@ -251,7 +238,7 @@ func traverseDFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*
 			break
 		}
 		transitions = append(transitions, next.fieldTransitions...)
-		table = next.table
+		table = &next.table
 	}
 	return transitions
 }
@@ -262,12 +249,12 @@ func traverseDFA(table *smallTable, val []byte, transitions []*fieldMatcher) []*
 // collected in the nextStates list.  The bufs structure contains three buffers, one each for
 // currentStates, nextStates, and the epsilon closure of one particular state. These are re-used
 // and should grow with use and minimize the need for memory allocation.
-func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, bufs *nfaBuffers) []*fieldMatcher {
+func traverseNFA(start *faState, val []byte, transitions []*fieldMatcher, bufs *nfaBuffers) []*fieldMatcher {
 	currentStates := bufs.getBuf1()
 	// The start state always has a trivial epsilon closure (just itself) because
 	// all Quamina automata begin by matching the opening quote (0x22). The start
 	// table therefore has a single transition on `"` and never has epsilons.
-	currentStates = append(currentStates, bufs.getStartState(table))
+	currentStates = append(currentStates, start)
 	nextStates := bufs.getBuf2()
 
 	// Use flat dedup set — no stacking needed since traverseNFA is not recursive
@@ -376,9 +363,9 @@ func simplifyCollect(s *faState, visited map[*faState]struct{}, targets []*faSta
 // minimal or even avoids being wasteful.
 // INVARIANT: neither argument is nil
 // INVARIANT: To be thread-safe, no existing table can be updated except when we're building it
-func mergeFAs(table1, table2 *smallTable, pp printer) *smallTable {
-	state1 := &faState{table: table1}
-	state2 := &faState{table: table2}
+func mergeFAs(table1, table2 *smallTable, pp printer) smallTable {
+	state1 := &faState{table: *table1}
+	state2 := &faState{table: *table2}
 	s := mergeFAStates(state1, state2, make(map[faStepKey]*faState), pp)
 	return s.table
 }
@@ -399,7 +386,7 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState, pp p
 	// TODO: This is still creating way too many splice states and slowing down traversal. Fix that.
 	switch {
 	case state1.isSpinner && state2.isSpinner:
-		pp.labelTable(combined.table, "2Spinners")
+		pp.labelTable(&combined.table, "2Spinners")
 		combined = symmetricSpinnerMerge(state1, state2, keyMemo, pp)
 		keyMemo[mKey] = combined
 		return combined
@@ -420,7 +407,7 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState, pp p
 	// If either of the states to be merged has epsilons we have to do a splice.
 	// To avoid deep nesting of splice states, we flatten the epsilon targets.
 	if len(state1.table.epsilons) != 0 || len(state2.table.epsilons) != 0 {
-		pp.labelTable(combined.table, "Splice")
+		pp.labelTable(&combined.table, "Splice")
 		combined.table.epsilons = simplifySplices(state1, state2)
 		keyMemo[mKey] = combined
 		return combined
@@ -428,14 +415,14 @@ func mergeFAStates(state1, state2 *faState, keyMemo map[faStepKey]*faState, pp p
 
 	combined.fieldTransitions = append(state1.fieldTransitions, state2.fieldTransitions...)
 
-	pp.labelTable(combined.table, fmt.Sprintf("%d∎%d",
-		pp.tableSerial(state1.table), pp.tableSerial(state2.table)))
+	pp.labelTable(&combined.table, fmt.Sprintf("%d∎%d",
+		pp.tableSerial(&state1.table), pp.tableSerial(&state2.table)))
 
 	keyMemo[mKey] = combined
 
 	var iter1, iter2 stIterator
-	iter1.table = state1.table
-	iter2.table = state2.table
+	iter1.table = &state1.table
+	iter2.table = &state2.table
 	var uComb unpackedTable
 	var merged *faState
 
@@ -478,14 +465,14 @@ func asymmetricSpinnerMerge(spinner, nonSpinner *faState, keyMemo map[faStepKey]
 	combined := &faState{table: newSmallTable()}
 	combined.fieldTransitions = append(spinner.fieldTransitions, nonSpinner.fieldTransitions...)
 
-	pp.labelTable(combined.table, fmt.Sprintf("%d∎%d",
-		pp.tableSerial(spinner.table), pp.tableSerial(nonSpinner.table)))
+	pp.labelTable(&combined.table, fmt.Sprintf("%d∎%d",
+		pp.tableSerial(&spinner.table), pp.tableSerial(&nonSpinner.table)))
 
 	keyMemo[mKey] = combined
 
 	var iter1, iter2 stIterator
-	iter1.table = spinner.table
-	iter2.table = nonSpinner.table
+	iter1.table = &spinner.table
+	iter2.table = &nonSpinner.table
 	var uComb unpackedTable
 	var mergedState *faState
 
@@ -505,7 +492,7 @@ func asymmetricSpinnerMerge(spinner, nonSpinner *faState, keyMemo map[faStepKey]
 			// nonspinner has a branch here
 			// if the current spinner value is a loopback, we need to make a new state whose value
 			// is the nonspinner with the addition of the epsilon link back to the spinner
-			mergedTable := &smallTable{
+			mergedTable := smallTable{
 				steps:    nonSpinnernext.table.steps,
 				ceilings: nonSpinnernext.table.ceilings,
 				epsilons: append(nonSpinnernext.table.epsilons, spinner),
@@ -545,14 +532,14 @@ func symmetricSpinnerMerge(state1, state2 *faState, keyMemo map[faStepKey]*faSta
 	combined := &faState{table: newSmallTable()}
 	combined.fieldTransitions = append(state1.fieldTransitions, state2.fieldTransitions...)
 
-	pp.labelTable(combined.table, fmt.Sprintf("%d∎%d",
-		pp.tableSerial(state1.table), pp.tableSerial(state2.table)))
+	pp.labelTable(&combined.table, fmt.Sprintf("%d∎%d",
+		pp.tableSerial(&state1.table), pp.tableSerial(&state2.table)))
 
 	keyMemo[makeFaStepKey(state1, state2)] = combined
 
 	var iter1, iter2 stIterator
-	iter1.table = state1.table
-	iter2.table = state2.table
+	iter1.table = &state1.table
+	iter2.table = &state2.table
 	var uComb unpackedTable
 	var mergedState *faState
 
@@ -570,7 +557,7 @@ func symmetricSpinnerMerge(state1, state2 *faState, keyMemo map[faStepKey]*faSta
 
 		case next1 == state1 && next2 != state2:
 			// next2 is an actual branch, so we will have to install the spin pointer in the target
-			table := &smallTable{
+			table := smallTable{
 				ceilings: next2.table.ceilings,
 				steps:    next2.table.steps,
 				epsilons: append(state2.table.epsilons, combined),
@@ -581,7 +568,7 @@ func symmetricSpinnerMerge(state1, state2 *faState, keyMemo map[faStepKey]*faSta
 			}
 		case next2 == state2 && next1 != state1:
 			// next1 is an actual branch, so we will have to install the spin pointer in the target
-			table := &smallTable{
+			table := smallTable{
 				ceilings: next1.table.ceilings,
 				steps:    next1.table.steps,
 				epsilons: append(state1.table.epsilons, combined),
diff --git a/nfa_test.go b/nfa_test.go
index 056246f..95a3618 100644
--- a/nfa_test.go
+++ b/nfa_test.go
@@ -54,7 +54,7 @@ func TestFocusedMerge(t *testing.T) {
 		"ab*",
 		"*ab",
 	}
-	var automata []*smallTable
+	var automata []*faState
 	var matchers []*fieldMatcher
 
 	for _, shellStyle := range shellStyles {
@@ -72,13 +72,13 @@ func TestFocusedMerge(t *testing.T) {
 
 	merged := newSmallTable()
 	for _, automaton := range automata {
-		merged = mergeFAs(merged, automaton, sharedNullPrinter)
+		merged = mergeFAs(&merged, &automaton.table, sharedNullPrinter)
 		s := statsAccum{
 			fmVisited: make(map[*fieldMatcher]bool),
 			vmVisited: make(map[*valueMatcher]bool),
 			stVisited: make(map[*smallTable]bool),
 		}
-		faStats(merged, &s)
+		faStats(&merged, &s)
 		fmt.Println(s.stStats())
 	}
 }
@@ -158,15 +158,15 @@ func TestNfa2Dfa(t *testing.T) {
 			}
 		}
 		dfa := nfa2Dfa(nfa)
-		// fmt.Println("DFA: " + pp.printNFA(dfa.table))
+		// fmt.Println("DFA: " + pp.printNFA(&dfa.table))
 		for _, should := range test.shoulds {
-			matched := traverseDFA(dfa.table, asQuotedBytes(t, should), transitions)
+			matched := traverseDFA(dfa, asQuotedBytes(t, should), transitions)
 			if len(matched) != 1 {
 				t.Errorf("DFA %s didn't match %s ", test.pattern, should)
 			}
 		}
 		for _, nope := range test.nopes {
-			matched := traverseDFA(dfa.table, asQuotedBytes(t, nope), transitions)
+			matched := traverseDFA(dfa, asQuotedBytes(t, nope), transitions)
 			if len(matched) != 0 {
 				t.Errorf("DFA %s matched %s", test.pattern, nope)
 			}
@@ -182,10 +182,10 @@ func asQuotedBytes(t *testing.T, s string) []byte {
 // testTraverseNFA wraps traverseNFA with the push/pop that tryToMatch
 // normally provides. Test-only convenience so direct callers don't need
 // to manage the transmap stack themselves.
-func testTraverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, bufs *nfaBuffers) []*fieldMatcher {
+func testTraverseNFA(start *faState, val []byte, transitions []*fieldMatcher, bufs *nfaBuffers) []*fieldMatcher {
 	tm := bufs.getTransmap()
 	tm.push()
-	result := traverseNFA(table, val, transitions, bufs)
+	result := traverseNFA(start, val, transitions, bufs)
 	tm.pop()
 	return result
 }
@@ -432,10 +432,10 @@ func TestTransmapBufferReuse(t *testing.T) {
 }
 
 // collectClosureStats walks an NFA and reports epsilon closure size statistics.
-func collectClosureStats(startTable *smallTable) (stateCount, totalEntries, maxClosure int, tableSharing int) {
+func collectClosureStats(start *faState) (stateCount, totalEntries, maxClosure int, tableSharing int) {
 	visitedTables := make(map[*smallTable]bool)
 	visitedStates := make(map[*faState]bool)
-	tableCounts := make(map[*smallTable]int)
+	tableCounts := make(map[tableShareKey]int)
 
 	var walkTable func(t *smallTable)
 	walkTable = func(t *smallTable) {
@@ -446,29 +446,31 @@ func collectClosureStats(startTable *smallTable) (stateCount, totalEntries, maxC
 		for _, state := range t.steps {
 			if state != nil && !visitedStates[state] {
 				visitedStates[state] = true
-				tableCounts[state.table]++
+				tableCounts[newTableShareKey(&state.table)]++
 				ec := len(state.epsilonClosure)
 				totalEntries += ec
 				if ec > maxClosure {
 					maxClosure = ec
 				}
-				walkTable(state.table)
+				walkTable(&state.table)
 			}
 		}
 		for _, eps := range t.epsilons {
 			if !visitedStates[eps] {
 				visitedStates[eps] = true
-				tableCounts[eps.table]++
+				tableCounts[newTableShareKey(&eps.table)]++
 				ec := len(eps.epsilonClosure)
 				totalEntries += ec
 				if ec > maxClosure {
 					maxClosure = ec
 				}
-				walkTable(eps.table)
+				walkTable(&eps.table)
 			}
 		}
 	}
-	walkTable(startTable)
+	if start != nil {
+		walkTable(&start.table)
+	}
 
 	for _, count := range tableCounts {
 		if count > 1 {
@@ -612,7 +614,7 @@ func TestTablePointerDedup(t *testing.T) {
 			m := q.matcher.(*coreMatcher)
 
 			vm := m.fields().state.fields().transitions["val"]
-			nfaStart := vm.fields().startTable
+			nfaStart := vm.fields().startState
 			stateCount, totalEntries, maxClosure, tableSharing := collectClosureStats(nfaStart)
 
 			if stateCount != wl.stateCount {
diff --git a/prettyprinter.go b/prettyprinter.go
index 0a724c3..f581296 100644
--- a/prettyprinter.go
+++ b/prettyprinter.go
@@ -106,11 +106,33 @@ func (a ppAlready) remember(state *faState, table *smallTable) {
 }
 
 func (pp *prettyPrinter) printNFA(t *smallTable) string {
-	return pp.printNFAStep(&faState{table: t}, 0, newPpAlready())
+	// Use the caller's *smallTable pointer for label lookup. Building a
+	// throwaway faState{table: *t} would re-locate the smallTable in memory
+	// and lose the label (labels are keyed by address).
+	return pp.printNFAFromTable(t, newPpAlready())
+}
+
+// printNFAFromTable renders the automaton rooted at t. The root line is built
+// straight from t, so a label registered under t's address is still found even
+// though no faState owns that address; reachable states print at indent 1.
+func (pp *prettyPrinter) printNFAFromTable(t *smallTable, already ppAlready) string {
+	tCost := mcSmallTable(t)
+	costs := fmt.Sprintf("[s/t %d/%d] \n", tCost, mcFaStateBase+tCost-mcSmallTableBase)
+	out := " " + pp.printTable(t) + costs
+	for _, next := range t.steps {
+		if next == nil {
+			continue
+		}
+		out += pp.printNFAStep(next, 1, already)
+	}
+	for _, eps := range t.epsilons {
+		out += pp.printNFAStep(eps, 1, already)
+	}
+	return out
+}
 
 func (pp *prettyPrinter) printNFAStep(fas *faState, indent int, already ppAlready) string {
-	t := fas.table
+	t := &fas.table
 	if already.sawThis(fas, t) {
 		return ""
 	}
@@ -122,7 +144,7 @@ func (pp *prettyPrinter) printNFAStep(fas *faState, indent int, already ppAlread
 	if len(fas.fieldTransitions) != 0 {
 		trailer += fmt.Sprintf(" [%d transition(s)]", len(fas.fieldTransitions))
 	}
-	trailer += fmt.Sprintf("[s/t %d/%d] ", mcSmallTable(fas.table), mcFaState(fas))
+	trailer += fmt.Sprintf("[s/t %d/%d] ", mcSmallTable(&fas.table), mcFaState(fas))
 	trailer += "\n"
 	s := " " + pp.printTable(t) + trailer
 	for _, step := range t.steps {
@@ -207,11 +229,11 @@ func (pp *prettyPrinter) printTable(t *smallTable) string {
 }
 
 func (pp *prettyPrinter) nextString(n *faState) string {
-	label := pp.tableLabel(n.table)
+	label := pp.tableLabel(&n.table)
 	if len(label) == 0 {
-		label = shortTableAddress(n.table)
+		label = shortTableAddress(&n.table)
 	}
-	return fmt.Sprintf("%d[%s]", pp.tableSerial(n.table), label)
+	return fmt.Sprintf("%d[%s]", pp.tableSerial(&n.table), label)
 }
 
 func branchChar(b byte) string {
diff --git a/prettyprinter_test.go b/prettyprinter_test.go
index cc62011..75a989e 100644
--- a/prettyprinter_test.go
+++ b/prettyprinter_test.go
@@ -6,21 +6,21 @@ import (
 
 func TestPP(t *testing.T) {
 	pp := newPrettyPrinter(1)
-	table, _ := makeShellStyleFA([]byte(`"x*9"`), pp)
-	pp.labelTable(table, "START HERE")
-	wanted := ` 884[START HERE] '22/"' → (914[on " at 0][s/t 216/280] 
- 914[on " at 0] '78/x' → (384[*-Spinner][s/t 216/280] 
- 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 216/280] 
- 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/"' → (769[on " at 4][s/t 224/288] 
- 769[on " at 4] 'f5/ℵ' → (301[last step at 5][s/t 216/280] 
- 301[last step at 5]  [1 transition(s)][s/t 81/153] 
+	state, _ := makeShellStyleFA([]byte(`"x*9"`), pp)
+	pp.labelTable(&state.table, "START HERE")
+	wanted := ` 884[START HERE] '22/"' → (914[on " at 0][s/t 216/280] 
+ 914[on " at 0] '78/x' → (384[*-Spinner][s/t 216/280] 
+ 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 216/280] 
+ 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/"' → (769[on " at 4][s/t 224/288] 
+ 769[on " at 4] 'f5/ℵ' → (301[last step at 5][s/t 216/280] 
+ 301[last step at 5]  [1 transition(s)][s/t 81/153] 
 `
-	s := pp.printNFA(table)
+	s := pp.printNFA(&state.table) // NB: each line of wanted ends in "] " - the printer emits a space before every newline
 	if s != wanted {
 		t.Errorf("LONG: wanted\n<%s>\ngot\n<%s>\n", wanted, s)
 	}
-	if pp.shortPrintNFA(table) != "884[START HERE]" {
-		t.Errorf("SHORT: wanted <%s> got <%s>\n", "758[START HERE]", pp.shortPrintNFA(table))
+	if pp.shortPrintNFA(&state.table) != "884[START HERE]" {
+		t.Errorf("SHORT: wanted <%s> got <%s>\n", "884[START HERE]", pp.shortPrintNFA(&state.table))
 	}
 }
 
@@ -28,8 +28,8 @@ func TestNullPP(t *testing.T) {
 	np := &nullPrinter{}
 	table := newSmallTable()
 	table.addByteStep(3, &faState{})
-	np.labelTable(table, "foo")
-	if np.printNFA(table) != noPP || np.shortPrintNFA(table) != noPP {
+	np.labelTable(&table, "foo")
+	if np.printNFA(&table) != noPP || np.shortPrintNFA(&table) != noPP {
 		t.Error("didn't get noPP")
 	}
 }
diff --git a/quantified_atom.go b/quantified_atom.go
index 0cd326d..6afd4b2 100644
--- a/quantified_atom.go
+++ b/quantified_atom.go
@@ -49,12 +49,12 @@ func (qa *quantifiedAtom) isMinimumOnly() bool {
 	return qa.quantMax == regexpMinimumOnly
 }
 
-func (qa *quantifiedAtom) makeFA(nextStep *faState, pp printer) *smallTable {
-	var table *smallTable
+func (qa *quantifiedAtom) makeFA(nextStep *faState, pp printer) smallTable {
+	var table smallTable
 	switch {
 	case qa.isDot():
 		table = makeDotFA(nextStep)
-		pp.labelTable(table, "Dot")
+		pp.labelTable(&table, "Dot")
 	case qa.getSubtree() != nil:
 		table = makeNFAFromBranches(qa.getSubtree(), nextStep, false, pp)
 	case qa.runeRangeCache() != "":
@@ -64,7 +64,7 @@ func (qa *quantifiedAtom) makeFA(nextStep *faState, pp printer) *smallTable {
 	default:
 		// if it's none of these other things, it has to boil down to a rune range
 		table = makeRuneRangeNFA(qa.runes, nextStep, pp)
-		pp.labelTable(table, fmt.Sprintf("RR %x/%x, %d-%d", qa.runes[0].Lo, qa.runes[0].Hi, qa.quantMin, qa.quantMax))
+		pp.labelTable(&table, fmt.Sprintf("RR %x/%x, %d-%d", qa.runes[0].Lo, qa.runes[0].Hi, qa.quantMin, qa.quantMax))
 	}
 	return table
 }
diff --git a/regexp_nfa.go b/regexp_nfa.go
index 27b83f2..0275fa6 100644
--- a/regexp_nfa.go
+++ b/regexp_nfa.go
@@ -20,46 +20,48 @@ type regexpRoot []regexpBranch
 // makeRegexpNFA traverses the parsed regexp tree and generates a finite automaton
 // that matches it. The FA has states that match " at the beginning and end because
 // all Quamina field values are enclosed in quotes.
-func makeRegexpNFA(root regexpRoot, pp printer) (*smallTable, *fieldMatcher) {
+func makeRegexpNFA(root regexpRoot, pp printer) (*faState, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	nextStep := makeNFATrailer(nextField)
-	pp.labelTable(nextStep.table, "Trailer")
+	pp.labelTable(&nextStep.table, "Trailer")
 	table := makeSmallTable(nil, []byte{'"'}, []*faState{nextStep})
-	pp.labelTable(table, "</Field>")
+	pp.labelTable(&table, "</Field>")
 	nextStep = &faState{table: table}
-	fa := makeNFAFromBranches(root, nextStep, true, pp)
-	return fa, nextField
+	startTable := makeNFAFromBranches(root, nextStep, true, pp)
+	return &faState{table: startTable}, nextField
 }
-func makeNFAFromBranches(root regexpRoot, nextStep *faState, addQuoteTransition bool, pp printer) *smallTable {
+func makeNFAFromBranches(root regexpRoot, nextStep *faState, addQuoteTransition bool, pp printer) smallTable {
 	// completely empty regexp
 	if len(root) == 0 {
 		return makeSmallTable(nil, []byte{'"'}, []*faState{nextStep})
 	}
-	var fa *smallTable
+	var fa smallTable
+	first := true
 	for _, branch := range root {
-		var nextBranch *smallTable
+		var nextBranch smallTable
 		if len(branch) == 0 {
 			nextBranch = makeSmallTable(nil, []byte{'"'}, []*faState{nextStep})
-			pp.labelTable(nextBranch, "next on len 0")
+			pp.labelTable(&nextBranch, "next on len 0")
 		} else {
 			nextBranch = faFromBranch(branch, nextStep, addQuoteTransition, pp)
 		}
-		if fa != nil {
-			fa = mergeFAs(fa, nextBranch, pp)
+		if !first {
+			fa = mergeFAs(&fa, &nextBranch, pp)
 		} else {
 			fa = nextBranch
+			first = false
 		}
 	}
 	return fa
 }
 
-func faFromBranch(branch regexpBranch, nextStep *faState, addQuoteTransition bool, pp printer) *smallTable {
+func faFromBranch(branch regexpBranch, nextStep *faState, addQuoteTransition bool, pp printer) smallTable {
 	state := faFromQuantifiedAtom(branch, 0, nextStep, pp)
 	table := state.table
 	if addQuoteTransition {
 		firstState := &faState{table: table}
 		table = makeSmallTable(nil, []byte{'"'}, []*faState{firstState})
-		pp.labelTable(table, "<Field>")
+		pp.labelTable(&table, "<Field>")
 	}
 	return table
 }
@@ -79,7 +81,7 @@ func faFromQuantifiedAtom(branch regexpBranch, index int, finalStep *faState, pp
 	case atom.isPlus():
 		// the + construction requires a loopback state in front of the state table
 		plusLoopback := &faState{table: newSmallTable()}
-		pp.labelTable(plusLoopback.table, "PlusLoopback")
+		pp.labelTable(&plusLoopback.table, "PlusLoopback")
 		state = &faState{table: atom.makeFA(plusLoopback, pp)}
 
 		// for the + case, need to loop back to the newly created state
@@ -99,8 +101,8 @@ func faFromQuantifiedAtom(branch regexpBranch, index int, finalStep *faState, pp
 		nextMinMaxStep := nextState
 
 		for counter := atom.quantMax; counter > 0; counter-- {
-			stepTable := faFromShell(shellTable, PlaceholderState, nextMinMaxStep)
-			pp.labelTable(stepTable, fmt.Sprintf("minmax at %d", counter))
+			stepTable := faFromShell(&shellTable, PlaceholderState, nextMinMaxStep)
+			pp.labelTable(&stepTable, fmt.Sprintf("minmax at %d", counter))
 
 			// if it's between quantMin & max, we're in optional territory
 			// so it needs an epsilon to allow jumping out
@@ -127,8 +129,8 @@ func faFromQuantifiedAtom(branch regexpBranch, index int, finalStep *faState, pp
 
 		var lastState *faState
 		for counter := atom.quantMin; counter > 0; counter-- {
-			stepTable := faFromShell(shellTable, PlaceholderState, nextMinMaxStep)
-			pp.labelTable(stepTable, fmt.Sprintf("minmax at %d", counter))
+			stepTable := faFromShell(&shellTable, PlaceholderState, nextMinMaxStep)
+			pp.labelTable(&stepTable, fmt.Sprintf("minmax at %d", counter))
 			state = &faState{table: stepTable}
 
 			// there's a chain of the minimum-count steps, but the last one has to
@@ -160,57 +162,50 @@ func makeNFATrailer(nextField *fieldMatcher) *faState {
 	return &faState{table: table}
 }
 
-func makeByteDotFA(dest *faState, pp printer) *smallTable {
+func makeByteDotFA(dest *faState, pp printer) smallTable {
 	ceilings := []byte{0xC0, 0xC2, 0xF5, 0xF6}
 	steps := []*faState{dest, nil, dest, nil}
-	t := &smallTable{ceilings: ceilings, steps: steps}
-	pp.labelTable(t, " · ")
+	t := smallTable{ceilings: ceilings, steps: steps}
+	pp.labelTable(&t, " · ")
 	return t
 }
 
-func makeDotFA(dest *faState) *smallTable {
-	sLast := &smallTable{
+func makeDotFA(dest *faState) smallTable {
+	targetLast := &faState{table: smallTable{
 		ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
 		steps:    []*faState{nil, dest, nil},
-	}
-	targetLast := &faState{table: sLast}
-	sLastInter := &smallTable{
+	}}
+	targetLastInter := &faState{table: smallTable{
 		ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLast, nil},
-	}
-	targetLastInter := &faState{table: sLastInter}
-	sFirstInter := &smallTable{
+	}}
+	targetFirstInter := &faState{table: smallTable{
 		ceilings: []byte{0x80, 0xc0, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLastInter, nil},
-	}
-	targetFirstInter := &faState{table: sFirstInter}
+	}}
 
-	sE0 := &smallTable{
+	targetE0 := &faState{table: smallTable{
 		ceilings: []byte{0xa0, 0xc0, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLast, nil},
-	}
-	targetE0 := &faState{table: sE0}
+	}}
 
-	sED := &smallTable{
+	targetED := &faState{table: smallTable{
 		ceilings: []byte{0x80, 0xA0, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLast, nil},
-	}
-	targetED := &faState{table: sED}
+	}}
 
-	sF0 := &smallTable{
+	targetF0 := &faState{table: smallTable{
 		ceilings: []byte{0x90, 0xC0, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLastInter, nil},
-	}
-	targetF0 := &faState{table: sF0}
+	}}
 
-	sF4 := &smallTable{
+	targetF4 := &faState{table: smallTable{
 		ceilings: []byte{0x80, 0x90, byte(byteCeiling)},
 		steps:    []*faState{nil, targetLastInter, nil},
-	}
-	targetF4 := &faState{table: sF4}
+	}}
 
 	// for reference, see https://www.tbray.org/ongoing/When/202x/2024/12/29/Matching-Dot-Redux
-	return &smallTable{
+	return smallTable{
 		ceilings: []byte{
 			0x80,              // 0
 			0xC2,              // 1
diff --git a/regexp_nfa_test.go b/regexp_nfa_test.go
index e16e1df..08b1101 100644
--- a/regexp_nfa_test.go
+++ b/regexp_nfa_test.go
@@ -66,14 +66,14 @@ func applyAndRunRegexp(t *testing.T, regexp string, match string, pp printer) in
 	t.Helper()
 	qm := []byte(`"` + match + `"`)
 	fa := faFromRegexp(t, regexp, pp)
-	fmt.Println("N:\n" + pp.printNFA(fa))
+	fmt.Println("N:\n" + pp.printNFA(&fa.table))
 	var transitions []*fieldMatcher
 	bufs := newNfaBuffers()
 	matches := testTraverseNFA(fa, qm, transitions, bufs)
 	return len(matches)
 }
 
-func faFromRegexp(t *testing.T, r string, pp printer) *smallTable {
+func faFromRegexp(t *testing.T, r string, pp printer) *faState {
 	t.Helper()
 	parse, err := readRegexp(r)
 	if err != nil {
@@ -95,7 +95,7 @@ func TestRegexpPlus(t *testing.T) {
 		"[123]+|[abc]+",
 	}
 	pp := newPrettyPrinter(4623)
-	var fa *smallTable
+	var fa *faState
 	for _, re := range res {
 		fa = faFromRegexp(t, re, pp)
 		epsilonClosure(fa)
@@ -147,11 +147,11 @@ func TestExploreUTF8Form(t *testing.T) {
 
 	wantFM := &fieldMatcher{}
 	targetState := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{wantFM}}
-	table := makeDotFA(targetState)
+	startState := &faState{table: makeDotFA(targetState)}
 	var matchers []*fieldMatcher
 	var got []*fieldMatcher
 	for i, bad := range bads {
-		got = traverseDFA(table, bad, matchers)
+		got = traverseDFA(startState, bad, matchers)
 		if len(got) != 0 {
 			t.Errorf("accepted index %d", i)
 		}
@@ -161,7 +161,7 @@ func TestExploreUTF8Form(t *testing.T) {
 func TestDotSemantics(t *testing.T) {
 	wantFM := &fieldMatcher{}
 	targetState := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{wantFM}}
-	table := makeDotFA(targetState)
+	startState := &faState{table: makeDotFA(targetState)}
 	var matchers []*fieldMatcher
 	var got []*fieldMatcher
 	var r rune
@@ -170,7 +170,7 @@ func TestDotSemantics(t *testing.T) {
 		if r >= 0xD800 && r <= 0xDFFF {
 			continue
 		}
-		got = traverseDFA(table, []byte(string([]rune{r})), matchers)
+		got = traverseDFA(startState, []byte(string([]rune{r})), matchers)
 		if len(got) != 1 || got[0] != wantFM {
 			t.Errorf("failed on %x", r)
 		}
@@ -188,14 +188,14 @@ func TestDotSemantics(t *testing.T) {
 	}
 
 	for _, good := range goodUTF8 {
-		got = traverseDFA(table, good, matchers)
+		got = traverseDFA(startState, good, matchers)
 		if len(got) != 1 || got[0] != wantFM {
 			t.Errorf("failed on non-surrogate %04x", r)
 		}
 		matchers = matchers[:0]
 	}
 	for _, bad := range badUTF8 {
-		got = traverseDFA(table, bad, matchers)
+		got = traverseDFA(startState, bad, matchers)
 		if len(got) != 0 {
 			t.Errorf("accepted surrogate %04x", r)
 		}
@@ -341,17 +341,17 @@ func TestMultiLengthRR(t *testing.T) {
 		wantFM := &fieldMatcher{}
 
 		dest := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{wantFM}}
-		st := makeRuneRangeNFA(rr, dest, sharedNullPrinter)
+		startState := &faState{table: makeRuneRangeNFA(rr, dest, sharedNullPrinter)}
 
 		matchers := []*fieldMatcher{}
 		var got []*fieldMatcher
 		for _, rp := range multiLengthTest {
-			got = traverseDFA(st, []byte(string([]rune{rp.Lo})), matchers)
+			got = traverseDFA(startState, []byte(string([]rune{rp.Lo})), matchers)
 			if len(got) != 1 || got[0] != wantFM {
 				t.Errorf("failed on %x", rp.Lo)
 			}
 		}
-		nfaSize(t, st)
+		nfaSize(t, &startState.table)
 	}
 }
 
@@ -384,7 +384,7 @@ func nfaSizeStep(t *testing.T, st *smallTable, s *statsAccum, depth int) {
 	}
 	for _, step := range st.steps {
 		if step != nil {
-			nfaSizeStep(t, step.table, s, depth+1)
+			nfaSizeStep(t, &step.table, s, depth+1)
 		}
 	}
 }
diff --git a/regexp_validity_test.go b/regexp_validity_test.go
index f56b503..0d4adc0 100644
--- a/regexp_validity_test.go
+++ b/regexp_validity_test.go
@@ -54,7 +54,7 @@ func TestEmptyRegexp(t *testing.T) {
 }
 
 func TestToxicStack(t *testing.T) {
-	var table *smallTable
+	var start *faState
 	pp := newPrettyPrinter(34897)
 
 	re3 := "(([~.~~~?~*~+~{~}~[~]~(~)~|]?)*)+"
@@ -66,12 +66,12 @@ func TestToxicStack(t *testing.T) {
 	if err != nil {
 		t.Error("OOPS: " + err.Error())
 	}
-	table, _ = makeRegexpNFA(parse.tree, pp)
-	epsilonClosure(table)
+	start, _ = makeRegexpNFA(parse.tree, pp)
+	epsilonClosure(start)
 
 	var transitions []*fieldMatcher
 	bufs := newNfaBuffers()
-	trans := testTraverseNFA(table, []byte(str), transitions, bufs)
+	trans := testTraverseNFA(start, []byte(str), transitions, bufs)
 	if len(trans) != 1 {
 		t.Error("Toxic stack failure")
 	}
diff --git a/rune_range.go b/rune_range.go
index 3f3ddf1..f85c5e3 100644
--- a/rune_range.go
+++ b/rune_range.go
@@ -15,17 +15,17 @@ import (
 // can safely build and update the cachedRRFaShells
 
 var PlaceholderState *faState = &faState{table: newSmallTable()}
-var cachedFaShells = make(map[string]*smallTable)
+var cachedFaShells = make(map[string]smallTable)
 
-func faFromShell(shell *smallTable, oldNext *faState, newNext *faState) *smallTable {
-	return copyShellNode(&faState{table: shell}, oldNext, newNext, make(map[*faState]*faState)).table
+func faFromShell(shell *smallTable, oldNext *faState, newNext *faState) smallTable {
+	return copyShellNode(&faState{table: *shell}, oldNext, newNext, make(map[*faState]*faState)).table
 }
 func copyShellNode(shell *faState, oldNext *faState, newNext *faState, mem map[*faState]*faState) *faState {
 	already, ok := mem[shell]
 	if ok {
 		return already
 	}
-	table := &smallTable{
+	table := smallTable{
 		ceilings: slices.Clone(shell.table.ceilings),
 		steps:    make([]*faState, len(shell.table.steps)),
 		epsilons: make([]*faState, len(shell.table.epsilons)),
@@ -37,9 +37,9 @@ func copyShellNode(shell *faState, oldNext *faState, newNext *faState, mem map[*
 		case nil:
 			// no-op
 		case oldNext:
-			table.steps[i] = newNext
+			state.table.steps[i] = newNext
 		default:
-			table.steps[i] = copyShellNode(step, oldNext, newNext, mem)
+			state.table.steps[i] = copyShellNode(step, oldNext, newNext, mem)
 		}
 	}
 	for i, epsilon := range shell.table.epsilons {
@@ -47,9 +47,9 @@ func copyShellNode(shell *faState, oldNext *faState, newNext *faState, mem map[*
 		case nil:
 		// no-op
 		case oldNext:
-			table.epsilons[i] = newNext
+			state.table.epsilons[i] = newNext
 		default:
-			table.epsilons[i] = copyShellNode(epsilon, oldNext, newNext, mem)
+			state.table.epsilons[i] = copyShellNode(epsilon, oldNext, newNext, mem)
 		}
 	}
 	return state
@@ -97,17 +97,17 @@ func newRuneRangeIterator(rr RuneRange) (*runeRangeIterator, error) {
 // here's the problem: A construct like [~p{L}~p[Nd}~p{Zs}] is going to be brutally expensive, because
 // it'll have to build the FA to match the combination of all those huge rune-ranges.
 
-func makeAndCacheRuneRangeFA(rr RuneRange, next *faState, name string, pp printer) *smallTable {
+func makeAndCacheRuneRangeFA(rr RuneRange, next *faState, name string, pp printer) smallTable {
 	if name != "" {
 		fa, ok := cachedFaShells[name]
 		if !ok {
 			fa = makeAndCacheRuneRangeFA(rr, PlaceholderState, "", pp)
 			cachedFaShells[name] = fa
 		}
-		return faFromShell(fa, PlaceholderState, next)
+		return faFromShell(&fa, PlaceholderState, next)
 	}
 
-	pp.labelTable(next.table, "Next")
+	pp.labelTable(&next.table, "Next")
 	// turn the slice of hi/lo inclusive endpoints into a slice of utf8 encodings
 	ri, err := newRuneRangeIterator(rr)
 
@@ -125,7 +125,7 @@ func makeAndCacheRuneRangeFA(rr RuneRange, next *faState, name string, pp printe
 	return nfaFromSkinnyRuneTree(root, pp)
 }
 
-func makeRuneRangeNFA(rr RuneRange, next *faState, pp printer) *smallTable {
+func makeRuneRangeNFA(rr RuneRange, next *faState, pp printer) smallTable {
 	return makeAndCacheRuneRangeFA(rr, next, "", pp)
 }
 
@@ -188,10 +188,10 @@ func addSkinnyRuneTreeEntry(root *skinnyRuneTreeNode, r rune, dest *faState) {
 		node = nextEntry.node
 	}
 }
-func nfaFromSkinnyRuneTree(root *skinnyRuneTreeNode, pp printer) *smallTable {
+func nfaFromSkinnyRuneTree(root *skinnyRuneTreeNode, pp printer) smallTable {
 	return tableFromSkinnyRuneTreeNode(root, pp)
 }
-func tableFromSkinnyRuneTreeNode(node *skinnyRuneTreeNode, pp printer) *smallTable {
+func tableFromSkinnyRuneTreeNode(node *skinnyRuneTreeNode, pp printer) smallTable {
 	var unpacked unpackedTable
 	for index, byteVal := range node.byteVals {
 		entry := node.entries[index]
@@ -199,7 +199,7 @@ func tableFromSkinnyRuneTreeNode(node *skinnyRuneTreeNode, pp printer) *smallTab
 			unpacked[byteVal] = entry.next
 		} else {
 			table := tableFromSkinnyRuneTreeNode(entry.node, pp)
-			pp.labelTable(table, fmt.Sprintf("on %x", byteVal))
+			pp.labelTable(&table, fmt.Sprintf("on %x", byteVal))
 			unpacked[byteVal] = &faState{table: table}
 		}
 	}
diff --git a/rune_range_test.go b/rune_range_test.go
index 36e7b49..27febc3 100644
--- a/rune_range_test.go
+++ b/rune_range_test.go
@@ -12,15 +12,15 @@ func TestSkinnyRuneTree(t *testing.T) {
 	srt := &skinnyRuneTreeNode{}
 	pp := newPrettyPrinter(246758)
 	tt := newSmallTable()
-	pp.labelTable(tt, "Next")
+	pp.labelTable(&tt, "Next")
 	dest := &faState{table: tt, fieldTransitions: []*fieldMatcher{{}}}
 	addSkinnyRuneTreeEntry(srt, r, dest)
 	addSkinnyRuneTreeEntry(srt, r+1, dest)
 	addSkinnyRuneTreeEntry(srt, r+3, dest)
-	fa := nfaFromSkinnyRuneTree(srt, pp)
-	fmt.Println("FA:\n" + pp.printNFA(fa))
+	startState := &faState{table: nfaFromSkinnyRuneTree(srt, pp)}
+	fmt.Println("FA:\n" + pp.printNFA(&startState.table))
 	trans := []*fieldMatcher{}
-	matches := traverseDFA(fa, utf8, trans)
+	matches := traverseDFA(startState, utf8, trans)
 	if len(matches) != 1 {
 		t.Error("MISSED")
 	}
diff --git a/shell_style.go b/shell_style.go
index da50454..b737744 100644
--- a/shell_style.go
+++ b/shell_style.go
@@ -44,10 +44,10 @@ func readShellStyleSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []type
 
 // makeShellStyleFA does what it says.  It is precisely equivalent to a regex with the only operator
 // being a single ".*". Once we've implemented regular expressions we can use that to more or less eliminate this
-func makeShellStyleFA(val []byte, pp printer) (start *smallTable, nextField *fieldMatcher) {
+func makeShellStyleFA(val []byte, pp printer) (start *faState, nextField *fieldMatcher) {
 	state := &faState{table: newSmallTable()}
-	start = state.table
-	pp.labelTable(start, "SHELLSTYLE")
+	start = state
+	pp.labelTable(&start.table, "SHELLSTYLE")
 	nextField = newFieldMatcher()
 
 	// for each byte in the pattern
@@ -63,19 +63,19 @@ func makeShellStyleFA(val []byte, pp printer) (start *smallTable, nextField *fie
 			spinEscape.table.epsilons = []*faState{spinner}
 			spinner.table = makeByteDotFA(spinner, pp)
 			spinner.table.addByteStep(val[valIndex], spinEscape)
-			pp.labelTable(spinner.table, "*-Spinner")
-			pp.labelTable(spinEscape.table, fmt.Sprintf("spinEscape on %c at %d", val[valIndex], valIndex))
+			pp.labelTable(&spinner.table, "*-Spinner")
+			pp.labelTable(&spinEscape.table, fmt.Sprintf("spinEscape on %c at %d", val[valIndex], valIndex))
 			state = spinEscape
 		} else {
 			nextStep := &faState{table: newSmallTable()}
-			pp.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex))
+			pp.labelTable(&nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex))
 			state.table.addByteStep(ch, nextStep)
 			state = nextStep
 		}
 		valIndex++
 	}
 	lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-	pp.labelTable(lastStep.table, fmt.Sprintf("last step at %d", valIndex))
+	pp.labelTable(&lastStep.table, fmt.Sprintf("last step at %d", valIndex))
 	state.table.addByteStep(valueTerminator, lastStep)
 	return
 }
diff --git a/shell_style_test.go b/shell_style_test.go
index 6f9d573..aa905b8 100644
--- a/shell_style_test.go
+++ b/shell_style_test.go
@@ -62,7 +62,7 @@ func TestMakeShellStyleFA(t *testing.T) {
 		a, wanted := makeShellStyleFA([]byte(pattern), sharedNullPrinter)
 		epsilonClosure(a)
 		vm := newValueMatcher()
-		vmf := vmFields{startTable: a}
+		vmf := vmFields{startState: a}
 		vm.update(&vmf)
 		bufs := newNfaBuffers()
 		for _, should := range shouldsForPatterns[i] {
diff --git a/small_table.go b/small_table.go
index 5b80da0..ec7fb1e 100644
--- a/small_table.go
+++ b/small_table.go
@@ -45,8 +45,8 @@ type smallTable struct {
 
 // newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at
 // the end, which smallTable.step totally depends on.
-func newSmallTable() *smallTable {
-	return &smallTable{
+func newSmallTable() smallTable {
+	return smallTable{
 		ceilings: []byte{byte(byteCeiling)},
 		steps:    []*faState{nil},
 	}
@@ -105,7 +105,7 @@ func (t *smallTable) dStep(utf8Byte byte) *faState {
 // value, and then a few other values with their indexes and values specified in the other two arguments. The
 // goal is to reduce memory churn
 // constraint: positions must be provided in order
-func makeSmallTable(defaultStep *faState, indices []byte, steps []*faState) *smallTable {
+func makeSmallTable(defaultStep *faState, indices []byte, steps []*faState) smallTable {
 	t := smallTable{
 		ceilings: make([]byte, 0, len(indices)+2),
 		steps:    make([]*faState, 0, len(indices)+2),
@@ -125,7 +125,7 @@ func makeSmallTable(defaultStep *faState, indices []byte, steps []*faState) *sma
 		t.ceilings = append(t.ceilings, byte(byteCeiling))
 		t.steps = append(t.steps, defaultStep)
 	}
-	return &t
+	return t
 }
 
 // For manipulating larger-scale machines, the performance starts to be dominated by
diff --git a/small_table_test.go b/small_table_test.go
index 16db156..4cab429 100644
--- a/small_table_test.go
+++ b/small_table_test.go
@@ -81,7 +81,7 @@ func TestSmallTableIterator(t *testing.T) {
 	for i, byteval := range bytevals {
 		wanted[byteval] = steps[i]
 	}
-	iter := newSTIterator(st, nil)
+	iter := newSTIterator(&st, nil)
 	for iter.hasNext() {
 		utf8byte, step := iter.next()
 		if wanted[utf8byte] != step {
@@ -96,8 +96,8 @@ func TestSmallTableIterator(t *testing.T) {
 			t.Errorf("at u=%x wanted %p got %p", i, wanted[i], state)
 		}
 	}
-	unpacked := unpackTable(st)
-	iter = newSTIterator(st, &iter)
+	unpacked := unpackTable(&st)
+	iter = newSTIterator(&st, &iter)
 	for iter.hasNext() {
 		utf8byte, step := iter.next()
 		if unpacked[utf8byte] != step {
@@ -115,15 +115,15 @@ func TestSmallTableIterator(t *testing.T) {
 	for i, byteval := range bytevals {
 		wanted[byteval] = steps[i]
 	}
-	iter = newSTIterator(st, &iter)
+	iter = newSTIterator(&st, &iter)
 	for iter.hasNext() {
 		utf8byte, step := iter.next()
 		if wanted[utf8byte] != step {
 			t.Errorf("at u=%x wanted %p got %p", utf8byte, wanted[utf8byte], step)
 		}
 	}
-	unpacked = unpackTable(st)
-	iter = newSTIterator(st, &iter)
+	unpacked = unpackTable(&st)
+	iter = newSTIterator(&st, &iter)
 	for iter.hasNext() {
 		utf8byte, step := iter.next()
 		if unpacked[utf8byte] != step {
diff --git a/stats.go b/stats.go
index ae5017e..cbaf20a 100644
--- a/stats.go
+++ b/stats.go
@@ -152,8 +152,8 @@ func vmStats(m *valueMatcher, s *statsAccum) {
 		s.siCount++
 		fmStats(state.singletonTransition, s)
 	}
-	if state.startTable != nil {
-		faStats(state.startTable, s)
+	if state.startState != nil {
+		faStats(&state.startState.table, s)
 	}
 }
 
@@ -180,10 +180,10 @@ func faStats(t *smallTable, s *statsAccum) {
 	}
 	for _, next := range t.steps {
 		if next != nil {
-			faStats(next.table, s)
+			faStats(&next.table, s)
 		}
 	}
 	for _, epsilon := range t.epsilons {
-		faStats(epsilon.table, s)
+		faStats(&epsilon.table, s)
 	}
 }
diff --git a/value_matcher.go b/value_matcher.go
index 96e45e6..1010e15 100644
--- a/value_matcher.go
+++ b/value_matcher.go
@@ -23,7 +23,7 @@ type valueMatcher struct {
 	updateable atomic.Pointer[vmFields]
 }
 type vmFields struct {
-	startTable          *smallTable
+	startState          *faState
 	singletonMatch      []byte
 	singletonTransition *fieldMatcher
 	hasNumbers          bool
@@ -66,23 +66,23 @@ func (m *valueMatcher) transitionOn(eventField *Field, bufs *nfaBuffers) []*fiel
 		}
 		return transitions
 
-	case vmFields.startTable != nil:
+	case vmFields.startState != nil:
 		// if there is a potential for a numeric match, try making a Q number from the event
 		if vmFields.hasNumbers && eventField.IsNumber {
 			qNum, err := qNumFromBytesBuf(val, &bufs.qNumBuf)
 			if err == nil {
 				if vmFields.isNondeterministic {
-					return traverseNFA(vmFields.startTable, qNum, transitions, bufs)
+					return traverseNFA(vmFields.startState, qNum, transitions, bufs)
 				}
-				return traverseDFA(vmFields.startTable, qNum, transitions)
+				return traverseDFA(vmFields.startState, qNum, transitions)
 			}
 		}
 
 		// if it doesn't work as a Q number for some reason, go ahead and compare the string values
 		if vmFields.isNondeterministic {
-			return traverseNFA(vmFields.startTable, val, transitions, bufs)
+			return traverseNFA(vmFields.startState, val, transitions, bufs)
 		}
-		return traverseDFA(vmFields.startTable, val, transitions)
+		return traverseDFA(vmFields.startState, val, transitions)
 
 	default:
 		// no FA, no singleton, nothing to do, this probably can't happen because a flattener
@@ -97,7 +97,7 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 	var err error
 
 	// special case - virgin state and this is a string match
-	if fields.startTable == nil && fields.singletonMatch == nil && (val.vType == stringType || val.vType == literalType) {
+	if fields.startState == nil && fields.singletonMatch == nil && (val.vType == stringType || val.vType == literalType) {
 		fields.singletonMatch = valBytes
 		fields.singletonTransition = newFieldMatcher()
 		m.update(fields)
@@ -114,38 +114,46 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 	// no dodges, we have to build an automaton to match this value
 	var nextField *fieldMatcher
 
-	var newFA *smallTable
+	// newFAState holds the start state of the newly-built automaton.
+	// makeStringFA and makePrefixFA return a smallTable value, which is wrapped
+	// in an faState here; the other builders (anything-but, shellstyle,
+	// wildcard, monocase, regexp) return *faState directly.
+	var newFAState *faState
 	switch val.vType {
 	case stringType, literalType:
-		newFA, nextField = makeStringFA(valBytes, nil, false)
+		t, fm := makeStringFA(valBytes, nil, false)
+		newFAState, nextField = &faState{table: t}, fm
 	case numberType:
-		newFA, nextField = makeStringFA(valBytes, nil, true)
+		t, fm := makeStringFA(valBytes, nil, true)
+		newFAState, nextField = &faState{table: t}, fm
 		fields.hasNumbers = true
 	case anythingButType:
-		newFA, nextField = makeMultiAnythingButFA(val.list)
+		newFAState, nextField = makeMultiAnythingButFA(val.list)
 	case shellStyleType:
-		newFA, nextField = makeShellStyleFA(valBytes, printer)
+		newFAState, nextField = makeShellStyleFA(valBytes, printer)
 		fields.isNondeterministic = true
 	case wildcardType:
-		newFA, nextField = makeWildCardFA(valBytes, printer)
+		newFAState, nextField = makeWildCardFA(valBytes, printer)
 		fields.isNondeterministic = true
 	case prefixType:
-		newFA, nextField = makePrefixFA(valBytes)
+		t, fm := makePrefixFA(valBytes)
+		newFAState, nextField = &faState{table: t}, fm
 	case monocaseType:
-		newFA, nextField = makeMonocaseFA(valBytes, printer)
+		newFAState, nextField = makeMonocaseFA(valBytes, printer)
 	case regexpType:
-		newFA, nextField = makeRegexpNFA(val.parsedRegexp, sharedNullPrinter)
-		if newFA.isNondeterministic() {
+		newFAState, nextField = makeRegexpNFA(val.parsedRegexp, sharedNullPrinter)
+		if newFAState.table.isNondeterministic() {
 			fields.isNondeterministic = true
 		}
-		printer.labelTable(newFA, "RX start")
+		printer.labelTable(&newFAState.table, "RX start")
 	default:
 		panic("unknown value type")
 	}
 
 	// there's already a table, thus an out-degree > 1
-	if fields.startTable != nil {
-		fields.startTable = mergeFAs(fields.startTable, newFA, printer)
+	if fields.startState != nil {
+		mergedTable := mergeFAs(&fields.startState.table, &newFAState.table, printer)
+		fields.startState = &faState{table: mergedTable}
 		if err != nil {
 			return nil
 		}
@@ -156,7 +164,7 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 		// 	if (bytesAllocated() - mm.baseAlloc) > mm.headroom {
 
 		if fields.isNondeterministic {
-			epsilonClosure(fields.startTable)
+			epsilonClosure(fields.startState)
 		}
 
 		m.update(fields)
@@ -167,35 +175,36 @@ func (m *valueMatcher) addTransition(val typedVal, printer printer) *fieldMatche
 	if fields.singletonMatch != nil {
 		// singleton is here, we don't match, so our outdegree becomes 2, so we have
 		// to build an automaton with two values in it.
-		singletonAutomaton, _ := makeStringFA(fields.singletonMatch, fields.singletonTransition, false)
+		singletonTable, _ := makeStringFA(fields.singletonMatch, fields.singletonTransition, false)
 
 		// now table is ready for use, nuke singleton to signal threads to use it
-		fields.startTable = mergeFAs(singletonAutomaton, newFA, sharedNullPrinter)
+		mergedTable := mergeFAs(&singletonTable, &newFAState.table, sharedNullPrinter)
+		fields.startState = &faState{table: mergedTable}
 		if err != nil {
 			return nil
 		}
 		if fields.isNondeterministic {
-			epsilonClosure(fields.startTable)
+			epsilonClosure(fields.startState)
 		}
 		fields.singletonMatch = nil
 		fields.singletonTransition = nil
 	} else {
 		// empty valueMatcher, no special cases, just jam in the new FA
-		fields.startTable = newFA
+		fields.startState = newFAState
 		if fields.isNondeterministic {
-			epsilonClosure(fields.startTable)
+			epsilonClosure(fields.startState)
 		}
 	}
 	m.update(fields)
 	return nextField
 }
 
-func makePrefixFA(val []byte) (*smallTable, *fieldMatcher) {
+func makePrefixFA(val []byte) (smallTable, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	return makeOnePrefixFAStep(val, 0, nextField), nextField
 }
 
-func makeOnePrefixFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
+func makeOnePrefixFAStep(val []byte, index int, nextField *fieldMatcher) smallTable {
 	// have to stop one short to skip the closing "
 	var nextState *faState
 	if index == len(val)-2 {
@@ -211,7 +220,7 @@ func makeOnePrefixFAStep(val []byte, index int, nextField *fieldMatcher) *smallT
 // is recursive because this allows the use of the makeSmallTable call, which
 // reduces memory churn. Converting from a straightforward implementation to
 // this approximately doubled the fields/second rate in addPattern
-func makeStringFA(val []byte, useThisTransition *fieldMatcher, isNumber bool) (*smallTable, *fieldMatcher) {
+func makeStringFA(val []byte, useThisTransition *fieldMatcher, isNumber bool) (smallTable, *fieldMatcher) {
 	var nextField *fieldMatcher
 	if useThisTransition != nil {
 		nextField = useThisTransition
@@ -227,7 +236,7 @@ func makeStringFA(val []byte, useThisTransition *fieldMatcher, isNumber bool) (*
 		qNum, err := qNumFromBytes(val)
 		if err == nil {
 			numberFA := makeOneStringFAStep(qNum, 0, nextField)
-			stringFA = mergeFAs(stringFA, numberFA, sharedNullPrinter)
+			stringFA = mergeFAs(&stringFA, &numberFA, sharedNullPrinter)
 		}
 	}
 	return stringFA, nextField
@@ -249,22 +258,22 @@ func makeFAFragment(val []byte, endAt *faState, pp printer) *faState {
 	for index := 1; index < len(val); index++ {
 		if index == len(val)-1 {
 			table := makeSmallTable(nil, []byte{val[index]}, []*faState{endAt})
-			pp.labelTable(table, fmt.Sprintf("exiting on %v", val[index]))
+			pp.labelTable(&table, fmt.Sprintf("exiting on %v", val[index]))
 			step.table = table
-			pp.labelTable(step.table, "Last step")
+			pp.labelTable(&step.table, "Last step")
 		} else {
 			nextState := &faState{}
 			table := makeSmallTable(nil, []byte{val[index]}, []*faState{nextState})
-			pp.labelTable(table, fmt.Sprintf("stepping on %c", val[index]))
+			pp.labelTable(&table, fmt.Sprintf("stepping on %c", val[index]))
 			step.table = table
-			pp.labelTable(step.table, "Step")
+			pp.labelTable(&step.table, "Step")
 			step = nextState
 		}
 	}
 	return firstStep
 }
 
-func makeOneStringFAStep(val []byte, index int, nextField *fieldMatcher) *smallTable {
+func makeOneStringFAStep(val []byte, index int, nextField *fieldMatcher) smallTable {
 	var nextStep *faState
 	if index == len(val)-1 {
 		lastStep := &faState{
diff --git a/value_matcher_test.go b/value_matcher_test.go
index 6c61e06..442715d 100644
--- a/value_matcher_test.go
+++ b/value_matcher_test.go
@@ -348,9 +348,8 @@ func TestMakeFAFragment(t *testing.T) {
 	pp := newPrettyPrinter(3234)
 	for _, datum := range data {
 		frag := makeFAFragment([]byte(datum), targetState, pp)
-		startTable := frag.table
 		var transIn []*fieldMatcher
-		transOut := traverseDFA(startTable, []byte(datum)[1:], transIn)
+		transOut := traverseDFA(frag, []byte(datum)[1:], transIn)
 		if len(transOut) != 1 || transOut[0] != targetFA {
 			t.Error("fail on ", datum)
 		}
@@ -462,8 +461,8 @@ func TestEpsilonClosureAfterMerge(t *testing.T) {
 	}
 
 	// Walk the automaton and verify all states have epsilon closures computed
-	visited := make(map[*smallTable]bool)
-	missingClosures := checkEpsilonClosures(fields.startTable, visited)
+	visited := make(map[*faState]bool)
+	missingClosures := checkEpsilonClosures(fields.startState, visited)
 	if len(missingClosures) > 0 {
 		t.Errorf("found %d states with missing epsilon closures", len(missingClosures))
 	}
@@ -489,26 +488,26 @@ func TestEpsilonClosureAfterMerge(t *testing.T) {
 
 // checkEpsilonClosures walks the automaton and returns states that have
 // epsilon transitions but no computed epsilon closure.
-func checkEpsilonClosures(table *smallTable, visited map[*smallTable]bool) []*faState {
+func checkEpsilonClosures(start *faState, visited map[*faState]bool) []*faState {
 	var missing []*faState
-	if visited[table] {
+	if visited[start] {
 		return missing
 	}
-	visited[table] = true
+	visited[start] = true
 
-	for _, state := range table.steps {
+	for _, state := range start.table.steps {
 		if state != nil {
 			if len(state.table.epsilons) > 0 && state.epsilonClosure == nil {
 				missing = append(missing, state)
 			}
-			missing = append(missing, checkEpsilonClosures(state.table, visited)...)
+			missing = append(missing, checkEpsilonClosures(state, visited)...)
 		}
 	}
-	for _, eps := range table.epsilons {
+	for _, eps := range start.table.epsilons {
 		if eps.epsilonClosure == nil {
 			missing = append(missing, eps)
 		}
-		missing = append(missing, checkEpsilonClosures(eps.table, visited)...)
+		missing = append(missing, checkEpsilonClosures(eps, visited)...)
 	}
 	return missing
 }
@@ -547,7 +546,7 @@ func TestEpsilonClosureRequired(t *testing.T) {
 
 	// Step 2: Clear all epsilon closures to simulate missing epsilonClosure call
 	fields := vm.fields()
-	clearEpsilonClosures(fields.startTable, make(map[*smallTable]bool))
+	clearEpsilonClosures(fields.startState, make(map[*faState]bool))
 
 	// Step 3: Without closures, traverseNFA fails because it iterates over
 	// state.epsilonClosure which is now nil (empty loop = no matches)
@@ -563,7 +562,7 @@ func TestEpsilonClosureRequired(t *testing.T) {
 	}
 
 	// Step 4: Restore closures and verify matching works again
-	epsilonClosure(fields.startTable)
+	epsilonClosure(fields.startState)
 
 	trans = testTransitionOn(vm, []byte("abc"), bufs)
 	if len(trans) != 1 {
@@ -576,20 +575,19 @@ func TestEpsilonClosureRequired(t *testing.T) {
 }
 
 // clearEpsilonClosures walks the automaton and sets all epsilonClosure fields to nil
-func clearEpsilonClosures(table *smallTable, visited map[*smallTable]bool) {
-	if visited[table] {
+func clearEpsilonClosures(start *faState, visited map[*faState]bool) {
+	if visited[start] {
 		return
 	}
-	visited[table] = true
+	visited[start] = true
+	start.epsilonClosure = nil
 
-	for _, state := range table.steps {
+	for _, state := range start.table.steps {
 		if state != nil {
-			state.epsilonClosure = nil
-			clearEpsilonClosures(state.table, visited)
+			clearEpsilonClosures(state, visited)
 		}
 	}
-	for _, eps := range table.epsilons {
-		eps.epsilonClosure = nil
-		clearEpsilonClosures(eps.table, visited)
+	for _, eps := range start.table.epsilons {
+		clearEpsilonClosures(eps, visited)
 	}
 }
diff --git a/wildcard.go b/wildcard.go
index 837c903..b25015d 100644
--- a/wildcard.go
+++ b/wildcard.go
@@ -74,10 +74,10 @@ func readWildcardSpecial(pb *patternBuild, valsIn []typedVal) ([]typedVal, error
 
 // makeWildcardFA is a replacement for shellstyle patterns, the only difference being that escaping is
 // provided for * and \.
-func makeWildCardFA(val []byte, pp printer) (start *smallTable, nextField *fieldMatcher) {
+func makeWildCardFA(val []byte, pp printer) (start *faState, nextField *fieldMatcher) {
 	state := &faState{table: newSmallTable()}
-	start = state.table
-	pp.labelTable(start, "WILDCARD")
+	start = state
+	pp.labelTable(&start.table, "WILDCARD")
 	nextField = newFieldMatcher()
 
 	// for each byte in the pattern. \-escape processing is simplified because illegal constructs such as \a and \
@@ -99,19 +99,19 @@ func makeWildCardFA(val []byte, pp printer) (start *smallTable, nextField *field
 			spinEscape.table.epsilons = []*faState{spinner}
 			spinner.table = makeByteDotFA(spinner, pp)
 			spinner.table.addByteStep(val[valIndex], spinEscape)
-			pp.labelTable(spinner.table, "*-Spinner")
-			pp.labelTable(spinEscape.table, fmt.Sprintf("spinEscape on %c at %d", val[valIndex], valIndex))
+			pp.labelTable(&spinner.table, "*-Spinner")
+			pp.labelTable(&spinEscape.table, fmt.Sprintf("spinEscape on %c at %d", val[valIndex], valIndex))
 			state = spinEscape
 		} else {
 			nextStep := &faState{table: newSmallTable()}
-			pp.labelTable(nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex))
+			pp.labelTable(&nextStep.table, fmt.Sprintf("on %c at %d", val[valIndex], valIndex))
 			state.table.addByteStep(ch, nextStep)
 			state = nextStep
 		}
 		valIndex++
 	}
 	lastStep := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
-	pp.labelTable(lastStep.table, fmt.Sprintf("last step at %d", valIndex))
+	pp.labelTable(&lastStep.table, fmt.Sprintf("last step at %d", valIndex))
 	state.table.addByteStep(valueTerminator, lastStep)
 	return
 }

From b8d21493842dc9dadb52c5433cf1363c065a7f35 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sat, 30 May 2026 14:19:23 -0700
Subject: [PATCH 4/6] tests: recalibrate size assertions after embedding
 smallTable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-state size shrank (faState now 128B with inline smallTable). Update
hand-calibrated constants in TestMcBasicSizes, TestQuaminaMemoryCost,
TestMcNfaSizes. Update TestPP s/t numbers and restore trailing whitespace
in wanted literal (producer still emits "[s/t X/Y] \n" with the trailing
space before the newline).

TestTablePointerDedup's tableSharing and totalEntries expectations also
needed updating — the dedup metric now uses slice-backing identity
(via tableShareKey from earlier commits) rather than *smallTable pointer
identity, so value-copies of a source smallTable register as shared.
Recalibrated to the new ground-truth values.

Minor cleanups: stale "startTable" reference in TestQuaminaMemoryCostSingleton
comment (renamed to startState); inline a dead local in copyShellNode.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 memory_cost_test.go   | 18 ++++++++++--------
 nfa_test.go           |  8 ++++----
 prettyprinter_test.go | 13 ++++++-------
 rune_range.go         |  5 ++---
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/memory_cost_test.go b/memory_cost_test.go
index 7a6f7f7..00dbdf3 100644
--- a/memory_cost_test.go
+++ b/memory_cost_test.go
@@ -16,8 +16,10 @@ func TestMcBasicSizes(t *testing.T) {
 	}
 	stateBase := int64(unsafe.Sizeof(faState{}))
 	state := faState{table: table}
-	// want base + tableActual
-	want = stateBase + tableGot
+	// faState embeds smallTable, so stateBase already covers the smallTable struct.
+	// Add only the slice-backing bytes (1 ceiling byte + 1 step pointer), not tableGot
+	// (which includes mcSmallTableBase again and would double-count the struct overhead).
+	want = stateBase + 1 + mcPointer
 	stateGot := mcFaState(&state)
 	if stateGot != want {
 		t.Errorf("State wanted %d got %d", want, stateGot)
@@ -31,7 +33,7 @@ func TestQuaminaMemoryCost(t *testing.T) {
 		t.Error(err)
 	}
 	bytes := q.GetMatcherStats()["bytes"]
-	if bytes != 1321 {
+	if bytes != 1289 {
 		t.Error("WRONG NUMBERS")
 	}
 	err = q.AddPattern("x", `{"y":[{"wildcard": "*y"}]}}`)
@@ -39,15 +41,15 @@ func TestQuaminaMemoryCost(t *testing.T) {
 		t.Error(err)
 	}
 	bytes = q.GetMatcherStats()["bytes"]
-	if bytes != 2*1321 {
+	if bytes != 2*1289 {
 		t.Error("WRONG NUMBERS")
 	}
 }
 
 // Regression: GetMatcherStats panicked when a valueMatcher used the
-// singleton-match optimization (singletonMatch set, startTable nil),
+// singleton-match optimization (singletonMatch set, startState nil),
 // e.g. boolean-valued patterns. cmFieldMatcherStats now skips the nil
-// startTable rather than building a faState with state.table == nil.
+// startState rather than building a faState with state.table == nil.
 func TestQuaminaMemoryCostSingleton(t *testing.T) {
 	q, _ := New()
 	if err := q.AddPattern("p", `{"Animated": [false]}`); err != nil {
@@ -70,8 +72,8 @@ func TestMcNfaSizes(t *testing.T) {
 		seenStates: make(map[*faState]bool),
 	}
 	cmStateStats(fa1, stats, pp)
-	wantedBytes := int64(1321) // laboriously hand-calculated
-	wantedFanout := int64(5)
+	wantedBytes := int64(1289) // recalibrated after embedding smallTable in faState
+	wantedFanout := int64(6)
 	wantedMaxFanout := int64(2)
 	if stats.bytes != wantedBytes {
 		t.Errorf("Wanted %d bytes, got %d", wantedBytes, stats.bytes)
diff --git a/nfa_test.go b/nfa_test.go
index 95a3618..9138097 100644
--- a/nfa_test.go
+++ b/nfa_test.go
@@ -505,9 +505,9 @@ var dedupWorkloads = []dedupWorkload{
 			"([xyz]?)*end", "(([mno]?)*)+", "([pqr]+)*s",
 		},
 		stateCount:   1101,
-		totalEntries: 4371,
+		totalEntries: 4369,
 		maxMax:       20,
-		tableSharing: 11,
+		tableSharing: 981,
 		matches:      []int{3, 2, 7},
 	},
 	{
@@ -520,7 +520,7 @@ var dedupWorkloads = []dedupWorkload{
 			"(([op]?)*)+", "([qr]+)*t", "(e*)*f", "(g*)*h",
 		},
 		stateCount:   149,
-		totalEntries: 261,
+		totalEntries: 254,
 		maxMax:       50,
 		tableSharing: 39,
 		matches:      []int{0, 0, 0},
@@ -568,7 +568,7 @@ var dedupWorkloads = []dedupWorkload{
 		stateCount:   837,
 		totalEntries: 3410,
 		maxMax:       30,
-		tableSharing: 16,
+		tableSharing: 744,
 		matches:      []int{10, 10, 10},
 	},
 }
diff --git a/prettyprinter_test.go b/prettyprinter_test.go
index 75a989e..6dc0cd8 100644
--- a/prettyprinter_test.go
+++ b/prettyprinter_test.go
@@ -8,13 +8,12 @@ func TestPP(t *testing.T) {
 	pp := newPrettyPrinter(1)
 	state, _ := makeShellStyleFA([]byte(`"x*9"`), pp)
 	pp.labelTable(&state.table, "START HERE")
-	wanted := ` 884[START HERE] '22/"' → (914[on " at 0][s/t 216/280]
- 914[on " at 0] '78/x' → (384[*-Spinner][s/t 216/280]
- 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 216/280]
- 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/"' → (769[on " at 4][s/t 224/288]
- 769[on " at 4] 'f5/ℵ' → (301[last step at 5][s/t 216/280]
- 301[last step at 5]  [1 transition(s)][s/t 81/153]
-`
+	wanted := " 884[START HERE] '22/\"' → (914[on \" at 0][s/t 216/272] \n" +
+		" 914[on \" at 0] '78/x' → (384[*-Spinner][s/t 216/272] \n" +
+		" 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 216/272] \n" +
+		" 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/\"' → (769[on \" at 4][s/t 224/280] \n" +
+		" 769[on \" at 4] 'f5/ℵ' → (301[last step at 5][s/t 216/272] \n" +
+		" 301[last step at 5]  [1 transition(s)][s/t 81/145] \n"
 	s := pp.printNFA(&state.table)
 	if s != wanted {
 		t.Errorf("LONG: wanted\n<%s>\ngot\n<%s>\n", wanted, s)
diff --git a/rune_range.go b/rune_range.go
index f85c5e3..7f3d9b5 100644
--- a/rune_range.go
+++ b/rune_range.go
@@ -25,12 +25,11 @@ func copyShellNode(shell *faState, oldNext *faState, newNext *faState, mem map[*
 	if ok {
 		return already
 	}
-	table := smallTable{
+	state := &faState{table: smallTable{
 		ceilings: slices.Clone(shell.table.ceilings),
 		steps:    make([]*faState, len(shell.table.steps)),
 		epsilons: make([]*faState, len(shell.table.epsilons)),
-	}
-	state := &faState{table: table}
+	}}
 	mem[shell] = state
 	for i, step := range shell.table.steps {
 		switch step {

From 69415cee78dd83b7d4d119b15dadcb73385c9753 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sun, 31 May 2026 11:12:12 -0700
Subject: [PATCH 5/6] epsi_closure: pool buffers, restore two-counter dedup

Port of the build-context-extract fix (6947edf) to the embedded-smallTable
design. embed-smalltable inherited the same two regressions from the shared
32dc2a9 ancestor; shellstyle build at 1000 words was ~7x slower than main.

1. closureForNfa's walk dedup was broken. The refactor collapsed two
   independent counters into a single bufs.gen that closureForState mutates,
   so the walk's visited check never matched after the first state and the
   heavily-shared shellstyle graph got re-traversed (O(V*E)). Restored via
   bufs.walkGen, a snapshot closureForState never touches.

2. Per-call allocation. epsilonClosure allocated fresh maps per call and
   tableMarkOf heap-allocated a *tableMark per share group. closureBuffers
   are now pooled (sync.Pool, GC-reclaimable so no steady-state cost), maps
   are reused via a monotonic generation (no clearing), and tableMark is
   stored by value.

Unlike the *smallTable-keyed bce branch, the walk here dedups by *faState
identity rather than tableShareKey. Share-key dedup is unsafe for the walk:
distinct states can share a steps backing array yet have different epsilons,
and the zero key collapses all no-byte tables. The faState pointer is the
natural unique identity now that smallTable is embedded by value. The
post-pass table-pointer dedup keeps tableShareKey (collision-safe there, as
it re-checks sameFieldTransitions) and now skips the zero key explicitly.

Shellstyle build at 1000 words: 1363ms -> 473ms. Full suite passes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 epsi_closure.go | 104 +++++++++++++++++++++++++++++-------------------
 1 file changed, 63 insertions(+), 41 deletions(-)

diff --git a/epsi_closure.go b/epsi_closure.go
index 196deda..a16dc85 100644
--- a/epsi_closure.go
+++ b/epsi_closure.go
@@ -1,62 +1,77 @@
 package quamina
 
-// tableMark carries the per-smallTable scratch used only during epsilon
-// closure computation (lastVisitedGen for NFA walk dedup, and closureGen /
-// closureRep for table-pointer dedup). These used to live as fields on
-// smallTable itself, but they are purely build-time state and their
-// permanent presence on every smallTable was wasted steady-state memory.
-// They now live in a per-call side table that is discarded when
-// epsilonClosure returns.
+import "sync"
+
+// tableMark carries the per-table-share-group scratch used by the closure
+// post-pass that collapses states sharing a smallTable. It used to live as
+// fields on smallTable itself, but that is purely build-time state whose
+// permanent presence was wasted steady-state memory; it now lives in a
+// pooled side table (closureBuffers.tables).
+//
+// tableMark is stored by value so marking a share group costs no per-entry
+// heap allocation.
 type tableMark struct {
-	lastVisitedGen uint32
-	closureGen     uint32
-	closureRep     *faState
+	closureGen uint64
+	closureRep *faState
 }
 
-// closureBuffers carries per-epsilonClosure-call scratch. The two maps
-// replace build-time fields that used to sit on smallTable/faState;
-// they live only for the duration of the closure computation.
+// closureBuffers carries the scratch for epsilon closure computation. It is
+// pooled (see closureBufferPool) and reused across epsilonClosure calls, so
+// the maps are allocated once and grown, not rebuilt per call. Visited
+// tracking is generation-based: gen only ever increases, so stale map
+// entries from a previous use are simply older than the current generation
+// and need no clearing.
 type closureBuffers struct {
-	gen           uint32
-	closureSetGen uint32
-	closureList   []*faState
-	tables        map[tableShareKey]*tableMark
-	states        map[*faState]uint32
+	gen           uint64                      // monotonic counter; bumped by closureForState's two dedup phases
+	walkGen       uint64                      // snapshot of gen for the current closureForNfa walk (NFA state dedup)
+	closureSetGen uint64                      // snapshot of gen for the current closureForState faState dedup
+	closureList   []*faState                  // reusable accumulator for the state list before the dedup post-pass
+	tables        map[tableShareKey]tableMark // share-group scratch for the post-pass (closureGen, closureRep)
+	states        map[*faState]uint64         // per-faState last-visited gen, used by traverseEpsilons
+	walkVisited   map[*faState]uint64         // per-faState last-walked gen, used by closureForNfa
 }
 
 func newClosureBuffers() *closureBuffers {
 	return &closureBuffers{
-		gen:    1,
-		tables: make(map[tableShareKey]*tableMark),
-		states: make(map[*faState]uint32),
+		tables:      make(map[tableShareKey]tableMark),
+		states:      make(map[*faState]uint64),
+		walkVisited: make(map[*faState]uint64),
 	}
 }
 
-// tableMarkOf returns the tableMark for t, creating one on first access.
-func (b *closureBuffers) tableMarkOf(t *smallTable) *tableMark {
-	key := newTableShareKey(t)
-	m, ok := b.tables[key]
-	if !ok {
-		m = &tableMark{}
-		b.tables[key] = m
-	}
-	return m
+// closureBufferPool reuses closureBuffers (and their maps) across the many
+// epsilonClosure calls a build performs, eliminating per-call map allocation.
+// The pool is concurrency-safe, and sync.Pool drops its contents on GC, so
+// the maps do not become permanent steady-state memory.
+var closureBufferPool = sync.Pool{
+	New: func() any { return newClosureBuffers() },
 }
 
 // epsilonClosure walks the automaton starting from the given state
 // and precomputes the epsilon closure for every reachable faState.
 func epsilonClosure(start *faState) {
-	bufs := newClosureBuffers()
+	bufs := closureBufferPool.Get().(*closureBuffers)
+	// Take a fresh generation for this walk. closureForState bumps bufs.gen
+	// for its own dedup phases, but it never touches walkGen, so the state
+	// dedup in closureForNfa compares against a value that stays fixed for
+	// the whole walk.
+	bufs.gen++
+	bufs.walkGen = bufs.gen
 	closureForState(start, bufs)
 	closureForNfa(start, bufs)
+	closureBufferPool.Put(bufs)
 }
 
+// closureForNfa dedups by faState identity, not table-share key: each state
+// must be walked once. (Share-key dedup is unsafe here — distinct states can
+// share a steps backing array yet have different epsilons, and the zero key
+// collapses all no-byte tables; the post-pass below re-checks fieldTransitions
+// on collision, but the walk has no such guard.)
 func closureForNfa(state *faState, bufs *closureBuffers) {
-	mark := bufs.tableMarkOf(&state.table)
-	if mark.lastVisitedGen == bufs.gen {
+	if bufs.walkVisited[state] == bufs.walkGen {
 		return
 	}
-	mark.lastVisitedGen = bufs.gen
+	bufs.walkVisited[state] = bufs.walkGen
 
 	for _, s := range state.table.steps {
 		if s != nil {
@@ -87,8 +102,8 @@ func closureForState(state *faState, bufs *closureBuffers) {
 		return
 	}
 
-	// Use generation-based visited tracking instead of a fresh map per
-	// traversal. bufs.states records which gen last visited each state.
+	// Generation-based visited tracking: bufs.states records which gen last
+	// visited each state, so we never clear the map between traversals.
 	bufs.gen++
 	bufs.closureSetGen = bufs.gen
 	bufs.closureList = bufs.closureList[:0]
@@ -99,15 +114,21 @@ func closureForState(state *faState, bufs *closureBuffers) {
 	traverseEpsilons(state, state.table.epsilons, bufs)
 
 	// Table-pointer dedup: when multiple states in the closure share the
-	// same *smallTable, their byte transitions are identical, so only one
-	// representative is needed. This is done as a post-pass over the
-	// closure list rather than during traversal to keep traverseEpsilons
-	// zero-overhead. States with different fieldTransitions are preserved.
+	// same smallTable (steps backing array), their byte transitions are
+	// identical, so only one representative is needed. Done as a post-pass
+	// over the closure list to keep traverseEpsilons zero-overhead. The
+	// zero key (no byte transitions) is never deduped, and states with
+	// different fieldTransitions are preserved.
 	bufs.gen++
 	dedupGen := bufs.gen
 	closure := make([]*faState, 0, len(bufs.closureList))
 	for _, s := range bufs.closureList {
-		mark := bufs.tableMarkOf(&s.table)
+		key := newTableShareKey(&s.table)
+		if (key == tableShareKey{}) {
+			closure = append(closure, s)
+			continue
+		}
+		mark := bufs.tables[key]
 		if mark.closureGen == dedupGen {
 			if sameFieldTransitions(mark.closureRep, s) {
 				continue
@@ -115,6 +136,7 @@ func closureForState(state *faState, bufs *closureBuffers) {
 		} else {
 			mark.closureGen = dedupGen
 			mark.closureRep = s
+			bufs.tables[key] = mark
 		}
 		closure = append(closure, s)
 	}

From 324284121451d36b133e7069f19be15febec70cd Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Sun, 31 May 2026 12:08:07 -0700
Subject: [PATCH 6/6] state_lists: dedup intern() via sort+compact, drop the
 seen map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nfa2Dfa/intern is slated to go live. Profiling BenchmarkNfa2Dfa showed it
~64% slower than main, entirely inside intern()'s dedup: clear(sl.seen) plus
a per-state map assign into a map[*faState]struct{} cost ~600ms where main's
faState.closureSetGen generation-counter compare cost ~50ms. That field was
removed to shrink steady-state memory, so the map was the fallback.

intern already sorts the state set by pointer to build a canonical key, so
duplicates are adjacent after the sort. Replacing the map-based dedup with
slices.Compact over the sorted buffer removes the map (and its clear()) with
no per-faState field and no extra sort — sorting was already happening.

Nfa2Dfa vs main (geomean, n=6): time +63.8% -> +2.65%, B/op -1.05% -> -1.65%,
allocs/op -7.8% -> -9.6%. Full suite passes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 state_lists.go | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/state_lists.go b/state_lists.go
index 36f36bb..a4a5521 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -20,15 +20,13 @@ type internEntry struct {
 type stateLists struct {
 	entries map[string]internEntry
 	// Scratch space reused across intern() calls
-	sortBuf []*faState            // reusable sorted buffer
-	keyBuf  []byte                // reusable key bytes buffer
-	seen    map[*faState]struct{} // reusable dedup set, cleared per call
+	sortBuf []*faState // reusable sorted buffer
+	keyBuf  []byte     // reusable key bytes buffer
 }
 
 func newStateLists() *stateLists {
 	return &stateLists{
 		entries: make(map[string]internEntry),
-		seen:    make(map[*faState]struct{}),
 	}
 }
 
@@ -38,23 +36,16 @@ func newStateLists() *stateLists {
 // which either has already been computed for the set or is created and empty, and
 // a boolean indicating whether the DFA state has already been computed or not.
 func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
-	// Dedup within this call using a reused map. Previously this rode on
-	// a generation counter stored inline on each faState; that field has
-	// been removed to shrink steady-state memory.
-	clear(sl.seen)
-	sl.sortBuf = sl.sortBuf[:0]
-	for _, state := range list {
-		if _, ok := sl.seen[state]; ok {
-			continue
-		}
-		sl.seen[state] = struct{}{}
-		sl.sortBuf = append(sl.sortBuf, state)
-	}
-
-	// compute a key representing the set
+	// Dedup by sorting then compacting adjacent duplicates. The set key is
+	// built from sorted pointers anyway, so sorting is not extra work; once
+	// sorted, duplicates are adjacent and Compact removes them in one linear
+	// pass. This avoids both a per-call dedup map and a per-faState
+	// generation field (the latter was removed to shrink steady-state memory).
+	sl.sortBuf = append(sl.sortBuf[:0], list...)
 	slices.SortFunc(sl.sortBuf, func(a, b *faState) int {
 		return cmp.Compare(uintptr(unsafe.Pointer(a)), uintptr(unsafe.Pointer(b)))
 	})
+	sl.sortBuf = slices.Compact(sl.sortBuf)
 
 	// Pre-size the key buffer and write pointers with PutUint64 instead of
 	// appending byte-by-byte, avoiding 8 append calls and bounds checks per state.