timbray · sayrer · May 30, 2026 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/anything_but.go b/anything_but.go
@@ -70,12 +70,12 @@ func readAnythingButSpecial(pb *patternBuild, valsIn []typedVal) (pathVals []typ
 // Making a succession of anything-but automata for each of "a" and "b" and then merging them turns out not
 // to work because what the caller means is really an AND - everything that matches neither "a" nor "b". So
 // in principle we could intersect automata.
-func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
+func makeMultiAnythingButFA(vals [][]byte) (*faState, *fieldMatcher) {
 	nextField := newFieldMatcher()
 	success := &faState{table: newSmallTable(), fieldTransitions: []*fieldMatcher{nextField}}
 
-	ret, _ := makeOneMultiAnythingButStep(vals, 0, success), nextField
-	return ret, nextField
+	startTable := makeOneMultiAnythingButStep(vals, 0, success)
+	return &faState{table: startTable}, nextField
 }
 
 // makeOneMultiAnythingButStep - spookeh. The idea is that there will be N smallTables in this FA, where N is
@@ -84,7 +84,7 @@ func makeMultiAnythingButFA(vals [][]byte) (*smallTable, *fieldMatcher) {
 // yet been exhausted. We notice when we get to the end of each val and put in a valueTerminator transition
 // to a step with no nextField entry, i.e. failure because we've exactly matched one of the anything-but
 // strings.
-func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faState) *smallTable {
+func makeOneMultiAnythingButStep(vals [][]byte, index int, success *faState) smallTable {
 	// this will be the default transition in all the anything-but tables.
 	var u unpackedTable
 	for i := range u {

diff --git a/core_matcher_test.go b/core_matcher_test.go
@@ -330,8 +330,8 @@ func TestSimpleaddPattern(t *testing.T) {
 // which means the finite automata are hidden deep inside the coreMatcher instance
 // and hard to get at.  This helper routine fetches the value-matcher automaton
 // corresponding to the "path" argument
-func fetchFAForPath(t *testing.T, cm *coreMatcher, path string) *smallTable {
+func fetchFAForPath(t *testing.T, cm *coreMatcher, path string) *faState {
 	t.Helper()
 	vm := cm.fields().state.fields().transitions[path]
-	return vm.fields().startTable
+	return vm.fields().startState
 }
diff --git a/dedup_key.go b/dedup_key.go
@@ -0,0 +1,26 @@
+package quamina
+
+import "unsafe"
+
+// tableShareKey is a stable identifier for a smallTable's "share group".
+// Two states whose smallTables hold `steps` slice headers with the same data
+// pointer and length (which is what happens when one smallTable struct value
+// is copied into multiple faStates during construction) produce equal
+// keys. This replaces *smallTable-pointer identity as the dedup key in
+// epsilon-closure computation after smallTable is embedded into faState
+// by value.
+//
+// A zero key (nil pointer, len 0) means "no share group"; it is produced for
+// a table whose steps slice is nil, i.e. one with no byte transitions. Callers
+// must not dedup on it: skip the zero key so unrelated empty tables stay distinct.
+type tableShareKey struct {
+	stepsData unsafe.Pointer
+	stepsLen  int
+}
+
+func newTableShareKey(t *smallTable) tableShareKey {
+	return tableShareKey{
+		stepsData: unsafe.Pointer(unsafe.SliceData(t.steps)),
+		stepsLen:  len(t.steps),
+	}
+}
diff --git a/dedup_key_test.go b/dedup_key_test.go
@@ -0,0 +1,57 @@
+package quamina
+
+import (
+	"testing"
+)
+
+func TestTableShareKey_SharedBackings(t *testing.T) {
+	// Construct one smallTable, value-copy it (simulating post-embed share).
+	src := smallTable{
+		ceilings: []byte{'a', 'b', byte(byteCeiling)},
+		steps:    []*faState{nil, nil, nil},
+	}
+	copy1 := src
+	copy2 := src
+	if newTableShareKey(&copy1) != newTableShareKey(&copy2) {
+		t.Errorf("value-copied tables should share key; got %v vs %v",
+			newTableShareKey(&copy1), newTableShareKey(&copy2))
+	}
+}
+
+func TestTableShareKey_DistinctBackings(t *testing.T) {
+	t1 := smallTable{
+		ceilings: []byte{'a', byte(byteCeiling)},
+		steps:    []*faState{nil, nil},
+	}
+	t2 := smallTable{
+		ceilings: []byte{'a', byte(byteCeiling)},
+		steps:    []*faState{nil, nil},
+	}
+	if newTableShareKey(&t1) == newTableShareKey(&t2) {
+		t.Errorf("independently-built tables should not share key")
+	}
+}
+
+// TestTableShareKey_AppendBreaksShare verifies that when a value-copy
+// is grown via append in a way that reallocates the backing array,
+// the keys diverge. We force reallocation by starting at cap=1 and
+// appending many entries (the changed length alone also alters the key).
+func TestTableShareKey_AppendBreaksShare(t *testing.T) {
+	src := smallTable{
+		ceilings: make([]byte, 0, 1),
+		steps:    make([]*faState, 0, 1),
+	}
+	src.ceilings = append(src.ceilings, byte(byteCeiling))
+	src.steps = append(src.steps, nil)
+	copy1 := src
+	// Appending 8 entries to a slice with cap=1 guarantees at least one
+	// realloc of the steps backing.
+	for i := 0; i < 8; i++ {
+		copy1.steps = append(copy1.steps, nil)
+		copy1.ceilings = append(copy1.ceilings, byte(i))
+	}
+	if newTableShareKey(&src) == newTableShareKey(&copy1) {
+		t.Errorf("expected keys to diverge after append-with-realloc; got equal: %v",
+			newTableShareKey(&src))
+	}
+}
diff --git a/epsi_closure.go b/epsi_closure.go
@@ -1,70 +1,87 @@
 package quamina
 
-// tableMark carries the per-smallTable scratch used only during epsilon
-// closure computation (lastVisitedGen for NFA walk dedup, and closureGen /
-// closureRep for table-pointer dedup). These used to live as fields on
-// smallTable itself, but they are purely build-time state and their
-// permanent presence on every smallTable was wasted steady-state memory.
-// They now live in a per-call side table that is discarded when
-// epsilonClosure returns.
+import "sync"
+
+// tableMark carries the per-table-share-group scratch used by the closure
+// post-pass that collapses states sharing a smallTable. It used to live as
+// fields on smallTable itself, but that is purely build-time state whose
+// permanent presence was wasted steady-state memory; it now lives in a
+// pooled side table (closureBuffers.tables).
+//
+// tableMark is stored by value so marking a share group costs no per-entry
+// heap allocation.
 type tableMark struct {
-	lastVisitedGen uint32
-	closureGen     uint32
-	closureRep     *faState
+	closureGen uint64
+	closureRep *faState
 }
 
-// closureBuffers carries per-epsilonClosure-call scratch. The two maps
-// replace build-time fields that used to sit on smallTable/faState;
-// they live only for the duration of the closure computation.
+// closureBuffers carries the scratch for epsilon closure computation. It is
+// pooled (see closureBufferPool) and reused across epsilonClosure calls, so
+// the maps are allocated once and grown, not rebuilt per call. Visited
+// tracking is generation-based: gen only ever increases, so stale map
+// entries from a previous use are simply older than the current generation
+// and need no clearing.
 type closureBuffers struct {
-	gen           uint32                     // bumped by closureForNfa (NFA walk dedup) and the closureForState post-pass (table-pointer dedup)
-	closureSetGen uint32                     // snapshot of gen used by traverseEpsilons to dedup faState visits within one closure
-	closureList   []*faState                 // reusable accumulator for the state list before the dedup post-pass
-	tables        map[*smallTable]*tableMark // per-call side-table for smallTable scratch (lastVisitedGen, closureRep)
-	states        map[*faState]uint32        // per-faState last-visited generation, used by traverseEpsilons
+	gen           uint64                      // monotonic counter; bumped by closureForState's two dedup phases
+	walkGen       uint64                      // snapshot of gen for the current closureForNfa walk (NFA state dedup)
+	closureSetGen uint64                      // snapshot of gen for the current closureForState faState dedup
+	closureList   []*faState                  // reusable accumulator for the state list before the dedup post-pass
+	tables        map[tableShareKey]tableMark // share-group scratch for the post-pass (closureGen, closureRep)
+	states        map[*faState]uint64         // per-faState last-visited gen, used by traverseEpsilons
+	walkVisited   map[*faState]uint64         // per-faState last-walked gen, used by closureForNfa
 }
 
 func newClosureBuffers() *closureBuffers {
 	return &closureBuffers{
-		gen:    1,
-		tables: make(map[*smallTable]*tableMark),
-		states: make(map[*faState]uint32),
+		tables:      make(map[tableShareKey]tableMark),
+		states:      make(map[*faState]uint64),
+		walkVisited: make(map[*faState]uint64),
 	}
 }
 
-// tableMarkOf returns the tableMark for t, creating one on first access.
-func (b *closureBuffers) tableMarkOf(t *smallTable) *tableMark {
-	m, ok := b.tables[t]
-	if !ok {
-		m = &tableMark{}
-		b.tables[t] = m
-	}
-	return m
+// closureBufferPool reuses closureBuffers (and their maps) across the many
+// epsilonClosure calls a build performs, eliminating per-call map allocation.
+// sync.Pool is safe for concurrent use and may release idle items (in practice
+// during GC), so the maps are not permanent; until then their keys stay reachable.
+var closureBufferPool = sync.Pool{
+	New: func() any { return newClosureBuffers() },
 }
 
-// epsilonClosure walks the automaton starting from the given table
+// epsilonClosure walks the automaton starting from the given state
 // and precomputes the epsilon closure for every reachable faState.
-func epsilonClosure(table *smallTable) {
-	bufs := newClosureBuffers()
-	closureForNfa(table, bufs)
+func epsilonClosure(start *faState) {
+	bufs := closureBufferPool.Get().(*closureBuffers)
+	// Take a fresh generation for this walk. closureForState bumps bufs.gen
+	// for its own dedup phases, but it never touches walkGen, so the state
+	// dedup in closureForNfa compares against a value that stays fixed for
+	// the whole walk.
+	bufs.gen++
+	bufs.walkGen = bufs.gen
+	closureForState(start, bufs)
+	closureForNfa(start, bufs)
+	closureBufferPool.Put(bufs)
 }
 
-func closureForNfa(table *smallTable, bufs *closureBuffers) {
-	mark := bufs.tableMarkOf(table)
-	if mark.lastVisitedGen == bufs.gen {
+// closureForNfa dedups by faState identity, not table-share key: each state
+// must be walked once. (Share-key dedup is unsafe here — distinct states can
+// share a steps backing array yet have different epsilons, and the zero key
+// collapses all no-byte tables; the post-pass below re-checks fieldTransitions
+// on collision, but the walk has no such guard.)
+func closureForNfa(state *faState, bufs *closureBuffers) {
+	if bufs.walkVisited[state] == bufs.walkGen {
 		return
 	}
-	mark.lastVisitedGen = bufs.gen
+	bufs.walkVisited[state] = bufs.walkGen
 
-	for _, state := range table.steps {
-		if state != nil {
-			closureForState(state, bufs)
-			closureForNfa(state.table, bufs)
+	for _, s := range state.table.steps {
+		if s != nil {
+			closureForState(s, bufs)
+			closureForNfa(s, bufs)
 		}
 	}
-	for _, eps := range table.epsilons {
+	for _, eps := range state.table.epsilons {
 		closureForState(eps, bufs)
-		closureForNfa(eps.table, bufs)
+		closureForNfa(eps, bufs)
 	}
 }
 
@@ -85,8 +102,8 @@ func closureForState(state *faState, bufs *closureBuffers) {
 		return
 	}
 
-	// Use generation-based visited tracking instead of a fresh map per
-	// traversal. bufs.states records which gen last visited each state.
+	// Generation-based visited tracking: bufs.states records which gen last
+	// visited each state, so we never clear the map between traversals.
 	bufs.gen++
 	bufs.closureSetGen = bufs.gen
 	bufs.closureList = bufs.closureList[:0]
@@ -97,22 +114,29 @@ func closureForState(state *faState, bufs *closureBuffers) {
 	traverseEpsilons(state, state.table.epsilons, bufs)
 
 	// Table-pointer dedup: when multiple states in the closure share the
-	// same *smallTable, their byte transitions are identical, so only one
-	// representative is needed. This is done as a post-pass over the
-	// closure list rather than during traversal to keep traverseEpsilons
-	// zero-overhead. States with different fieldTransitions are preserved.
+	// same smallTable (steps backing array), their byte transitions are
+	// identical, so only one representative is needed. Done as a post-pass
+	// over the closure list to keep traverseEpsilons zero-overhead. The
+	// zero key (no byte transitions) is never deduped, and states with
+	// different fieldTransitions are preserved.
 	bufs.gen++
 	dedupGen := bufs.gen
 	closure := make([]*faState, 0, len(bufs.closureList))
 	for _, s := range bufs.closureList {
-		mark := bufs.tableMarkOf(s.table)
+		key := newTableShareKey(&s.table)
+		if (key == tableShareKey{}) {
+			closure = append(closure, s)
+			continue
+		}
+		mark := bufs.tables[key]
 		if mark.closureGen == dedupGen {
 			if sameFieldTransitions(mark.closureRep, s) {
 				continue
 			}
 		} else {
 			mark.closureGen = dedupGen
 			mark.closureRep = s
+			bufs.tables[key] = mark
 		}
 		closure = append(closure, s)
 	}