diff --git a/epsi_closure.go b/epsi_closure.go
index 1aff56e..35d9853 100644
--- a/epsi_closure.go
+++ b/epsi_closure.go
@@ -1,35 +1,60 @@
 package quamina
 
-// closureGeneration is a global counter used for generation-based visited
-// tracking. It is incremented by epsilonClosure (for NFA walk dedup via
-// lastVisitedGen) and by closureForState (for table-pointer dedup
-// via closureGen). Each smallTable stores the generation it was last
-// visited in, avoiding the need for a visited map. This works because
-// epsilonClosure snapshots the counter into bufs.generation before the
-// walk begins, so subsequent increments by the dedup pass don't interfere.
-var closureGeneration uint64
+// tableMark carries the per-smallTable scratch used only during epsilon
+// closure computation (lastVisitedGen for NFA walk dedup, and closureGen /
+// closureRep for table-pointer dedup). These used to live as fields on
+// smallTable itself, but they are purely build-time state and their
+// permanent presence on every smallTable was wasted steady-state memory.
+// They now live in a per-call side table that is discarded when
+// epsilonClosure returns.
+type tableMark struct {
+	lastVisitedGen uint32
+	closureGen     uint32
+	closureRep     *faState
+}
 
+// closureBuffers carries per-epsilonClosure-call scratch. The two maps
+// replace build-time fields that used to sit on smallTable/faState;
+// they live only for the duration of the closure computation.
 type closureBuffers struct {
-	generation    uint64     // used by closureForNfa to avoid revisiting smallTables
-	closureSetGen uint64     // used by traverseEpsilons to avoid revisiting faStates
-	closureList   []*faState // accumulated closure members, reused across calls
+	gen           uint32                     // bumped by closureForNfa (NFA walk dedup) and the closureForState post-pass (table-pointer dedup)
+	closureSetGen uint32                     // snapshot of gen used by traverseEpsilons to dedup faState visits within one closure
+	closureList   []*faState                 // reusable accumulator for the state list before the dedup post-pass
+	tables        map[*smallTable]*tableMark // per-call side-table for smallTable scratch (lastVisitedGen, closureRep)
+	states        map[*faState]uint32        // per-faState last-visited generation, used by traverseEpsilons
+}
+
+func newClosureBuffers() *closureBuffers {
+	return &closureBuffers{
+		gen:    1,
+		tables: make(map[*smallTable]*tableMark),
+		states: make(map[*faState]uint32),
+	}
+}
+
+// tableMarkOf returns the tableMark for t, creating one on first access.
+func (b *closureBuffers) tableMarkOf(t *smallTable) *tableMark {
+	m, ok := b.tables[t]
+	if !ok {
+		m = &tableMark{}
+		b.tables[t] = m
+	}
+	return m
 }
 
 // epsilonClosure walks the automaton starting from the given table
 // and precomputes the epsilon closure for every reachable faState.
 func epsilonClosure(table *smallTable) {
-	closureGeneration++
-	bufs := &closureBuffers{
-		generation: closureGeneration,
-	}
+	bufs := newClosureBuffers()
 	closureForNfa(table, bufs)
 }
 
 func closureForNfa(table *smallTable, bufs *closureBuffers) {
-	if table.lastVisitedGen == bufs.generation {
+	mark := bufs.tableMarkOf(table)
+	if mark.lastVisitedGen == bufs.gen {
 		return
 	}
-	table.lastVisitedGen = bufs.generation
+	mark.lastVisitedGen = bufs.gen
 
 	for _, state := range table.steps {
 		if state != nil {
@@ -46,7 +71,7 @@ func closureForNfa(table *smallTable, bufs *closureBuffers) {
 // closureForStateNoBufs computes the epsilon closure for a single state.
 // Used directly in tests; production code uses closureForState.
 func closureForStateNoBufs(state *faState) {
-	bufs := &closureBuffers{}
+	bufs := newClosureBuffers()
 	closureForState(state, bufs)
 }
 
@@ -60,12 +85,13 @@ func closureForState(state *faState, bufs *closureBuffers) {
 		return
 	}
 
-	// Use generation-based visited tracking instead of a map
-	closureGeneration++
-	bufs.closureSetGen = closureGeneration
+	// Use generation-based visited tracking instead of a fresh map per
+	// traversal. bufs.states records which gen last visited each state.
+	bufs.gen++
+	bufs.closureSetGen = bufs.gen
 	bufs.closureList = bufs.closureList[:0]
 	if !state.table.isEpsilonOnly() {
-		state.closureSetGen = bufs.closureSetGen
+		bufs.states[state] = bufs.closureSetGen
 		bufs.closureList = append(bufs.closureList, state)
 	}
 	traverseEpsilons(state, state.table.epsilons, bufs)
@@ -75,16 +101,18 @@ func closureForState(state *faState, bufs *closureBuffers) {
 	// representative is needed. This is done as a post-pass over the
 	// closure list rather than during traversal to keep traverseEpsilons
 	// zero-overhead. States with different fieldTransitions are preserved.
-	closureGeneration++
+	bufs.gen++
+	dedupGen := bufs.gen
 	closure := make([]*faState, 0, len(bufs.closureList))
 	for _, s := range bufs.closureList {
-		if s.table.closureGen == closureGeneration {
-			if sameFieldTransitions(s.table.closureRep, s) {
+		mark := bufs.tableMarkOf(s.table)
+		if mark.closureGen == dedupGen {
+			if sameFieldTransitions(mark.closureRep, s) {
 				continue
 			}
 		} else {
-			s.table.closureGen = closureGeneration
-			s.table.closureRep = s
+			mark.closureGen = dedupGen
+			mark.closureRep = s
 		}
 		closure = append(closure, s)
 	}
@@ -95,10 +123,10 @@ func closureForState(state *faState, bufs *closureBuffers) {
 // via epsilon transitions into bufs.closureList.
 func traverseEpsilons(start *faState, epsilons []*faState, bufs *closureBuffers) {
 	for _, eps := range epsilons {
-		if eps == start || eps.closureSetGen == bufs.closureSetGen {
+		if eps == start || bufs.states[eps] == bufs.closureSetGen {
 			continue
 		}
-		eps.closureSetGen = bufs.closureSetGen
+		bufs.states[eps] = bufs.closureSetGen
 		if !eps.table.isEpsilonOnly() {
 			bufs.closureList = append(bufs.closureList, eps)
 		}
diff --git a/memory_cost.go b/memory_cost.go
index 198c5e1..c1863ff 100644
--- a/memory_cost.go
+++ b/memory_cost.go
@@ -21,9 +21,12 @@ func cmFieldMatcherStats(fm *fieldMatcher, stats *matcherStats, pp printer) {
 	for _, vm := range fmTrans {
 		singleton := vm.fields().singletonMatch
 		if singleton != nil {
-			stats.bytes += int64(len(singleton))
+			stats.bytes += int64(cap(singleton))
 		}
 		table := vm.fields().startTable
+		if table == nil {
+			continue
+		}
 		cmStateStats(&faState{table: table}, stats, pp)
 	}
 }
diff --git a/memory_cost_test.go b/memory_cost_test.go
index 4569039..ee31785 100644
--- a/memory_cost_test.go
+++ b/memory_cost_test.go
@@ -31,7 +31,7 @@ func TestQuaminaMemoryCost(t *testing.T) {
 		t.Error(err)
 	}
 	bytes := q.GetMatcherStats()["bytes"]
-	if bytes != 1481 {
+	if bytes != 1321 {
 		t.Error("WRONG NUMBERS")
 	}
 	err = q.AddPattern("x", `{"y":[{"wildcard": "*y"}]}}`)
@@ -39,11 +39,28 @@ func TestQuaminaMemoryCost(t *testing.T) {
 		t.Error(err)
 	}
 	bytes = q.GetMatcherStats()["bytes"]
-	if bytes != 2*1481 {
+	if bytes != 2*1321 {
 		t.Error("WRONG NUMBERS")
 	}
 }
 
+// Regression: GetMatcherStats panicked when a valueMatcher used the
+// singleton-match optimization (singletonMatch set, startTable nil).
+// That optimization fires for any field with a single string or literal
+// value — the matcher uses bytes.Compare instead of building an FA.
+// Minimal repro: {"Animated": [false]}. cmFieldMatcherStats now skips
+// the nil startTable rather than building a faState with state.table == nil.
+func TestQuaminaMemoryCostSingleton(t *testing.T) {
+	q, _ := New()
+	if err := q.AddPattern("p", `{"Animated": [false]}`); err != nil {
+		t.Fatal(err)
+	}
+	s := q.GetMatcherStats()
+	if s["bytes"] == 0 {
+		t.Errorf("expected bytes > 0 for singleton matcher, got %v", s["bytes"])
+	}
+}
+
 func TestMcNfaSizes(t *testing.T) {
 	pp := newPrettyPrinter(2355)
 	wc1 := `"*z"`
@@ -55,7 +72,7 @@ func TestMcNfaSizes(t *testing.T) {
 		seenStates: make(map[*faState]bool),
 	}
 	cmStateStats(&faState{table: fa1}, stats, pp)
-	wantedBytes := int64(1481) // laboriously hand-calculated
+	wantedBytes := int64(1321) // laboriously hand-calculated
 	wantedFanout := int64(5)
 	wantedMaxFanout := int64(2)
 	if stats.bytes != wantedBytes {
diff --git a/nfa.go b/nfa.go
index 023bbff..a9341db 100644
--- a/nfa.go
+++ b/nfa.go
@@ -13,9 +13,8 @@ import (
 type faState struct {
 	table            *smallTable
 	fieldTransitions []*fieldMatcher
-	isSpinner        bool
 	epsilonClosure   []*faState // precomputed epsilon closure including self
-	closureSetGen    uint64     // generation for closure set visited tracking
+	isSpinner        bool
 }
 
 /*
@@ -344,23 +343,25 @@ func makeFaStepKey(s1, s2 *faState) faStepKey {
 // epsilon transitions from state1 and state2. This prevents deep nesting of
 // splice states that would otherwise accumulate during repeated merges.
 func simplifySplices(state1, state2 *faState) []*faState {
-	closureGeneration++
-	gen := closureGeneration
+	// A freshly-allocated visited map is used as a side table; the old
+	// approach stored a generation counter on faState itself, which bloated
+	// every state permanently for build-only state.
+	visited := make(map[*faState]bool)
 	targets := make([]*faState, 0, 4)
-	targets = simplifyCollect(state1, gen, targets)
-	targets = simplifyCollect(state2, gen, targets)
+	targets = simplifyCollect(state1, visited, targets)
+	targets = simplifyCollect(state2, visited, targets)
 	return targets
 }
 
-func simplifyCollect(s *faState, gen uint64, targets []*faState) []*faState {
-	if s.closureSetGen == gen {
+func simplifyCollect(s *faState, visited map[*faState]bool, targets []*faState) []*faState {
+	if visited[s] {
 		return targets
 	}
-	s.closureSetGen = gen
+	visited[s] = true
 
 	if s.table.isEpsilonOnly() {
 		for _, eps := range s.table.epsilons {
-			targets = simplifyCollect(eps, gen, targets)
+			targets = simplifyCollect(eps, visited, targets)
 		}
 	} else {
 		targets = append(targets, s)
diff --git a/prettyprinter_test.go b/prettyprinter_test.go
index 4968f56..cc62011 100644
--- a/prettyprinter_test.go
+++ b/prettyprinter_test.go
@@ -8,12 +8,12 @@ func TestPP(t *testing.T) {
 	pp := newPrettyPrinter(1)
 	table, _ := makeShellStyleFA([]byte(`"x*9"`), pp)
 	pp.labelTable(table, "START HERE")
-	wanted := ` 884[START HERE] '22/"' → (914[on " at 0][s/t 240/312] 
- 914[on " at 0] '78/x' → (384[*-Spinner][s/t 240/312] 
- 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 240/312] 
- 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/"' → (769[on " at 4][s/t 248/320] 
- 769[on " at 4] 'f5/ℵ' → (301[last step at 5][s/t 240/312] 
- 301[last step at 5]  [1 transition(s)][s/t 105/185] 
+	wanted := ` 884[START HERE] '22/"' → (914[on " at 0][s/t 216/280] 
+ 914[on " at 0] '78/x' → (384[*-Spinner][s/t 216/280] 
+ 384[*-Spinner] '39/9' → (322[spinEscape on 9 at 3] / ★ → 384[*-Spinner][s/t 216/280] 
+ 322[spinEscape on 9 at 3] ε → 384[*-Spinner] / '22/"' → (769[on " at 4][s/t 224/288] 
+ 769[on " at 4] 'f5/ℵ' → (301[last step at 5][s/t 216/280] 
+ 301[last step at 5]  [1 transition(s)][s/t 81/153] 
 `
 	s := pp.printNFA(table)
 	if s != wanted {
diff --git a/small_table.go b/small_table.go
index 1a8f3a5..5b80da0 100644
--- a/small_table.go
+++ b/small_table.go
@@ -38,20 +38,9 @@ const valueTerminator byte = 0xf5
 // by branching on 'b' to a state that has no byte transitions but two epsilons, one each for s1 and s2.
 
 type smallTable struct {
-	ceilings       []byte
-	steps          []*faState
-	epsilons       []*faState
-	lastVisitedGen uint64 // generation counter for epsilon closure traversal
-	// closureGen records which closureGeneration this table's
-	// representative was set in. If it equals the current global
-	// closureGeneration, then closureRep is valid; otherwise, the
-	// table has not yet been seen in this dedup pass.
-	closureGen uint64
-	// closureRep is the representative faState for this table in the
-	// current closure dedup pass. When multiple states share the same
-	// smallTable and have identical fieldTransitions, only this
-	// representative is kept in the closure.
-	closureRep *faState
+	ceilings []byte
+	steps    []*faState
+	epsilons []*faState
 }
 
 // newSmallTable mostly exists to enforce the constraint that every smallTable has a byteCeiling entry at
diff --git a/state_lists.go b/state_lists.go
index 54c664e..8040ed5 100644
--- a/state_lists.go
+++ b/state_lists.go
@@ -20,13 +20,15 @@ type internEntry struct {
 type stateLists struct {
 	entries map[string]internEntry
 	// Scratch space reused across intern() calls
-	sortBuf []*faState // reusable sorted buffer
-	keyBuf  []byte     // reusable key bytes buffer
+	sortBuf []*faState        // reusable sorted buffer
+	keyBuf  []byte            // reusable key bytes buffer
+	seen    map[*faState]bool // reusable dedup set, cleared per call
 }
 
 func newStateLists() *stateLists {
 	return &stateLists{
 		entries: make(map[string]internEntry),
+		seen:    make(map[*faState]bool),
 	}
 }
 
@@ -36,17 +38,17 @@ func newStateLists() *stateLists {
 // which either has already been computed for the set or is created and empty, and
 // a boolean indicating whether the DFA state has already been computed or not.
 func (sl *stateLists) intern(list []*faState) ([]*faState, *faState, bool) {
-	// Dedupe using the global generation counter and faState.closureSetGen
-	// instead of allocating a map per call. Safe to reuse closureSetGen
-	// because nfa2Dfa runs after epsilon closure computation is complete.
-	closureGeneration++
-	gen := closureGeneration
+	// Dedup within this call using a reused map. Previously this rode on
+	// a generation counter stored inline on each faState; that field has
+	// been removed to shrink steady-state memory.
+	clear(sl.seen)
 	sl.sortBuf = sl.sortBuf[:0]
 	for _, state := range list {
-		if state.closureSetGen != gen {
-			state.closureSetGen = gen
-			sl.sortBuf = append(sl.sortBuf, state)
+		if sl.seen[state] {
+			continue
 		}
+		sl.seen[state] = true
+		sl.sortBuf = append(sl.sortBuf, state)
 	}
 
 	// compute a key representing the set