From e33139fde148bcfcb30056a30bd44378890247f2 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Thu, 16 Apr 2026 16:02:49 -0700
Subject: [PATCH 1/2] nfa: simplify smallTable.step, eliminate per-traverse
 stepOut alloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

step() previously returned via a *stepOut parameter whose `epsilons`
field was written on every call but never read — traverseNFA iterates
the precomputed epsilonClosure separately. Change step to return
*faState directly, drop the stepOut struct, and drop the stack slot
in traverseNFA (it was escaping to the heap as one alloc per match).

Also replace the forbiddenBytes map with a range check; the three
ranges {0xC0, 0xC1, 0xF5-0xFF} are enough for a couple of compares
and the map fetch was pure cost on the cold path.

Bench (Apple M1 Ultra, n=6):

  ShellStyleBuildTime-20   56.10m -> 42.44m  -24.36%
                            958 B ->  709 B  -25.99%
                           1 allocs -> 0 allocs
  geomean (10 benchmarks):             -2.26% time

Other targeted benchmarks are within +/-2% (statistically significant
but small); ShellstyleMultiMatch / TablePointerDedup / CityLots
allocations unchanged at 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 nfa.go              |  6 ++----
 regexp_nfa_test.go  |  2 +-
 small_table.go      | 43 ++++++++++++++++++-------------------------
 small_table_test.go |  9 ++++++---
 4 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/nfa.go b/nfa.go
index 7fa5800..2bfb991 100644
--- a/nfa.go
+++ b/nfa.go
@@ -278,7 +278,6 @@ func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, buf
 		fieldSet[fm] = true
 	}
 
-	stepResult := &stepOut{}
 	for index := 0; len(currentStates) != 0 && index <= len(val); index++ {
 		var utf8Byte byte
 		if index < len(val) {
@@ -291,9 +290,8 @@ func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, buf
 				for _, fm := range ecState.fieldTransitions {
 					fieldSet[fm] = true
 				}
-				ecState.table.step(utf8Byte, stepResult)
-				if stepResult.step != nil {
-					nextStates = append(nextStates, stepResult.step)
+				if nextStep := ecState.table.step(utf8Byte); nextStep != nil {
+					nextStates = append(nextStates, nextStep)
 				}
 			}
 		}
diff --git a/regexp_nfa_test.go b/regexp_nfa_test.go
index 93d7017..13200bc 100644
--- a/regexp_nfa_test.go
+++ b/regexp_nfa_test.go
@@ -227,7 +227,7 @@ func TestMakeByteDotFA(t *testing.T) {
 	for i := 0; i < 256; i++ {
 		b := byte(i)
 		got := st.dStep(b)
-		if forbiddenBytes[b] {
+		if isForbiddenUTF8(b) {
 			if got != nil {
 				t.Errorf("accepted %x", b)
 			}
diff --git a/small_table.go b/small_table.go
index 1f9a3c9..1a8f3a5 100644
--- a/small_table.go
+++ b/small_table.go
@@ -67,42 +67,36 @@ func (t *smallTable) isEpsilonOnly() bool {
 	return len(t.epsilons) > 0 && len(t.ceilings) == 1
 }
 
-type stepOut struct {
-	step     *faState
-	epsilons []*faState
-}
-
-var forbiddenBytes = map[byte]bool{
-	0xC0: true, 0xC1: true,
-	0xF5: true, 0xF6: true, 0xF7: true, 0xF8: true, 0xF9: true, 0xFA: true,
-	0xFB: true, 0xFC: true, 0xFD: true, 0xFE: true, 0xFF: true,
-}
-
 func (t *smallTable) isJustEpsilons() bool {
 	// TODO I think the second of the three conditions is unnecessary
 	return len(t.steps) == 1 && t.steps[0] == nil && len(t.epsilons) != 0
 }
 
-// step finds the list of states that result from a transition on the utf8Byte argument. The states can come
-// as a result of looking in the table structure, and also the "epsilon" transitions that occur on every
-// input byte.  Since this is the white-hot center of Quamina's runtime CPU, we don't want to be merging
-// the two lists. So to avoid any memory allocation, the caller passes in a structure with the two lists
-// and step fills them in.
-func (t *smallTable) step(utf8Byte byte, out *stepOut) {
-	out.epsilons = t.epsilons
+// step returns the faState that results from a transition on utf8Byte, or nil
+// if the table has no step for that byte. Epsilon transitions are handled
+// separately (via precomputed epsilonClosure), so step never touches t.epsilons.
+// This is the white-hot center of Quamina's runtime CPU; keep it inlinable.
+func (t *smallTable) step(utf8Byte byte) *faState {
 	for index, ceiling := range t.ceilings {
 		if utf8Byte < ceiling {
-			out.step = t.steps[index]
-			return
+			return t.steps[index]
 		}
 	}
-	_, forbidden := forbiddenBytes[utf8Byte]
-	if forbidden {
-		return
+	// utf8Byte >= byteCeiling (0xF6): only valid if it's a forbidden UTF-8 byte,
+	// in which case we return nil so the caller can drop this path.
+	if isForbiddenUTF8(utf8Byte) {
+		return nil
 	}
 	panic("Malformed smallTable")
 }
 
+// isForbiddenUTF8 reports whether the byte can never appear in valid UTF-8.
+// Range check instead of a map lookup — strictly faster and the forbidden set
+// is three compact ranges: {0xC0, 0xC1} and {0xF5–0xFF}.
+func isForbiddenUTF8(b byte) bool {
+	return b == 0xC0 || b == 0xC1 || b >= 0xF5
+}
+
 // dStep takes a step through an NFA in the case where it is known that the NFA in question
 // is deterministic, i.e. each combination of an faState and a byte value transitions to at
 // most one other byte value.
@@ -112,8 +106,7 @@ func (t *smallTable) dStep(utf8Byte byte) *faState {
 			return t.steps[index]
 		}
 	}
-	_, forbidden := forbiddenBytes[utf8Byte]
-	if forbidden {
+	if isForbiddenUTF8(utf8Byte) {
 		return nil
 	}
 	panic("Malformed smallTable")
diff --git a/small_table_test.go b/small_table_test.go
index 2bd2b70..6686b21 100644
--- a/small_table_test.go
+++ b/small_table_test.go
@@ -64,9 +64,12 @@ func TestUnpack(t *testing.T) {
 
 func TestDodgeBadUTF8(t *testing.T) {
 	st := makeSmallTable(nil, []byte{'a'}, []*faState{{}})
-	so := &stepOut{}
-	st.step(0xFE, so)
-	st.dStep(0xFE)
+	if got := st.step(0xFE); got != nil {
+		t.Errorf("step(0xFE) = %v, want nil", got)
+	}
+	if got := st.dStep(0xFE); got != nil {
+		t.Errorf("dStep(0xFE) = %v, want nil", got)
+	}
 }
 
 func TestSmallTableIterator(t *testing.T) {

From b96ef0992291e36c0a33b2fa086d8a9312c9f316 Mon Sep 17 00:00:00 2001
From: Robert Sayre <sayrer@gmail.com>
Date: Fri, 17 Apr 2026 10:42:07 -0700
Subject: [PATCH 2/2] memory_cost_test: widen TestStringFA budget margin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The final sub-case set the budget to current-1 and expected the
100-char string pattern add to fail. Under Go 1.23 with -race, the
race detector's bookkeeping shifts TotalAlloc by more than 1 byte
between the reference measurement and the replay on a fresh matcher,
so addPattern stayed under budget and the test tripped its
"should fail" assertion. (Go 1.24+ on the same hardware behaved
differently, and the Cover job without -race passes.)

Widen the margin to len(i100) — comfortably smaller than the
i100 pattern's own FA cost (thousands of bytes) but large enough to
absorb alloc variance.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 memory_cost_test.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/memory_cost_test.go b/memory_cost_test.go
index 9132c20..4a01003 100644
--- a/memory_cost_test.go
+++ b/memory_cost_test.go
@@ -156,7 +156,11 @@ func TestStringFA(t *testing.T) {
 
 	_, current = cm.getMemoryBudget()
 	cm = newCoreMatcher()
-	_, _ = cm.setMemoryBudget(current - 1)
+	// Leave a margin larger than typical TotalAlloc variance across Go
+	// versions and the race detector's bookkeeping overhead. A 1-byte
+	// margin was flaky under Go 1.23 + -race; the i100 pattern's FA
+	// comfortably exceeds any reasonable margin below its own cost.
+	_, _ = cm.setMemoryBudget(current - uint64(len(i100)))
 	err = cm.addPattern("x", `{"x": ["x"]}`)
 	if err != nil {
 		t.Error("x?")