From e33139fde148bcfcb30056a30bd44378890247f2 Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Thu, 16 Apr 2026 16:02:49 -0700 Subject: [PATCH 1/2] nfa: simplify smallTable.step, eliminate per-traverse stepOut alloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit step() previously returned via a *stepOut parameter whose `epsilons` field was written on every call but never read — traverseNFA iterates the precomputed epsilonClosure separately. Change step to return *faState directly, drop the stepOut struct, and drop the stack slot in traverseNFA (it was escaping to the heap as one alloc per match). Also replace the forbiddenBytes map with a range check; the three ranges {0xC0, 0xC1, 0xF5-0xFF} are enough for a couple of compares and the map fetch was pure cost on the cold path. Bench (Apple M1 Ultra, n=6): ShellStyleBuildTime-20 56.10m -> 42.44m -24.36% 958 B -> 709 B -25.99% 1 allocs -> 0 allocs geomean (10 benchmarks): -2.26% time Other targeted benchmarks are within +/-2% (statistically significant but small); ShellstyleMultiMatch / TablePointerDedup / CityLots allocations unchanged at 0. Co-Authored-By: Claude Opus 4.7 (1M context) --- nfa.go | 6 ++---- regexp_nfa_test.go | 2 +- small_table.go | 43 ++++++++++++++++++------------------------- small_table_test.go | 9 ++++++--- 4 files changed, 27 insertions(+), 33 deletions(-) diff --git a/nfa.go b/nfa.go index 7fa5800..2bfb991 100644 --- a/nfa.go +++ b/nfa.go @@ -278,7 +278,6 @@ func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, buf fieldSet[fm] = true } - stepResult := &stepOut{} for index := 0; len(currentStates) != 0 && index <= len(val); index++ { var utf8Byte byte if index < len(val) { @@ -291,9 +290,8 @@ func traverseNFA(table *smallTable, val []byte, transitions []*fieldMatcher, buf for _, fm := range ecState.fieldTransitions { fieldSet[fm] = true } - ecState.table.step(utf8Byte, stepResult) - if stepResult.step != nil { - nextStates = append(nextStates, stepResult.step) + if nextStep := ecState.table.step(utf8Byte); nextStep != nil { + nextStates = append(nextStates, nextStep) } } } diff --git a/regexp_nfa_test.go b/regexp_nfa_test.go index 93d7017..13200bc 100644 --- a/regexp_nfa_test.go +++ b/regexp_nfa_test.go @@ -227,7 +227,7 @@ func TestMakeByteDotFA(t *testing.T) { for i := 0; i < 256; i++ { b := byte(i) got := st.dStep(b) - if forbiddenBytes[b] { + if isForbiddenUTF8(b) { if got != nil { t.Errorf("accepted %x", b) } diff --git a/small_table.go b/small_table.go index 1f9a3c9..1a8f3a5 100644 --- a/small_table.go +++ b/small_table.go @@ -67,42 +67,36 @@ func (t *smallTable) isEpsilonOnly() bool { return len(t.epsilons) > 0 && len(t.ceilings) == 1 } -type stepOut struct { - step *faState - epsilons []*faState -} - -var forbiddenBytes = map[byte]bool{ - 0xC0: true, 0xC1: true, - 0xF5: true, 0xF6: true, 0xF7: true, 0xF8: true, 0xF9: true, 0xFA: true, - 0xFB: true, 0xFC: true, 0xFD: true, 0xFE: true, 0xFF: true, -} - func (t *smallTable) isJustEpsilons() bool { // TODO I think the second of the three conditions is unnecessary return len(t.steps) == 1 && t.steps[0] == nil && len(t.epsilons) != 0 } -// step finds the list of states that result from a transition on the utf8Byte argument. The states can come -// as a result of looking in the table structure, and also the "epsilon" transitions that occur on every -// input byte. Since this is the white-hot center of Quamina's runtime CPU, we don't want to be merging -// the two lists. So to avoid any memory allocation, the caller passes in a structure with the two lists -// and step fills them in. -func (t *smallTable) step(utf8Byte byte, out *stepOut) { - out.epsilons = t.epsilons +// step returns the faState that results from a transition on utf8Byte, or nil +// if the table has no step for that byte. Epsilon transitions are handled +// separately (via precomputed epsilonClosure), so step never touches t.epsilons. +// This is the white-hot center of Quamina's runtime CPU; keep it inlinable. +func (t *smallTable) step(utf8Byte byte) *faState { for index, ceiling := range t.ceilings { if utf8Byte < ceiling { - out.step = t.steps[index] - return + return t.steps[index] } } - _, forbidden := forbiddenBytes[utf8Byte] - if forbidden { - return + // utf8Byte >= byteCeiling (0xF6): only valid if it's a forbidden UTF-8 byte, + // in which case we return nil so the caller can drop this path. + if isForbiddenUTF8(utf8Byte) { + return nil } panic("Malformed smallTable") } +// isForbiddenUTF8 reports whether the byte can never appear in valid UTF-8. +// Range check instead of a map lookup — strictly faster and the forbidden set +// is three compact ranges: {0xC0, 0xC1} and {0xF5–0xFF}. +func isForbiddenUTF8(b byte) bool { + return b == 0xC0 || b == 0xC1 || b >= 0xF5 +} + // dStep takes a step through an NFA in the case where it is known that the NFA in question // is deterministic, i.e. each combination of an faState and a byte value transitions to at // most one other byte value. @@ -112,8 +106,7 @@ func (t *smallTable) dStep(utf8Byte byte) *faState { return t.steps[index] } } - _, forbidden := forbiddenBytes[utf8Byte] - if forbidden { + if isForbiddenUTF8(utf8Byte) { return nil } panic("Malformed smallTable") diff --git a/small_table_test.go b/small_table_test.go index 2bd2b70..6686b21 100644 --- a/small_table_test.go +++ b/small_table_test.go @@ -64,9 +64,12 @@ func TestUnpack(t *testing.T) { func TestDodgeBadUTF8(t *testing.T) { st := makeSmallTable(nil, []byte{'a'}, []*faState{{}}) - so := &stepOut{} - st.step(0xFE, so) - st.dStep(0xFE) + if got := st.step(0xFE); got != nil { + t.Errorf("step(0xFE) = %v, want nil", got) + } + if got := st.dStep(0xFE); got != nil { + t.Errorf("dStep(0xFE) = %v, want nil", got) + } } func TestSmallTableIterator(t *testing.T) { From b96ef0992291e36c0a33b2fa086d8a9312c9f316 Mon Sep 17 00:00:00 2001 From: Robert Sayre Date: Fri, 17 Apr 2026 10:42:07 -0700 Subject: [PATCH 2/2] memory_cost_test: widen TestStringFA budget margin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The final sub-case set the budget to current-1 and expected the 100-char string pattern add to fail. Under Go 1.23 with -race, the race detector's bookkeeping shifts TotalAlloc by more than 1 byte between the reference measurement and the replay on a fresh matcher, so addPattern stayed under budget and the test tripped its "should fail" assertion. (Go 1.24+ on the same hardware behaved differently, and the Cover job without -race passes.) Widen the margin to len(i100) — comfortably smaller than the i100 pattern's own FA cost (thousands of bytes) but large enough to absorb alloc variance. Co-Authored-By: Claude Opus 4.7 (1M context) --- memory_cost_test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/memory_cost_test.go b/memory_cost_test.go index 9132c20..4a01003 100644 --- a/memory_cost_test.go +++ b/memory_cost_test.go @@ -156,7 +156,11 @@ func TestStringFA(t *testing.T) { _, current = cm.getMemoryBudget() cm = newCoreMatcher() - _, _ = cm.setMemoryBudget(current - 1) + // Leave a margin larger than typical TotalAlloc variance across Go + // versions and the race detector's bookkeeping overhead. A 1-byte + // margin was flaky under Go 1.23 + -race; the i100 pattern's FA + // comfortably exceeds any reasonable margin below its own cost. + _, _ = cm.setMemoryBudget(current - uint64(len(i100))) err = cm.addPattern("x", `{"x": ["x"]}`) if err != nil { t.Error("x?")