From b5f93ab59a0cfd931f6f39bdd0ce095259dc635c Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Fri, 13 Jun 2025 12:37:38 -0700 Subject: [PATCH] Properly bound reps for machine parser Bounded repetitions were buggy for both machine parsers, except for character classes because they implement their own repetition. The problems were due mainly to (a) not accounting for the parsing success that occurred in order to reach the UPDATE instruction, and (b) not accounting for the mincount matches that have already occurred, as they are not tracked by the UPDATE's counter. Before: >>> pe.compile('"a"{1,2}', parser="machine-python").match('aaaaaa') <Match object; span=(0, 4), match='aaaa'> Now: >>> pe.compile('"a"{1,2}', parser="machine-python").match('aaaaaa') <Match object; span=(0, 2), match='aa'> Fixes #53 --- CHANGELOG.md | 2 ++ pe/_cy_machine.pyx | 5 +++-- pe/_py_machine.py | 7 ++++--- test/test_parsers.py | 1 + 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fc2365..cdaabad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Fixed +* Properly bound repetitions for machine parsers ([#53]) * Group quantized 2+ char literals in regex optimization ([#54]) @@ -219,4 +220,5 @@ descent parser and a work-in-progress state-machine parser. 
[#38]: https://github.com/goodmami/pe/issues/38 [#44]: https://github.com/goodmami/pe/issues/44 [#46]: https://github.com/goodmami/pe/issues/46 +[#53]: https://github.com/goodmami/pe/issues/53 [#54]: https://github.com/goodmami/pe/issues/54 diff --git a/pe/_cy_machine.pyx b/pe/_cy_machine.pyx index bd352d2..6fe98e0 100644 --- a/pe/_cy_machine.pyx +++ b/pe/_cy_machine.pyx @@ -272,7 +272,8 @@ cdef class _Parser: continue elif instr.opcode == UPDATE: - if instr.maxcount == -1 or state.count < instr.maxcount: + state.count += 1 + if instr.maxcount < 0 or state.count < instr.maxcount: state.pos = pos state.argidx = len(args) state.kwidx = len(kwargs) @@ -433,7 +434,7 @@ def _loop(defn, mincount, maxcount): *(pi.copy() for _ in range(mincount) for pi in pis), Instruction(BRANCH, len(pis) + 2), *pis, - Instruction(UPDATE, -len(pis), maxcount=maxcount) + Instruction(UPDATE, -len(pis), maxcount=(maxcount - mincount)) ] diff --git a/pe/_py_machine.py b/pe/_py_machine.py index f5a5984..81597c7 100644 --- a/pe/_py_machine.py +++ b/pe/_py_machine.py @@ -205,8 +205,9 @@ def _match( # noqa: C901 elif opcode == UPDATE: next_idx, _, count, prev_mark, _, _ = pop() - if maxcount == -1 or count < maxcount: - push((next_idx, pos, count + 1, prev_mark, len(args), len(kwargs))) + count += 1 + if maxcount < 0 or count < maxcount: + push((next_idx, pos, count, prev_mark, len(args), len(kwargs))) idx += oploc else: idx += 1 @@ -355,7 +356,7 @@ def _loop(defn, mincount: int, maxcount: int): return [*(pis * mincount), # risk of billion laughs attack Instruction(BRANCH, len(pis) + 2), *pis, - Instruction(UPDATE, -len(pis), maxcount=maxcount)] + Instruction(UPDATE, -len(pis), maxcount=(maxcount - mincount))] def _sym(defn): diff --git a/test/test_parsers.py b/test/test_parsers.py index aad0fb5..8066bc7 100644 --- a/test/test_parsers.py +++ b/test/test_parsers.py @@ -91,6 +91,7 @@ 'aabbcc', 0, 3, _blank), ('Rpt3', Rpt(abc, min=3), 'aaxx', 0, FAIL, None), ('Rpt4', Rpt(abc, max=1), 'aabbcc', 0, 1, 
_blank), + ('Rpt5', Rpt('a', max=2), 'aaaaaa', 0, 2, _blank), ('And0', And(abc), 'a', 0, 0, _blank), ('And1', And(abc), 'd', 0, FAIL, None),