From 0818bfc9179bdb049a350804ef82b9f5b9896f6f Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 18 Jun 2025 11:17:29 -0700 Subject: [PATCH 1/3] Group quantized literals in regex optimization The regex optimizer did not put a non-capturing group around quantized dots, character classes, or literals, but for literals of length > 2, this meant that only the final character was quantized: Before: - 'ab'? -> `ab?` - 'cd'* -> `cd*` Now: - 'ab?' -> `(?:ab)?` - 'cd'* -> `(?:cd)*` Fixes #54 --- CHANGELOG.md | 5 +++++ pe/_optimize.py | 12 +++++++++--- test/test_regression.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a50f5bd..1fc2365 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased][unreleased] +### Fixed + +* Group quantized 2+ char literals in regex optimization ([#54]) + ## [v0.5.3][] @@ -215,3 +219,4 @@ descent parser and a work-in-progress state-machine parser. [#38]: https://github.com/goodmami/pe/issues/38 [#44]: https://github.com/goodmami/pe/issues/44 [#46]: https://github.com/goodmami/pe/issues/46 +[#54]: https://github.com/goodmami/pe/issues/54 diff --git a/pe/_optimize.py b/pe/_optimize.py index 11a98a9..aeb80a3 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -242,7 +242,7 @@ def _regex_optional(defn, defs, grpid): subdef = defn.args[0] d = _regex(defn.args[0], defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) return Regex(f'{subpat}?') else: return Optional(d) @@ -252,7 +252,7 @@ def _regex_star(defn, defs, grpid): subdef = defn.args[0] d = _regex(subdef, defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) gid = f'_{next(grpid)}' return Regex(f'(?=(?P<{gid}>{subpat}*))(?P={gid})') else: @@ -263,13 +263,19 @@ def _regex_plus(defn, defs, grpid): subdef = defn.args[0] d = _regex(defn.args[0], defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) gid = f'_{next(grpid)}' return Regex(f'(?=(?P<{gid}>{subpat}+))(?P={gid})') else: return Plus(d) +def _regex_maybe_group(op: Operator, arg: str) -> str: + if op in (DOT, CLS) or (op == LIT and len(arg) == 1): + return arg + return f'(?:{arg})' + + def _regex_and(defn, defs, grpid): d = _regex(defn.args[0], defs, grpid) if d.op == RGX: diff --git a/test/test_regression.py b/test/test_regression.py index 8331397..47d32c3 100644 --- a/test/test_regression.py +++ b/test/test_regression.py @@ -40,3 +40,36 @@ def test_capture_repeated(parser): m3 = pe.match('(~"a")+', 'aaa', parser=parser) assert m3.group() == 'aaa' assert m3.groups() == ('a', 'a', 'a') + + +@pytest.mark.parametrize('parser', ['packrat', 'machine', 'machine-python']) +def test_regex_optimized_quantified_literal(parser): + """https://github.com/goodmami/pe/issues/54""" + + p = pe.compile("Start <- 'ab'? 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE).group(0) == "c" + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE) is None + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'* 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE).group(0) == "c" + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'+ 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE) is None + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'{2} 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE) is None + assert p.match("abc", flags=pe.NONE) is None + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None From 546737de5fd2d6fac132659c790bc83aa09a678f Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 18 Jun 2025 11:36:41 -0700 Subject: [PATCH 2/3] Fix type checking error on Match args list --- pe/packrat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pe/packrat.py b/pe/packrat.py index 439aaf5..dc4da3d 100644 --- a/pe/packrat.py +++ b/pe/packrat.py @@ -87,7 +87,7 @@ def match(self, else: return None - args = tuple(args or ()) + args = list(args or ()) if kwargs is None: kwargs = {} From 7d950e316cbad1cab7a0c3b8daecb004aede3a65 Mon Sep 17 00:00:00 2001 From: Michael Wayne Goodman Date: Wed, 18 Jun 2025 11:37:14 -0700 Subject: [PATCH 3/3] Fix doctest issue --- docs/guides/using-flags.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/using-flags.md b/docs/guides/using-flags.md index 2b2bcf5..54d49c8 100644 --- a/docs/guides/using-flags.md +++ b/docs/guides/using-flags.md @@ -125,7 +125,7 @@ pe._errors.ParseError: line 0, character 8 "one" 1 "two ^ -ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|\-?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)` +ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|(?:\-)?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)` ```