diff --git a/CHANGELOG.md b/CHANGELOG.md index a50f5bd..1fc2365 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,10 @@ ## [Unreleased][unreleased] +### Fixed + +* Group quantized 2+ char literals in regex optimization ([#54]) + ## [v0.5.3][] @@ -215,3 +219,4 @@ descent parser and a work-in-progress state-machine parser. [#38]: https://github.com/goodmami/pe/issues/38 [#44]: https://github.com/goodmami/pe/issues/44 [#46]: https://github.com/goodmami/pe/issues/46 +[#54]: https://github.com/goodmami/pe/issues/54 diff --git a/docs/guides/using-flags.md b/docs/guides/using-flags.md index 2b2bcf5..54d49c8 100644 --- a/docs/guides/using-flags.md +++ b/docs/guides/using-flags.md @@ -125,7 +125,7 @@ pe._errors.ParseError: line 0, character 8 "one" 1 "two ^ -ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|\-?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)` +ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|(?:\-)?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)` ``` diff --git a/pe/_optimize.py b/pe/_optimize.py index 11a98a9..aeb80a3 100644 --- a/pe/_optimize.py +++ b/pe/_optimize.py @@ -242,7 +242,7 @@ def _regex_optional(defn, defs, grpid): subdef = defn.args[0] d = _regex(defn.args[0], defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) return Regex(f'{subpat}?') else: return Optional(d) @@ -252,7 +252,7 @@ def _regex_star(defn, defs, grpid): subdef = defn.args[0] d = _regex(subdef, defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) gid = f'_{next(grpid)}' return Regex(f'(?=(?P<{gid}>{subpat}*))(?P={gid})') else: @@ -263,13 +263,19 @@ def _regex_plus(defn, defs, grpid): subdef = defn.args[0] d = _regex(defn.args[0], defs, grpid) if d.op == RGX: - subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})' + subpat = _regex_maybe_group(subdef.op, d.args[0]) gid = f'_{next(grpid)}' return Regex(f'(?=(?P<{gid}>{subpat}+))(?P={gid})') else: return Plus(d) +def _regex_maybe_group(op: Operator, arg: str) -> str: + if op in (DOT, CLS) or (op == LIT and len(arg) == 1): + return arg + return f'(?:{arg})' + + def _regex_and(defn, defs, grpid): d = _regex(defn.args[0], defs, grpid) if d.op == RGX: diff --git a/pe/packrat.py b/pe/packrat.py index 439aaf5..dc4da3d 100644 --- a/pe/packrat.py +++ b/pe/packrat.py @@ -87,7 +87,7 @@ def match(self, else: return None - args = tuple(args or ()) + args = list(args or ()) if kwargs is None: kwargs = {} diff --git a/test/test_regression.py b/test/test_regression.py index 8331397..47d32c3 100644 --- a/test/test_regression.py +++ b/test/test_regression.py @@ -40,3 +40,36 @@ def test_capture_repeated(parser): m3 = pe.match('(~"a")+', 'aaa', parser=parser) assert m3.group() == 'aaa' assert m3.groups() == ('a', 'a', 'a') + + +@pytest.mark.parametrize('parser', ['packrat', 'machine', 'machine-python']) +def test_regex_optimized_quantified_literal(parser): + """https://github.com/goodmami/pe/issues/54""" + + p = pe.compile("Start <- 'ab'? 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE).group(0) == "c" + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE) is None + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'* 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE).group(0) == "c" + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'+ 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE) is None + assert p.match("abc", flags=pe.NONE).group(0) == "abc" + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None + + p = pe.compile("Start <- 'ab'{2} 'c'", parser=parser, flags=pe.REGEX) + assert p.match("c", flags=pe.NONE) is None + assert p.match("abc", flags=pe.NONE) is None + assert p.match("ababc", flags=pe.NONE).group(0) == "ababc" + assert p.match("ac", flags=pe.NONE) is None + assert p.match("abac", flags=pe.NONE) is None