Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased][unreleased]

### Fixed

* Group quantified 2+ char literals in regex optimization ([#54])


## [v0.5.3][]

Expand Down Expand Up @@ -215,3 +219,4 @@ descent parser and a work-in-progress state-machine parser.
[#38]: https://github.com/goodmami/pe/issues/38
[#44]: https://github.com/goodmami/pe/issues/44
[#46]: https://github.com/goodmami/pe/issues/46
[#54]: https://github.com/goodmami/pe/issues/54
2 changes: 1 addition & 1 deletion docs/guides/using-flags.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ pe._errors.ParseError:
line 0, character 8
"one" 1 "two
^
ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|\-?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)`
ParseError: `(?=(?P<_5>"(?=(?P<_2>[^"]*))(?P=_2)"|(?:\-)?(?=(?P<_4>0|[1-9](?=(?P<_3>[0-9]*))(?P=_3)))(?P=_4)))(?P=_5)`

```

Expand Down
12 changes: 9 additions & 3 deletions pe/_optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def _regex_optional(defn, defs, grpid):
subdef = defn.args[0]
d = _regex(defn.args[0], defs, grpid)
if d.op == RGX:
subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})'
subpat = _regex_maybe_group(subdef.op, d.args[0])
return Regex(f'{subpat}?')
else:
return Optional(d)
Expand All @@ -252,7 +252,7 @@ def _regex_star(defn, defs, grpid):
subdef = defn.args[0]
d = _regex(subdef, defs, grpid)
if d.op == RGX:
subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})'
subpat = _regex_maybe_group(subdef.op, d.args[0])
gid = f'_{next(grpid)}'
return Regex(f'(?=(?P<{gid}>{subpat}*))(?P={gid})')
else:
Expand All @@ -263,13 +263,19 @@ def _regex_plus(defn, defs, grpid):
subdef = defn.args[0]
d = _regex(defn.args[0], defs, grpid)
if d.op == RGX:
subpat = d.args[0] if subdef.op in (DOT, LIT, CLS) else f'(?:{d.args[0]})'
subpat = _regex_maybe_group(subdef.op, d.args[0])
gid = f'_{next(grpid)}'
return Regex(f'(?=(?P<{gid}>{subpat}+))(?P={gid})')
else:
return Plus(d)


def _regex_maybe_group(op: Operator, arg: str) -> str:
    """Return the pattern *arg* in a form that can safely take a quantifier.

    A regex quantifier binds only to the atom immediately before it, so a
    multi-character pattern must be wrapped in a non-capturing group
    before `?`, `*`, or `+` is appended.

    Args:
        op: operator of the definition that produced *arg*.
        arg: the regex pattern string to be quantified.

    Returns:
        *arg* unchanged when *op* is DOT or CLS, or when *op* is LIT and
        the pattern is exactly one character long; otherwise
        ``(?:arg)``.
    """
    # The length test is applied to the pattern string itself, so any
    # literal whose pattern is longer than one character is grouped.
    already_atomic = op in (DOT, CLS) or (op == LIT and len(arg) == 1)
    return arg if already_atomic else f'(?:{arg})'


def _regex_and(defn, defs, grpid):
d = _regex(defn.args[0], defs, grpid)
if d.op == RGX:
Expand Down
2 changes: 1 addition & 1 deletion pe/packrat.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def match(self,
else:
return None

args = tuple(args or ())
args = list(args or ())
if kwargs is None:
kwargs = {}

Expand Down
33 changes: 33 additions & 0 deletions test/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,36 @@ def test_capture_repeated(parser):
m3 = pe.match('(~"a")+', 'aaa', parser=parser)
assert m3.group() == 'aaa'
assert m3.groups() == ('a', 'a', 'a')


@pytest.mark.parametrize('parser', ['packrat', 'machine', 'machine-python'])
def test_regex_optimized_quantified_literal(parser):
    """Quantified multi-character literals must quantify the whole literal.

    Regression test for https://github.com/goodmami/pe/issues/54
    """
    inputs = ("c", "abc", "ababc", "ac", "abac")
    # For each grammar: the expected full match for every entry of
    # `inputs`, in order; None means the input must not match at all.
    expectations = {
        "Start <- 'ab'? 'c'": ("c", "abc", None, None, None),
        "Start <- 'ab'* 'c'": ("c", "abc", "ababc", None, None),
        "Start <- 'ab'+ 'c'": (None, "abc", "ababc", None, None),
        "Start <- 'ab'{2} 'c'": (None, None, "ababc", None, None),
    }

    for grammar, expected in expectations.items():
        # Compile with the regex optimization on, then match with flags
        # explicitly set to NONE, as each case requires.
        p = pe.compile(grammar, parser=parser, flags=pe.REGEX)
        for text, want in zip(inputs, expected):
            m = p.match(text, flags=pe.NONE)
            if want is None:
                assert m is None
            else:
                assert m.group(0) == want