From 304668bf074837c80c0f7a62152d83aa545c59b5 Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 14 Apr 2026 05:19:32 +0000 Subject: [PATCH] fix: validate charset flag conflicts in (?...) groups Perl 5.14+ treats charset flags (a/d/l/u) as mutually exclusive: only one may be active, with 'aa' as the sole valid doubling. The parser accepted all combinations silently, producing nodes with conflicting charset semantics. Three new error codes matching Perl's native errors: - RPe_DUPLCH: doubled d/l/u (e.g. (?dd:...)) - RPe_EXCLCH: conflicting charset flags (e.g. (?al:...), (?du:...)) - RPe_NEGCHR: charset flags after - (e.g. (?-a:...)) Validated in both grouped (?xx:...) and toggle (?xx) forms, with and without caret reset syntax. Co-Authored-By: Claude Opus 4.6 --- README | 9 +++++++ lib/Regexp/Parser.pm | 12 ++++++++++ lib/Regexp/Parser/Diagnostics.pm | 3 +++ lib/Regexp/Parser/Handlers.pm | 22 ++++++++++++++++++ t/11errors.t | 40 ++++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+) diff --git a/README b/README index dfc5d78..f5b637f 100644 --- a/README +++ b/README @@ -305,6 +305,15 @@ ERROR HANDLING RPe_IRANGE (-28) Invalid [] range "%s-%s" + RPe_DUPLCH (-29) + Regexp modifier "%s" may not appear twice + + RPe_EXCLCH (-30) + Regexp modifiers "%s" and "%s" are mutually exclusive + + RPe_NEGCHR (-31) + Regexp modifier "%s" may not appear after the "-" + EXTENSIONS Here are some ideas for extensions (sub-classes) for this module. Some of them may be absorbed into the core functionality of Regexp::Parser in diff --git a/lib/Regexp/Parser.pm b/lib/Regexp/Parser.pm index 9412df5..d007a99 100644 --- a/lib/Regexp/Parser.pm +++ b/lib/Regexp/Parser.pm @@ -706,6 +706,18 @@ False [] range "%s-%s" Invalid [] range "%s-%s" +=item RPe_DUPLCH (-29) + +Regexp modifier "%s" may not appear twice + +=item RPe_EXCLCH (-30) + +Regexp modifiers "%s" and "%s" are mutually exclusive + +=item RPe_NEGCHR (-31) + +Regexp modifier "%s" may not appear after the "-" + =back =head1 EXTENSIONS diff --git a/lib/Regexp/Parser/Diagnostics.pm b/lib/Regexp/Parser/Diagnostics.pm index 971cfd3..fce3fb4 100644 --- a/lib/Regexp/Parser/Diagnostics.pm +++ b/lib/Regexp/Parser/Diagnostics.pm @@ -34,5 +34,8 @@ use constant RPe_OUTPOS => --$ENUM, 'POSIX syntax [%s %s] belongs inside charact use constant RPe_EMPTYB => --$ENUM, 'Empty \%s{}'; use constant RPe_FRANGE => --$ENUM, 'False [] range "%s-%s"'; use constant RPe_IRANGE => --$ENUM, 'Invalid [] range "%s-%s"'; +use constant RPe_DUPLCH => --$ENUM, 'Regexp modifier "%s" may not appear twice'; +use constant RPe_EXCLCH => --$ENUM, 'Regexp modifiers "%s" and "%s" are mutually exclusive'; +use constant RPe_NEGCHR => --$ENUM, 'Regexp modifier "%s" may not appear after the "-"'; 1; diff --git a/lib/Regexp/Parser/Handlers.pm b/lib/Regexp/Parser/Handlers.pm index e700f73..4d959d2 100644 --- a/lib/Regexp/Parser/Handlers.pm +++ b/lib/Regexp/Parser/Handlers.pm @@ -734,10 +734,28 @@ sub init { &RxPOS -= length($on.$off); my $old = &RxPOS; + my $charset_seen = ''; # track first charset flag (a/d/l/u) for (split //, $on) { &RxPOS++; if (my $f = $S->can("FLAG_$_")) { my $v = $S->$f(1) and $r_on .= $_; + # charset flag conflict detection (Perl 5.14+) + if ($v && ($v & 0xF0)) { # this is a charset flag + if ($charset_seen) { + if ($charset_seen eq $_ && $_ eq 'a') { + # aa (strict ASCII) is valid — continue + } + elsif ($charset_seen eq $_) { + # dd, ll, uu — doubled non-a charset flag + $S->error($S->RPe_DUPLCH, $_); + } + else { + # different charset flags — mutually exclusive + $S->error($S->RPe_EXCLCH, $charset_seen, $_); + } + } + $charset_seen = $_; + } # /xx: if x is already on, set the xx bit (Perl 5.26+) if ($_ eq 'x' && ($f_on & $v)) { $f_on |= 0x200; # FLAG_xx @@ -756,6 +774,10 @@ sub init { &RxPOS++; if (my $f = $S->can("FLAG_$_")) { my $v = $S->$f(0) and $r_off .= $_; + # charset flags may not appear after - (Perl 5.14+) + if ($v && ($v & 0xF0)) { + $S->error($S->RPe_NEGCHR, $_); + } # -xx: also turn off the xx bit (Perl 5.26+) if ($_ eq 'x' && ($f_off & $v)) { $f_off |= 0x200; # FLAG_xx diff --git a/t/11errors.t b/t/11errors.t index a8f5233..143d9bb 100644 --- a/t/11errors.t +++ b/t/11errors.t @@ -263,6 +263,46 @@ warns_regex('(?o)', 'useless (?o) flag warning'); # Negative flag forms too warns_regex('(?-g)', 'useless (?-g) flag warning'); +## +## 9b. CHARSET FLAG CONFLICTS — RPe_DUPLCH, RPe_EXCLCH, RPe_NEGCHR +## + +# Doubled non-a charset flags (Perl rejects these) +fails_regex('(?dd:a)', ($r->RPe_DUPLCH)[0], 'doubled d flag'); +fails_regex('(?ll:a)', ($r->RPe_DUPLCH)[0], 'doubled l flag'); +fails_regex('(?uu:a)', ($r->RPe_DUPLCH)[0], 'doubled u flag'); + +# Mutually exclusive charset flags +fails_regex('(?al:a)', ($r->RPe_EXCLCH)[0], 'a and l mutually exclusive'); +fails_regex('(?ad:a)', ($r->RPe_EXCLCH)[0], 'a and d mutually exclusive'); +fails_regex('(?au:a)', ($r->RPe_EXCLCH)[0], 'a and u mutually exclusive'); +fails_regex('(?du:a)', ($r->RPe_EXCLCH)[0], 'd and u mutually exclusive'); +fails_regex('(?dl:a)', ($r->RPe_EXCLCH)[0], 'd and l mutually exclusive'); +fails_regex('(?lu:a)', ($r->RPe_EXCLCH)[0], 'l and u mutually exclusive'); + +# Charset flags not allowed after - (negation section) +fails_regex('(?-a:a)', ($r->RPe_NEGCHR)[0], 'charset a after -'); +fails_regex('(?-d:a)', ($r->RPe_NEGCHR)[0], 'charset d after -'); +fails_regex('(?-l:a)', ($r->RPe_NEGCHR)[0], 'charset l after -'); +fails_regex('(?-u:a)', ($r->RPe_NEGCHR)[0], 'charset u after -'); +fails_regex('(?i-a:a)', ($r->RPe_NEGCHR)[0], 'charset a after - with on-flags'); +fails_regex('(?i-al:a)', ($r->RPe_NEGCHR)[0], 'charset a after - mixed'); + +# Valid charset flag usage +parses_ok('(?aa:a)', 'aa (strict ASCII) is valid'); +parses_ok('(?a:a)', 'single a flag valid'); +parses_ok('(?d:a)', 'single d flag valid'); +parses_ok('(?l:a)', 'single l flag valid'); +parses_ok('(?u:a)', 'single u flag valid'); +parses_ok('(?ai:a)', 'charset a + modifier i valid'); +parses_ok('(?aai:a)', 'strict ASCII + modifier i valid'); +parses_ok('(?^a:a)', 'caret + charset a valid'); +parses_ok('(?^aa:a)', 'caret + strict ASCII valid'); + +# Caret with charset conflicts should also fail +fails_regex('(?^al:a)', ($r->RPe_EXCLCH)[0], 'caret + a and l exclusive'); +fails_regex('(?^du:a)', ($r->RPe_EXCLCH)[0], 'caret + d and u exclusive'); + ## ## 10. BAD ESCAPE IN CHARACTER CLASS WARNINGS — RPe_BADESC ##