diff --git a/README b/README index dfc5d78..f5b637f 100644 --- a/README +++ b/README @@ -305,6 +305,15 @@ ERROR HANDLING RPe_IRANGE (-28) Invalid [] range "%s-%s" + RPe_DUPLCH (-29) + Regexp modifier "%s" may not appear twice + + RPe_EXCLCH (-30) + Regexp modifiers "%s" and "%s" are mutually exclusive + + RPe_NEGCHR (-31) + Regexp modifier "%s" may not appear after the "-" + EXTENSIONS Here are some ideas for extensions (sub-classes) for this module. Some of them may be absorbed into the core functionality of Regexp::Parser in diff --git a/lib/Regexp/Parser.pm b/lib/Regexp/Parser.pm index 9412df5..d007a99 100644 --- a/lib/Regexp/Parser.pm +++ b/lib/Regexp/Parser.pm @@ -706,6 +706,18 @@ False [] range "%s-%s" Invalid [] range "%s-%s" +=item RPe_DUPLCH (-29) + +Regexp modifier "%s" may not appear twice + +=item RPe_EXCLCH (-30) + +Regexp modifiers "%s" and "%s" are mutually exclusive + +=item RPe_NEGCHR (-31) + +Regexp modifier "%s" may not appear after the "-" + =back =head1 EXTENSIONS diff --git a/lib/Regexp/Parser/Diagnostics.pm b/lib/Regexp/Parser/Diagnostics.pm index 971cfd3..fce3fb4 100644 --- a/lib/Regexp/Parser/Diagnostics.pm +++ b/lib/Regexp/Parser/Diagnostics.pm @@ -34,5 +34,8 @@ use constant RPe_OUTPOS => --$ENUM, 'POSIX syntax [%s %s] belongs inside charact use constant RPe_EMPTYB => --$ENUM, 'Empty \%s{}'; use constant RPe_FRANGE => --$ENUM, 'False [] range "%s-%s"'; use constant RPe_IRANGE => --$ENUM, 'Invalid [] range "%s-%s"'; +use constant RPe_DUPLCH => --$ENUM, 'Regexp modifier "%s" may not appear twice'; +use constant RPe_EXCLCH => --$ENUM, 'Regexp modifiers "%s" and "%s" are mutually exclusive'; +use constant RPe_NEGCHR => --$ENUM, 'Regexp modifier "%s" may not appear after the "-"'; 1; diff --git a/lib/Regexp/Parser/Handlers.pm b/lib/Regexp/Parser/Handlers.pm index e700f73..4d959d2 100644 --- a/lib/Regexp/Parser/Handlers.pm +++ b/lib/Regexp/Parser/Handlers.pm @@ -734,10 +734,28 @@ sub init { &RxPOS -= length($on.$off); my $old = &RxPOS; + my $charset_seen = ''; # track first charset flag (a/d/l/u) for (split //, $on) { &RxPOS++; if (my $f = $S->can("FLAG_$_")) { my $v = $S->$f(1) and $r_on .= $_; + # charset flag conflict detection (Perl 5.14+) + if ($v && ($v & 0xF0)) { # this is a charset flag + if ($charset_seen) { + if ($charset_seen eq $_ && $_ eq 'a') { + # aa (strict ASCII) is valid — continue + } + elsif ($charset_seen eq $_) { + # dd, ll, uu — doubled non-a charset flag + $S->error($S->RPe_DUPLCH, $_); + } + else { + # different charset flags — mutually exclusive + $S->error($S->RPe_EXCLCH, $charset_seen, $_); + } + } + $charset_seen = $_; + } # /xx: if x is already on, set the xx bit (Perl 5.26+) if ($_ eq 'x' && ($f_on & $v)) { $f_on |= 0x200; # FLAG_xx @@ -756,6 +774,10 @@ sub init { &RxPOS++; if (my $f = $S->can("FLAG_$_")) { my $v = $S->$f(0) and $r_off .= $_; + # charset flags may not appear after - (Perl 5.14+) + if ($v && ($v & 0xF0)) { + $S->error($S->RPe_NEGCHR, $_); + } # -xx: also turn off the xx bit (Perl 5.26+) if ($_ eq 'x' && ($f_off & $v)) { $f_off |= 0x200; # FLAG_xx diff --git a/t/11errors.t b/t/11errors.t index a8f5233..143d9bb 100644 --- a/t/11errors.t +++ b/t/11errors.t @@ -263,6 +263,46 @@ warns_regex('(?o)', 'useless (?o) flag warning'); # Negative flag forms too warns_regex('(?-g)', 'useless (?-g) flag warning'); +## +## 9b. CHARSET FLAG CONFLICTS — RPe_DUPLCH, RPe_EXCLCH, RPe_NEGCHR +## + +# Doubled non-a charset flags (Perl rejects these) +fails_regex('(?dd:a)', ($r->RPe_DUPLCH)[0], 'doubled d flag'); +fails_regex('(?ll:a)', ($r->RPe_DUPLCH)[0], 'doubled l flag'); +fails_regex('(?uu:a)', ($r->RPe_DUPLCH)[0], 'doubled u flag'); + +# Mutually exclusive charset flags +fails_regex('(?al:a)', ($r->RPe_EXCLCH)[0], 'a and l mutually exclusive'); +fails_regex('(?ad:a)', ($r->RPe_EXCLCH)[0], 'a and d mutually exclusive'); +fails_regex('(?au:a)', ($r->RPe_EXCLCH)[0], 'a and u mutually exclusive'); +fails_regex('(?du:a)', ($r->RPe_EXCLCH)[0], 'd and u mutually exclusive'); +fails_regex('(?dl:a)', ($r->RPe_EXCLCH)[0], 'd and l mutually exclusive'); +fails_regex('(?lu:a)', ($r->RPe_EXCLCH)[0], 'l and u mutually exclusive'); + +# Charset flags not allowed after - (negation section) +fails_regex('(?-a:a)', ($r->RPe_NEGCHR)[0], 'charset a after -'); +fails_regex('(?-d:a)', ($r->RPe_NEGCHR)[0], 'charset d after -'); +fails_regex('(?-l:a)', ($r->RPe_NEGCHR)[0], 'charset l after -'); +fails_regex('(?-u:a)', ($r->RPe_NEGCHR)[0], 'charset u after -'); +fails_regex('(?i-a:a)', ($r->RPe_NEGCHR)[0], 'charset a after - with on-flags'); +fails_regex('(?i-al:a)', ($r->RPe_NEGCHR)[0], 'charset a after - mixed'); + +# Valid charset flag usage +parses_ok('(?aa:a)', 'aa (strict ASCII) is valid'); +parses_ok('(?a:a)', 'single a flag valid'); +parses_ok('(?d:a)', 'single d flag valid'); +parses_ok('(?l:a)', 'single l flag valid'); +parses_ok('(?u:a)', 'single u flag valid'); +parses_ok('(?ai:a)', 'charset a + modifier i valid'); +parses_ok('(?aai:a)', 'strict ASCII + modifier i valid'); +parses_ok('(?^a:a)', 'caret + charset a valid'); +parses_ok('(?^aa:a)', 'caret + strict ASCII valid'); + +# Caret with charset conflicts should also fail +fails_regex('(?^al:a)', ($r->RPe_EXCLCH)[0], 'caret + a and l exclusive'); +fails_regex('(?^du:a)', ($r->RPe_EXCLCH)[0], 'caret + d and u exclusive'); + ## ## 10. BAD ESCAPE IN CHARACTER CLASS WARNINGS — RPe_BADESC ##