From 916c07aecd7277d126d71ca4181bced1bb20991c Mon Sep 17 00:00:00 2001 From: Toddr Bot Date: Tue, 31 Mar 2026 08:03:48 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20fix=20\N=20escape=20handling=20?= =?UTF-8?q?=E2=80=94=20bare=20\N,=20\N{U+HHHH},=20quantifier=20disambiguat?= =?UTF-8?q?ion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs fixed in \N handling: 1. Bare \N now parses as "not newline" (Perl 5.12+) instead of erroring. Creates a nonnewline node type. Still errors inside character classes, matching Perl's behavior. 2. \N{U+HHHH} no longer produces "isn't numeric in chr" warnings. The nchar() method now detects U+HHHH format and uses chr(hex()) directly instead of passing through charnames::vianame() which returns the character (not code point) for this format. 3. \N{3,5} is now correctly parsed as \N + quantifier {3,5}, not as a named character lookup for "3,5". The handler checks if braced content looks like a quantifier pattern before consuming it. Co-Authored-By: Claude Opus 4.6 --- MANIFEST | 1 + lib/Regexp/Parser.pm | 12 ++- lib/Regexp/Parser/Handlers.pm | 28 ++++-- lib/Regexp/Parser/Objects.pm | 32 ++++++- t/10roundtrip.t | 8 ++ t/11errors.t | 5 +- t/18nonnewline.t | 174 ++++++++++++++++++++++++++++++++++ 7 files changed, 251 insertions(+), 9 deletions(-) create mode 100644 t/18nonnewline.t diff --git a/MANIFEST b/MANIFEST index fc423c7..7fc2304 100644 --- a/MANIFEST +++ b/MANIFEST @@ -29,4 +29,5 @@ t/14quotemeta.t t/15boundary.t t/16define.t t/17extcharclass.t +t/18nonnewline.t t/99misc.t diff --git a/lib/Regexp/Parser.pm b/lib/Regexp/Parser.pm index 9412df5..6df114e 100644 --- a/lib/Regexp/Parser.pm +++ b/lib/Regexp/Parser.pm @@ -137,7 +137,17 @@ sub named_captures { sub nchar { my $self = shift; - return map chr(/^\^(\S)/ ? (64 ^ ord $1) : charnames::vianame($_)), @_; + return map { + if (/^\^(\S)/) { + chr(64 ^ ord $1); + } + elsif (/^U\+([0-9a-fA-F]+)$/) { + chr(hex($1)); + } + else { + chr(charnames::vianame($_)); + } + } @_; } diff --git a/lib/Regexp/Parser/Handlers.pm b/lib/Regexp/Parser/Handlers.pm index e700f73..4ff1278 100644 --- a/lib/Regexp/Parser/Handlers.pm +++ b/lib/Regexp/Parser/Handlers.pm @@ -221,15 +221,31 @@ sub init { $S->error($S->RPe_BRACES, 'g'); }); - # named (named character) + # \N (not newline) or \N{NAME} / \N{U+HHHH} (named character) $self->add_handler('\N' => sub { my ($S, $cc) = @_; - $S->error($S->RPe_BRACES, 'N') if ${&Rx} !~ m{ \G \{ }xgc; - $S->error($S->RPe_RBRACE, 'N') if ${&Rx} !~ m{ \G ([^\}]*) \} }xgc; - my $name = $1; - return $S->force_object(anyof_char => $S->nchar($name), "\\N{$name}") if $cc; - return $S->object(exact => $S->nchar($name), "\\N{$name}"); + # Check for \N{...} + if (${&Rx} =~ m{ \G \{ }xgc) { + # Disambiguate \N{3,5} (quantifier) from \N{NAME} (named char): + # if contents look like a quantifier pattern, back up and treat as bare \N + if (!$cc && ${&Rx} =~ m{ \G (?= \d+,?\d*\} ) }xgc) { + --&RxPOS; # un-consume the { + return $S->object(nonnewline =>); + } + + $S->error($S->RPe_RBRACE, 'N') if ${&Rx} !~ m{ \G ([^\}]*) \} }xgc; + + my $name = $1; + return $S->force_object(anyof_char => $S->nchar($name), "\\N{$name}") if $cc; + return $S->object(exact => $S->nchar($name), "\\N{$name}"); + } + + # bare \N = "not newline" (Perl 5.12+), not valid in character class + if ($cc) { + $S->error($S->RPe_BRACES, 'N'); + } + return $S->object(nonnewline =>); }); # \o{NNN} octal escape (Perl 5.14+) diff --git a/lib/Regexp/Parser/Objects.pm b/lib/Regexp/Parser/Objects.pm index cea754c..10f28cb 100644 --- a/lib/Regexp/Parser/Objects.pm +++ b/lib/Regexp/Parser/Objects.pm @@ -1626,6 +1626,25 @@ } +{ + # \N (not newline, added in Perl 5.12) + package Regexp::Parser::nonnewline; + our @ISA = qw( Regexp::Parser::__object__ ); + + sub new { + my ($class, $rx) = @_; + my $self = bless { + rx => $rx, + flags => $rx->{flags}[-1], + family => 'nonnewline', + type => 'nonnewline', + vis => '\N', + }, $class; + return $self; + } +} + + { # \R (generic linebreak, added in Perl 5.10) package Regexp::Parser::lnbreak; @@ -2105,7 +2124,7 @@ character class's ender is an C node. The general family of this object. These are any of: alnum, anchor, anyof, anyof_char, anyof_class, anyof_range, assertion, branch, charclass_expr, close, clump, digit, exact, flags, group, groupp, -grouppn, hspace, lnbreak, minmod, open, possessive, prop, quant, +grouppn, hspace, lnbreak, minmod, nonnewline, open, possessive, prop, quant, recurse, ref, reg_any, verb, vspace. =item my $f = $obj->flags() @@ -2671,6 +2690,17 @@ Neg: 1 if negated Vertical whitespace character class shorthand (Perl 5.10+). Matches characters like newline, carriage return, form feed, and vertical tab. +=head2 nonnewline + +Family: nonnewline + +Types: nonnewline (C<\N>) + +Not-newline assertion (Perl 5.12+). Matches any character except C<\n>, +regardless of the C flag. Unlike C<.>, which matches newlines under +C, C<\N> never matches a newline. Not valid inside character classes +(use C<\N{NAME}> or C<\N{U+HHHH}> there instead). + =head2 lnbreak Family: lnbreak diff --git a/t/10roundtrip.t b/t/10roundtrip.t index d1167cf..eefd373 100644 --- a/t/10roundtrip.t +++ b/t/10roundtrip.t @@ -158,6 +158,14 @@ my @patterns = ( "(?bar)\\k'foo'", '(?bar)\\k{foo}', '(?a)(?P=x)', + + # \N (not newline) and \N{NAME} + '\\N', + '\\N+', + '\\N{3,5}', + '\\N{SPACE}', + '\\N{U+0041}', + '\\N{LATIN SMALL LETTER A}', ); plan tests => scalar(@patterns) * 2; diff --git a/t/11errors.t b/t/11errors.t index a8f5233..dc65500 100644 --- a/t/11errors.t +++ b/t/11errors.t @@ -145,7 +145,10 @@ fails_regex('\\', ($r->RPe_ESLASH)[0], 'lone backslash'); # Missing braces on \g, \N fails_regex('\\g', ($r->RPe_BRACES)[0], '\\g without braces'); -fails_regex('\\N', ($r->RPe_BRACES)[0], '\\N without braces'); +# bare \N is valid since Perl 5.12 (means "not newline") +parses_ok('\\N', '\\N bare (not newline)'); +# but bare \N inside character class is still an error +fails_regex('[\\N]', ($r->RPe_BRACES)[0], '\\N without braces in char class'); # Missing right brace fails_regex('\\x{abc', ($r->RPe_RBRACE)[0], '\\x{... missing right brace'); diff --git a/t/18nonnewline.t b/t/18nonnewline.t new file mode 100644 index 0000000..8da76fa --- /dev/null +++ b/t/18nonnewline.t @@ -0,0 +1,174 @@ +use strict; +use warnings; +use Test::More; +use Regexp::Parser; + +# Tests for \N handling: +# - \N bare = "not newline" (Perl 5.12+) +# - \N{NAME} = named character +# - \N{U+HHHH} = Unicode code point + +my $r = Regexp::Parser->new; + +## +## 1. Bare \N (not newline) +## + +{ + my $ok = $r->regex('\N'); + ok($ok, '\\N bare parses successfully'); + is($r->visual, '\N', '\\N visual round-trips'); + + # Check node type + my $w = $r->walker; + my $node = $w->(); + ok($node, '\\N produces a node'); + is($node->family, 'nonnewline', '\\N node family is nonnewline'); + is($node->type, 'nonnewline', '\\N node type is nonnewline'); + is($node->visual, '\N', '\\N node visual is \\N'); +} + +# \N in a larger pattern +{ + $r->regex('a\Nb'); + is($r->visual, 'a\Nb', '\\N in context: a\\Nb'); +} + +# \N with quantifiers +{ + $r->regex('\N+'); + is($r->visual, '\N+', '\\N+ (one or more non-newlines)'); +} + +{ + $r->regex('\N*'); + is($r->visual, '\N*', '\\N* (zero or more non-newlines)'); +} + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{3,5}'); + is($r->visual, '\N{3,5}', '\\N{3,5} (quantifier, not named char)'); + is(scalar @warnings, 0, '\\N{3,5} produces no warnings'); + + # Verify structure: quantifier wrapping nonnewline + my $w = $r->walker; + my $node = $w->(); + is($node->family, 'quant', '\\N{3,5} quantifier node present'); + my $inner = $w->(); + is($inner->family, 'nonnewline', '\\N{3,5} inner node is nonnewline'); +} + +# \N{3} is also a quantifier, not a named char +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{3}'); + is($r->visual, '\N{3}', '\\N{3} (exact quantifier)'); + is(scalar @warnings, 0, '\\N{3} produces no warnings'); +} + +# Round-trip: parse -> visual -> re-parse -> visual +{ + $r->regex('\N'); + my $v1 = $r->visual; + $r->regex($v1); + my $v2 = $r->visual; + is($v2, $v1, '\\N round-trips correctly'); +} + +{ + $r->regex('^\N+$'); + my $v1 = $r->visual; + $r->regex($v1); + my $v2 = $r->visual; + is($v2, $v1, '^\N+$ round-trips correctly'); +} + +## +## 2. \N{NAME} (named character) +## + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{SPACE}'); + is($r->visual, '\N{SPACE}', '\\N{SPACE} visual'); + is(scalar @warnings, 0, '\\N{SPACE} produces no warnings'); +} + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{LATIN SMALL LETTER A}'); + is($r->visual, '\N{LATIN SMALL LETTER A}', '\\N{LATIN SMALL LETTER A} visual'); + is(scalar @warnings, 0, '\\N{LATIN SMALL LETTER A} no warnings'); +} + +## +## 3. \N{U+HHHH} (Unicode code point) +## + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{U+0041}'); + is($r->visual, '\N{U+0041}', '\\N{U+0041} visual'); + is(scalar @warnings, 0, '\\N{U+0041} produces no warnings'); + + # Check the node contains the right character + my $w = $r->walker; + my $node = $w->(); + ok($node, '\\N{U+0041} produces a node'); + is($node->visual, '\N{U+0041}', '\\N{U+0041} node visual'); + is($node->data, 'A', '\\N{U+0041} resolves to "A"'); +} + +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('\N{U+0020}'); + is($r->visual, '\N{U+0020}', '\\N{U+0020} visual (space)'); + is(scalar @warnings, 0, '\\N{U+0020} produces no warnings'); + + my $w = $r->walker; + my $node = $w->(); + is($node->data, ' ', '\\N{U+0020} resolves to space'); +} + +# \N{U+HHHH} inside character class +{ + my @warnings; + local $SIG{__WARN__} = sub { push @warnings, $_[0] }; + $r->regex('[\N{U+0041}]'); + is($r->visual, '[\N{U+0041}]', '\\N{U+0041} in character class'); + is(scalar @warnings, 0, '\\N{U+0041} in char class no warnings'); +} + +# \N{U+HHHH} round-trip +{ + $r->regex('\N{U+0041}'); + my $v1 = $r->visual; + $r->regex($v1); + my $v2 = $r->visual; + is($v2, $v1, '\\N{U+0041} round-trips correctly'); +} + +## +## 4. Error cases preserved +## + +# \N bare inside character class is still an error +{ + my $ok = $r->regex('[\N]'); + ok(!$ok, '\\N bare inside character class is rejected'); +} + +# \N{ without closing brace +{ + my $ok = $r->regex('\N{SPACE'); + ok(!$ok, '\\N{SPACE (unclosed) is rejected'); +} + +done_testing;