Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ t/14quotemeta.t
t/15boundary.t
t/16define.t
t/17extcharclass.t
t/18nonnewline.t
t/99misc.t
12 changes: 11 additions & 1 deletion lib/Regexp/Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,17 @@ sub named_captures {

sub nchar {
my $self = shift;
return map chr(/^\^(\S)/ ? (64 ^ ord $1) : charnames::vianame($_)), @_;
return map {
if (/^\^(\S)/) {
chr(64 ^ ord $1);
}
elsif (/^U\+([0-9a-fA-F]+)$/) {
chr(hex($1));
}
else {
chr(charnames::vianame($_));
}
} @_;
}


Expand Down
28 changes: 22 additions & 6 deletions lib/Regexp/Parser/Handlers.pm
Original file line number Diff line number Diff line change
Expand Up @@ -221,15 +221,31 @@ sub init {
$S->error($S->RPe_BRACES, 'g');
});

# named (named character)
# \N (not newline) or \N{NAME} / \N{U+HHHH} (named character)
$self->add_handler('\N' => sub {
my ($S, $cc) = @_;
$S->error($S->RPe_BRACES, 'N') if ${&Rx} !~ m{ \G \{ }xgc;
$S->error($S->RPe_RBRACE, 'N') if ${&Rx} !~ m{ \G ([^\}]*) \} }xgc;

my $name = $1;
return $S->force_object(anyof_char => $S->nchar($name), "\\N{$name}") if $cc;
return $S->object(exact => $S->nchar($name), "\\N{$name}");
# Check for \N{...}
if (${&Rx} =~ m{ \G \{ }xgc) {
# Disambiguate \N{3,5} (quantifier) from \N{NAME} (named char):
# if contents look like a quantifier pattern, back up and treat as bare \N
if (!$cc && ${&Rx} =~ m{ \G (?= \d+,?\d*\} ) }xgc) {
--&RxPOS; # un-consume the {
return $S->object(nonnewline =>);
}

$S->error($S->RPe_RBRACE, 'N') if ${&Rx} !~ m{ \G ([^\}]*) \} }xgc;

my $name = $1;
return $S->force_object(anyof_char => $S->nchar($name), "\\N{$name}") if $cc;
return $S->object(exact => $S->nchar($name), "\\N{$name}");
}

# bare \N = "not newline" (Perl 5.12+), not valid in character class
if ($cc) {
$S->error($S->RPe_BRACES, 'N');
}
return $S->object(nonnewline =>);
});

# \o{NNN} octal escape (Perl 5.14+)
Expand Down
32 changes: 31 additions & 1 deletion lib/Regexp/Parser/Objects.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1626,6 +1626,25 @@
}


{
# \N (not newline, added in Perl 5.12)
package Regexp::Parser::nonnewline;
our @ISA = qw( Regexp::Parser::__object__ );

sub new {
my ($class, $rx) = @_;
my $self = bless {
rx => $rx,
flags => $rx->{flags}[-1],
family => 'nonnewline',
type => 'nonnewline',
vis => '\N',
}, $class;
return $self;
}
}


{
# \R (generic linebreak, added in Perl 5.10)
package Regexp::Parser::lnbreak;
Expand Down Expand Up @@ -2105,7 +2124,7 @@ character class's ender is an C<anyof_close> node.
The general family of this object. These are any of: alnum, anchor,
anyof, anyof_char, anyof_class, anyof_range, assertion, branch,
charclass_expr, close, clump, digit, exact, flags, group, groupp,
grouppn, hspace, lnbreak, minmod, open, possessive, prop, quant,
grouppn, hspace, lnbreak, minmod, nonnewline, open, possessive, prop, quant,
recurse, ref, reg_any, verb, vspace.

=item my $f = $obj->flags()
Expand Down Expand Up @@ -2671,6 +2690,17 @@ Neg: 1 if negated
Vertical whitespace character class shorthand (Perl 5.10+). Matches
characters like newline, carriage return, form feed, and vertical tab.

=head2 nonnewline

Family: nonnewline

Types: nonnewline (C<\N>)

Not-newline assertion (Perl 5.12+). Matches any character except C<\n>,
regardless of the C</s> flag. Unlike C<.>, which matches newlines under
C</s>, C<\N> never matches a newline. Not valid inside character classes
(use C<\N{NAME}> or C<\N{U+HHHH}> there instead).

=head2 lnbreak

Family: lnbreak
Expand Down
8 changes: 8 additions & 0 deletions t/10roundtrip.t
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ my @patterns = (
"(?<foo>bar)\\k'foo'",
'(?<foo>bar)\\k{foo}',
'(?<x>a)(?P=x)',

# \N (not newline) and \N{NAME}
'\\N',
'\\N+',
'\\N{3,5}',
'\\N{SPACE}',
'\\N{U+0041}',
'\\N{LATIN SMALL LETTER A}',
);

plan tests => scalar(@patterns) * 2;
Expand Down
5 changes: 4 additions & 1 deletion t/11errors.t
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,10 @@ fails_regex('\\', ($r->RPe_ESLASH)[0], 'lone backslash');

# Missing braces on \g, \N
fails_regex('\\g', ($r->RPe_BRACES)[0], '\\g without braces');
fails_regex('\\N', ($r->RPe_BRACES)[0], '\\N without braces');
# bare \N is valid since Perl 5.12 (means "not newline")
parses_ok('\\N', '\\N bare (not newline)');
# but bare \N inside character class is still an error
fails_regex('[\\N]', ($r->RPe_BRACES)[0], '\\N without braces in char class');

# Missing right brace
fails_regex('\\x{abc', ($r->RPe_RBRACE)[0], '\\x{... missing right brace');
Expand Down
174 changes: 174 additions & 0 deletions t/18nonnewline.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
use strict;
use warnings;
use Test::More;
use Regexp::Parser;

# Tests for \N handling:
# - \N bare = "not newline" (Perl 5.12+)
# - \N{NAME} = named character
# - \N{U+HHHH} = Unicode code point

my $r = Regexp::Parser->new;

##
## 1. Bare \N (not newline)
##

{
my $ok = $r->regex('\N');
ok($ok, '\\N bare parses successfully');
is($r->visual, '\N', '\\N visual round-trips');

# Check node type
my $w = $r->walker;
my $node = $w->();
ok($node, '\\N produces a node');
is($node->family, 'nonnewline', '\\N node family is nonnewline');
is($node->type, 'nonnewline', '\\N node type is nonnewline');
is($node->visual, '\N', '\\N node visual is \\N');
}

# \N in a larger pattern
{
$r->regex('a\Nb');
is($r->visual, 'a\Nb', '\\N in context: a\\Nb');
}

# \N with quantifiers
{
$r->regex('\N+');
is($r->visual, '\N+', '\\N+ (one or more non-newlines)');
}

{
$r->regex('\N*');
is($r->visual, '\N*', '\\N* (zero or more non-newlines)');
}

{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{3,5}');
is($r->visual, '\N{3,5}', '\\N{3,5} (quantifier, not named char)');
is(scalar @warnings, 0, '\\N{3,5} produces no warnings');

# Verify structure: quantifier wrapping nonnewline
my $w = $r->walker;
my $node = $w->();
is($node->family, 'quant', '\\N{3,5} quantifier node present');
my $inner = $w->();
is($inner->family, 'nonnewline', '\\N{3,5} inner node is nonnewline');
}

# \N{3} is also a quantifier, not a named char
{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{3}');
is($r->visual, '\N{3}', '\\N{3} (exact quantifier)');
is(scalar @warnings, 0, '\\N{3} produces no warnings');
}

# Round-trip: parse -> visual -> re-parse -> visual
{
$r->regex('\N');
my $v1 = $r->visual;
$r->regex($v1);
my $v2 = $r->visual;
is($v2, $v1, '\\N round-trips correctly');
}

{
$r->regex('^\N+$');
my $v1 = $r->visual;
$r->regex($v1);
my $v2 = $r->visual;
is($v2, $v1, '^\N+$ round-trips correctly');
}

##
## 2. \N{NAME} (named character)
##

{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{SPACE}');
is($r->visual, '\N{SPACE}', '\\N{SPACE} visual');
is(scalar @warnings, 0, '\\N{SPACE} produces no warnings');
}

{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{LATIN SMALL LETTER A}');
is($r->visual, '\N{LATIN SMALL LETTER A}', '\\N{LATIN SMALL LETTER A} visual');
is(scalar @warnings, 0, '\\N{LATIN SMALL LETTER A} no warnings');
}

##
## 3. \N{U+HHHH} (Unicode code point)
##

{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{U+0041}');
is($r->visual, '\N{U+0041}', '\\N{U+0041} visual');
is(scalar @warnings, 0, '\\N{U+0041} produces no warnings');

# Check the node contains the right character
my $w = $r->walker;
my $node = $w->();
ok($node, '\\N{U+0041} produces a node');
is($node->visual, '\N{U+0041}', '\\N{U+0041} node visual');
is($node->data, 'A', '\\N{U+0041} resolves to "A"');
}

{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('\N{U+0020}');
is($r->visual, '\N{U+0020}', '\\N{U+0020} visual (space)');
is(scalar @warnings, 0, '\\N{U+0020} produces no warnings');

my $w = $r->walker;
my $node = $w->();
is($node->data, ' ', '\\N{U+0020} resolves to space');
}

# \N{U+HHHH} inside character class
{
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, $_[0] };
$r->regex('[\N{U+0041}]');
is($r->visual, '[\N{U+0041}]', '\\N{U+0041} in character class');
is(scalar @warnings, 0, '\\N{U+0041} in char class no warnings');
}

# \N{U+HHHH} round-trip
{
$r->regex('\N{U+0041}');
my $v1 = $r->visual;
$r->regex($v1);
my $v2 = $r->visual;
is($v2, $v1, '\\N{U+0041} round-trips correctly');
}

##
## 4. Error cases preserved
##

# \N bare inside character class is still an error
{
my $ok = $r->regex('[\N]');
ok(!$ok, '\\N bare inside character class is rejected');
}

# \N{ without closing brace
{
my $ok = $r->regex('\N{SPACE');
ok(!$ok, '\\N{SPACE (unclosed) is rejected');
}

done_testing;
Loading