Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 53 additions & 11 deletions src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,10 +201,13 @@ struct CompiledPattern {
/// entire haystack to find the longest. This makes it unsuitable for searching,
/// but it's perfect for a second, anchored match pass for POSIX semantics.
longest_anchored: Regex,
reject_invalid_utf8_matches: bool,
}

impl CompiledPattern {
fn compile(pattern: &str, config: &Config) -> UResult<Self> {
let bre_whitespace_escape =
config.regex_mode == RegexMode::Basic && has_bre_whitespace_escape(pattern);
let mut syntax = *match config.regex_mode {
RegexMode::Fixed => Syntax::asis(),
RegexMode::Basic => Syntax::grep(),
Expand All @@ -215,6 +218,10 @@ impl CompiledPattern {
// GNU grep supports `{,n}` as an alias for `{0,n}`.
syntax.enable_behavior(SyntaxBehavior::SYNTAX_BEHAVIOR_ALLOW_INTERVAL_LOW_ABBREV);
}
if bre_whitespace_escape {
// GNU grep treats `\s`/`\S` as whitespace shorthands in BRE mode.
syntax.enable_operators(SyntaxOperator::SYNTAX_OPERATOR_ESC_S_WHITE_SPACE);
}
if config.regex_mode == RegexMode::Perl {
// GNU grep supports `(?P<name>...)`.
// Unfortunately, the onig crate defines the OP2 flag without the
Expand Down Expand Up @@ -247,20 +254,28 @@ impl CompiledPattern {
Ok(Self {
leftmost,
longest_anchored,
reject_invalid_utf8_matches: bre_whitespace_escape,
})
}

/// Find the leftmost match starting at or after `offset`.
fn search_leftmost(&self, line: &[u8], offset: usize) -> Option<(usize, usize)> {
let mut region = Region::new();
self.leftmost.search_with_encoding(
EncodedBytes::from_parts(line, &raw mut OnigEncodingUTF8),
offset,
line.len(),
SearchOptions::SEARCH_OPTION_NONE,
Some(&mut region),
)?;
region.pos(0)
/// Find the leftmost match starting at or after `offset` that passes
/// `accepts_match`.
///
/// Returns the `(start, end)` byte range of the match, or `None` when the
/// regex finds nothing more in `line` or `offset` has moved past the end.
fn search_leftmost(&self, line: &[u8], mut offset: usize) -> Option<(usize, usize)> {
    while offset <= line.len() {
        let mut region = Region::new();
        self.leftmost.search_with_encoding(
            EncodedBytes::from_parts(line, &raw mut OnigEncodingUTF8),
            offset,
            line.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        )?;
        let (start, end) = region.pos(0)?;
        if self.accepts_match(line, start, end) {
            return Some((start, end));
        }
        // Resume one byte past the rejected start rather than at its end:
        // an acceptable match may begin inside the rejected span (e.g. a
        // rejected `\x82abc` still contains the acceptable `abc`), and
        // jumping to `end` would skip it. `start + 1 > offset` always
        // holds, so the loop still terminates.
        offset = start + 1;
    }
    None
}

/// Given a known leftmost start `start`, return the longest extent
Expand All @@ -273,11 +288,18 @@ impl CompiledPattern {
SearchOptions::SEARCH_OPTION_NONE,
Some(&mut region),
);
region.pos(0).map(|(_, end)| end)
region
.pos(0)
.filter(|&(start, end)| self.accepts_match(line, start, end))
.map(|(_, end)| end)
}

/// True if any match exists in `line` (including zero-length).
fn is_match(&self, line: &[u8]) -> bool {
if self.reject_invalid_utf8_matches {
return self.search_leftmost(line, 0).is_some();
}

self.leftmost
.search_with_encoding(
EncodedBytes::from_parts(line, &raw mut OnigEncodingUTF8),
Expand All @@ -288,4 +310,24 @@ impl CompiledPattern {
)
.is_some()
}

/// Decide whether the candidate match `line[start..end]` may be reported.
///
/// When `reject_invalid_utf8_matches` is unset every candidate is accepted;
/// otherwise the matched bytes must form valid UTF-8.
fn accepts_match(&self, line: &[u8], start: usize, end: usize) -> bool {
    if !self.reject_invalid_utf8_matches {
        return true;
    }
    let matched = &line[start..end];
    std::str::from_utf8(matched).is_ok()
}
}

/// Report whether `pattern` contains a `\s` or `\S` escape.
///
/// The pattern is scanned byte by byte. Each backslash consumes the byte
/// after it, so an escaped backslash followed by `s` (as in `\\s`) is not
/// counted, and a trailing lone backslash is ignored.
fn has_bre_whitespace_escape(pattern: &str) -> bool {
    let mut rest = pattern.bytes();
    while let Some(byte) = rest.next() {
        // A backslash always swallows its successor, whatever it is.
        if byte == b'\\' && matches!(rest.next(), Some(b's' | b'S')) {
            return true;
        }
    }
    false
}
22 changes: 22 additions & 0 deletions tests/test_grep.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,28 @@ fn bre_gnu_extensions() {
.succeeds()
.stdout_only("contain\n");

let (_s, mut c) = ucmd();
c.args(&[r"\s"])
.pipe_in("a b\nxy\n\tindented\n")
.succeeds()
.stdout_only("a b\n\tindented\n");

let (_s, mut c) = ucmd();
c.args(&[r"\S"])
.pipe_in("aS b\n \nx\n")
.succeeds()
.stdout_only("aS b\nx\n");

let (scene, _) = ucmd();
scene.fixtures.write_bytes("invalid-utf8", b"\x82\n");
for pattern in [r"^\s$", r"^\S$"] {
let mut c = scene.cmd(env!("CARGO_BIN_EXE_grep"));
c.args(&[pattern, "invalid-utf8"])
.fails_with_code(1)
.no_stdout()
.no_stderr();
}

// BRE backreference: repeated adjacent word.
let (_s, mut c) = ucmd();
c.args(&[r"\(\b\w\+\b\) \1"])
Expand Down
Loading