From 28186e9ec3867c70cbdaa6988fbb9e24b95a2195 Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 31 May 2026 10:06:09 +0200 Subject: [PATCH 1/2] perf: buffer-at-a-time search for literal patterns Literal searches were ~50-70x slower than GNU grep because every line paid per-line costs (terminator scan, NUL scan, dispatch) even when a buffer held no match. Add a buffer-at-a-time driver that scans whole chunks with a substring searcher and only locates line boundaries around the matches it finds; a chunk with no match costs a single vectorized sweep and no per-line work. The driver activates only for plain ASCII literal patterns (case sensitive; free of regex metacharacters unless -F makes every byte literal) in the simpler output modes: -c, -l, -L, -q, and plain line printing with -n/-b/filename/-m. Anything needing match positions, context, inversion, color, whole-word or whole-line matching (-w/-x), NUL-terminated records (-z), or --binary-files=without-match falls back to the unchanged line-at-a-time path. Output stays byte-identical to that path, including binary/invalid-UTF-8 behavior. - line_buffer: read_chunk() yields the largest span of complete lines. - matcher: expose per-pattern memmem searchers when every pattern is a plain literal (plain_literal()). - searcher: eligible_for_fast_path(), fast_locate(), fast_print(). All scanning rides on the memchr crate (SIMD memchr/memrchr/memmem). Unit tests for read_chunk and plain_literal; integration tests for prefixes, -m, and multi-chunk line-number correctness. Benchmarks (31 MB corpus) vs prior release: -F (no match): 232ms -> 15ms (15.9x; now faster than GNU) -c literal: 229ms -> 15ms (15.2x) plain print: 248ms -> 18ms (13.5x) Regex and -i paths are unchanged (still the line-at-a-time engine). 
--- src/lib.rs | 2 +- src/line_buffer.rs | 191 +++++++++++++++++++++++++++++++++++- src/matcher.rs | 99 ++++++++++++++++++- src/searcher.rs | 240 ++++++++++++++++++++++++++++++++++++++++++++- tests/test_grep.rs | 67 +++++++++++++ 5 files changed, 595 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0f2b63a..e34bd0a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,7 +22,7 @@ use std::io::{IsTerminal as _, Read}; use std::path::Path; use uucore::error::{FromIo, UResult, USimpleError}; -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] #[doc(hidden)] pub enum RegexMode { Fixed, diff --git a/src/line_buffer.rs b/src/line_buffer.rs index 54e7057..51ee4da 100644 --- a/src/line_buffer.rs +++ b/src/line_buffer.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use memchr::memchr; +use memchr::{memchr, memrchr}; use std::fs::File; use std::io::{self, Read as _}; @@ -111,4 +111,193 @@ impl LineBuffer { self.end += n; } } + + /// Read the next run of *complete* lines as a single slice. + /// + /// Returns `Ok(None)` at end of input. Otherwise returns `Ok(Some((chunk, + /// chunk_start)))`, where `chunk` spans one or more whole lines (each ending + /// in the terminator) and `chunk_start` is the absolute byte offset of the + /// first byte of the chunk. The only exception is a final line lacking a + /// terminator, which is returned on its own as the last chunk. + /// + /// This hands back as much buffered data as ends on a line boundary, so a + /// caller can scan many lines with one pass instead of line by line. + pub fn read_chunk(&mut self, file: &mut File) -> io::Result> { + loop { + // Hand back everything up to and including the last terminator. 
+ if self.end > self.beg + && let Some(off) = memrchr(self.line_terminator, &self.buffer[self.beg..self.end]) + { + let beg = self.beg; + let lim = self.beg + off + 1; + let chunk_start = self.next_line_start; + self.next_line_start += (lim - beg) as u64; + self.beg = lim; + self.scan = lim; + return Ok(Some((&self.buffer[beg..lim], chunk_start))); + } + + // No whole line buffered. At EOF, flush any unterminated remainder. + if self.eof { + if self.beg == self.end { + return Ok(None); + } + let beg = self.beg; + let chunk_start = self.next_line_start; + self.next_line_start += (self.end - beg) as u64; + self.beg = self.end; + self.scan = self.end; + return Ok(Some((&self.buffer[beg..self.end], chunk_start))); + } + + // Slide the partial tail to the front to maximize room for reading. + if self.beg > 0 { + self.buffer.copy_within(self.beg..self.end, 0); + self.end -= self.beg; + self.beg = 0; + self.scan = 0; + } + if self.end == self.buffer.len() { + // A single line is longer than the whole buffer; grow it. + self.buffer.resize(self.buffer.len() * 2, 0); + } + + let n = loop { + match file.read(&mut self.buffer[self.end..]) { + Ok(n) => break n, + Err(e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => return Err(e), + } + }; + if n == 0 { + self.eof = true; + } else { + self.end += n; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::{Seek as _, SeekFrom, Write as _}; + use std::sync::atomic::{AtomicU32, Ordering}; + + static COUNTER: AtomicU32 = AtomicU32::new(0); + + /// A temp file pre-loaded with `content`, rewound to the start, and removed + /// from disk when dropped. 
+ struct TempInput { + file: File, + path: std::path::PathBuf, + } + + impl Drop for TempInput { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } + } + + fn temp_input(content: &[u8]) -> TempInput { + let mut path = std::env::temp_dir(); + let n = COUNTER.fetch_add(1, Ordering::Relaxed); + path.push(format!("uu_grep_lb_{}_{n}.tmp", std::process::id())); + let mut file = std::fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path) + .unwrap(); + file.write_all(content).unwrap(); + file.seek(SeekFrom::Start(0)).unwrap(); + TempInput { file, path } + } + + /// Drain `read_chunk` into a list of (owned bytes, start offset) pairs. + fn chunks(term: u8, content: &[u8]) -> Vec<(Vec, u64)> { + let mut lb = LineBuffer::new(term); + let mut input = temp_input(content); + let mut out = Vec::new(); + while let Some((chunk, start)) = lb.read_chunk(&mut input.file).unwrap() { + out.push((chunk.to_vec(), start)); + } + out + } + + #[test] + fn empty_input_yields_nothing() { + assert!(chunks(b'\n', b"").is_empty()); + } + + #[test] + fn whole_complete_lines_come_back_as_one_chunk() { + // Small input arrives in a single read, so everything up to the final + // terminator is one chunk starting at offset 0. + assert_eq!( + chunks(b'\n', b"a\nbb\nccc\n"), + vec![(b"a\nbb\nccc\n".to_vec(), 0)] + ); + } + + #[test] + fn unterminated_tail_is_a_final_chunk_with_its_own_offset() { + // "a\n" is the complete-line chunk; "bb" is flushed at EOF at offset 2. 
+ assert_eq!( + chunks(b'\n', b"a\nbb"), + vec![(b"a\n".to_vec(), 0), (b"bb".to_vec(), 2)] + ); + } + + #[test] + fn input_without_any_terminator_is_one_chunk() { + assert_eq!(chunks(b'\n', b"abc"), vec![(b"abc".to_vec(), 0)]); + } + + #[test] + fn honors_a_custom_terminator() { + assert_eq!( + chunks(b'\0', b"a\0bb\0c"), + vec![(b"a\0bb\0".to_vec(), 0), (b"c".to_vec(), 5)] + ); + } + + #[test] + fn reassembles_input_larger_than_the_buffer() { + // Force many reads and at least one chunk boundary mid-file. + let mut content = Vec::new(); + for i in 0..50_000u32 { + content.extend_from_slice(format!("line number {i}\n").as_bytes()); + } + assert!(content.len() > 128 * 1024); + + let got = chunks(b'\n', &content); + assert!(got.len() > 1, "expected multiple chunks, got {}", got.len()); + + // Chunks must tile the input exactly, contiguously, each ending on a + // line boundary (the input ends with a terminator). + let mut expected_start = 0u64; + let mut joined = Vec::new(); + for (bytes, start) in &got { + assert_eq!(*start, expected_start); + assert_eq!(*bytes.last().unwrap(), b'\n'); + expected_start += bytes.len() as u64; + joined.extend_from_slice(bytes); + } + assert_eq!(joined, content); + } + + #[test] + fn grows_to_hold_a_single_overlong_line() { + // One line far bigger than the initial 128 KiB buffer, then a short one. + let mut content = vec![b'x'; 300 * 1024]; + content.push(b'\n'); + content.extend_from_slice(b"tail\n"); + + let got = chunks(b'\n', &content); + let joined: Vec = got.iter().flat_map(|(b, _)| b.clone()).collect(); + assert_eq!(joined, content); + assert_eq!(got[0].1, 0); + } } diff --git a/src/matcher.rs b/src/matcher.rs index d9cf846..604259c 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -4,6 +4,7 @@ // file that was distributed with this source code. 
use crate::{Config, RegexMode}; +use memchr::memmem; use onig::{ EncodedBytes, Regex, RegexOptions, Region, SearchOptions, Syntax, SyntaxBehavior, SyntaxOperator, @@ -14,6 +15,12 @@ use uucore::error::{UResult, USimpleError}; pub struct Matcher<'a> { config: &'a Config<'a>, patterns: Vec, + /// One substring searcher per pattern, present only when *every* pattern is + /// a plain literal that a raw byte search resolves exactly (see + /// [`plain_literal`]). When set, a caller can decide a line matches by + /// looking for any of these needles, bypassing the regex engine entirely. + /// `None` as soon as a single pattern needs real regex evaluation. + literal_searchers: Option>>, } impl<'a> Matcher<'a> { @@ -22,7 +29,32 @@ impl<'a> Matcher<'a> { for raw in config.patterns { patterns.push(CompiledPattern::compile(raw, config)?); } - Ok(Self { config, patterns }) + + // If we can reduce the whole pattern set to literal needles, keep a + // searcher for each so the driver can take a bulk substring-scan path. + let needles: Option>> = config + .patterns + .iter() + .map(|p| plain_literal(p, config.ignore_case, config.regex_mode)) + .collect(); + let literal_searchers = needles.filter(|n| !n.is_empty()).map(|n| { + n.iter() + .map(|w| memmem::Finder::new(w).into_owned()) + .collect() + }); + + Ok(Self { + config, + patterns, + literal_searchers, + }) + } + + /// Per-pattern substring searchers, present only when the pattern set is a + /// pure set of literals (no regex needed). Used by the searcher to scan a + /// whole buffer at once instead of testing line by line. + pub fn literal_searchers(&self) -> Option<&[memmem::Finder<'static>]> { + self.literal_searchers.as_deref() } /// Decide whether `line` matches and return the positions to highlight. @@ -194,6 +226,25 @@ impl Cursor<'_> { } } +/// Return the literal bytes of `pattern` when a raw byte-for-byte substring +/// search is *exactly* equivalent to matching it, otherwise `None`. 
+/// +/// We accept only ASCII, case-sensitive needles. That keeps the byte search in +/// agreement with the regex engine on every possible input, including bytes that +/// are not valid UTF-8: an ASCII byte can never be part of a multi-byte sequence, +/// so its presence is unambiguous. In the regex modes we also require that no +/// byte could ever act as a metacharacter; under `-F` the text is literal as-is. +fn plain_literal(pattern: &str, ignore_case: bool, mode: RegexMode) -> Option> { + if ignore_case || pattern.is_empty() || !pattern.is_ascii() { + return None; + } + // Every byte that carries special meaning in any of our regex syntaxes. + // A needle without these reads the same as a literal in Basic/Extended/Perl. + const SPECIAL: &[u8] = b".*[]^$\\+?{}()|"; + let plain = mode == RegexMode::Fixed || !pattern.bytes().any(|b| SPECIAL.contains(&b)); + plain.then(|| pattern.as_bytes().to_vec()) +} + struct CompiledPattern { /// Default semantics. It's decently fast and used for searching. leftmost: Regex, @@ -289,3 +340,49 @@ impl CompiledPattern { .is_some() } } + +#[cfg(test)] +mod tests { + use super::plain_literal; + use crate::RegexMode; + + fn lit(p: &str, ic: bool, mode: RegexMode) -> Option> { + plain_literal(p, ic, mode) + } + + #[test] + fn fixed_mode_takes_any_ascii_verbatim() { + // Under -F every byte is literal, even regex metacharacters. 
+ assert_eq!(lit("abc", false, RegexMode::Fixed), Some(b"abc".to_vec())); + assert_eq!(lit("a.*b", false, RegexMode::Fixed), Some(b"a.*b".to_vec())); + assert_eq!(lit("a+b", false, RegexMode::Fixed), Some(b"a+b".to_vec())); + } + + #[test] + fn regex_modes_accept_metacharacter_free_literals() { + for mode in [RegexMode::Basic, RegexMode::Extended, RegexMode::Perl] { + assert_eq!(lit("ing", false, mode), Some(b"ing".to_vec())); + assert_eq!(lit("Hello123", false, mode), Some(b"Hello123".to_vec())); + } + } + + #[test] + fn regex_modes_reject_anything_with_a_metacharacter() { + for mode in [RegexMode::Basic, RegexMode::Extended, RegexMode::Perl] { + for p in [ + "a.b", "a*", "[ab]", "^a", "a$", "a\\b", "a+", "a?", "(a)", "a|b", "a{2}", + ] { + assert_eq!(lit(p, false, mode), None, "pattern {p:?} in {mode:?}"); + } + } + } + + #[test] + fn rejects_empty_case_insensitive_and_non_ascii() { + assert_eq!(lit("", false, RegexMode::Fixed), None); + assert_eq!(lit("abc", true, RegexMode::Fixed), None); // -i + assert_eq!(lit("abc", true, RegexMode::Basic), None); + assert_eq!(lit("café", false, RegexMode::Fixed), None); // non-ASCII + assert_eq!(lit("naïve", false, RegexMode::Basic), None); + } +} diff --git a/src/searcher.rs b/src/searcher.rs index c63c826..5da5936 100644 --- a/src/searcher.rs +++ b/src/searcher.rs @@ -8,7 +8,8 @@ use crate::line_buffer::LineBuffer; use crate::matcher::Matcher; use crate::output::OutputWriter; use crate::{BinaryMode, Config, DeviceMode, DirectoryMode}; -use memchr::memchr; +use memchr::memmem::Finder; +use memchr::{memchr, memchr_iter, memrchr}; use std::ffi::OsStr; use std::fs::File; use std::io; @@ -248,12 +249,221 @@ impl<'a> Searcher<'a> { self.binary_notice_enabled && self.session_binary_detected && self.session_any_match() } + /// Whether the current configuration can use the buffer-at-a-time fast + /// path. 
It applies only to pure-literal patterns and the simpler output + /// modes — anything needing match positions, context, inversion, or special + /// binary handling falls back to the line-at-a-time [`Self::session_run`]. + fn eligible_for_fast_path(&self) -> bool { + // On Windows the line-at-a-time path strips a trailing CR before + // matching; the fast path mirrors that only for printed output, so a + // literal needle still behaves the same. Nothing else differs. + self.matcher.literal_searchers().is_some() + && !self.config.invert_match + && !self.config.word_regexp + && !self.config.line_regexp + && !self.config.only_matching + && !self.config.use_color + // `has_context` also covers `-C 0`, which still emits `--` separators. + && !self.config.has_context + && !self.config.null_data + && self.config.binary_mode != BinaryMode::WithoutMatch + } + + /// Buffer-at-a-time driver for literal patterns. Instead of testing every + /// line, it scans whole chunks with a substring searcher and only locates + /// line boundaries around the matches it finds. + fn session_run_fast( + &mut self, + lb: &mut LineBuffer, + path: &Path, + reader: &mut File, + ) -> io::Result { + lb.reset(); + if self.config.quiet + || self.config.files_with_matches + || self.config.files_without_match + || self.config.count + { + self.fast_locate(lb, path, reader) + } else { + self.fast_print(lb, path, reader) + } + } + + /// Fast path for modes that only need to know *whether* / *how many* lines + /// match: `-c`, `-l`, `-L`, `-q`. No per-line rendering, so no line numbers, + /// byte offsets, or binary bookkeeping are required (the count of matching + /// lines is unaffected by binary detection, and `-l`/`-L`/`-q` list files + /// regardless). 
+ fn fast_locate( + &mut self, + lb: &mut LineBuffer, + path: &Path, + reader: &mut File, + ) -> io::Result { + let finders = self + .matcher + .literal_searchers() + .expect("eligibility guarantees literal searchers"); + let max = self.config.max_count; + // Existence is enough for these three; only `-c` needs the full tally. + let stop_at_first = + self.config.quiet || self.config.files_with_matches || self.config.files_without_match; + + let mut count: u64 = 0; + let mut matched = false; + 'outer: while let Some((chunk, _)) = lb.read_chunk(reader)? { + let mut p = 0; + while p < chunk.len() { + let Some(rel) = leftmost_match(finders, &chunk[p..]) else { + break; + }; + if max.is_some_and(|mx| count >= mx) { + break 'outer; + } + let (_, line_end) = line_bounds(chunk, p + rel); + count += 1; + matched = true; + if stop_at_first { + break 'outer; + } + // Each line counts once: resume past this line's terminator. + p = line_end + 1; + } + } + + // `-l`/`-L` take precedence over `-c`, matching the line-at-a-time path. + if self.config.quiet { + // Exit status only. + } else if self.config.files_with_matches { + if matched { + self.writer.write_filename(path)?; + } + } else if self.config.files_without_match { + if !matched { + self.writer.write_filename(path)?; + } + } else if self.config.count { + self.writer.write_count(count, path)?; + } + Ok(matched) + } + + /// Fast path that prints whole matching lines (optionally with `-n`, `-b`, + /// filename prefixes, `-m`). Binary files are detected per chunk and reported + /// with the usual notice instead of dumping their lines. 
+ fn fast_print( + &mut self, + lb: &mut LineBuffer, + path: &Path, + reader: &mut File, + ) -> io::Result { + let finders = self + .matcher + .literal_searchers() + .expect("eligibility guarantees literal searchers"); + let max = self.config.max_count; + let want_lineno = self.config.line_number; + let detect_binary = self.config.binary_mode != BinaryMode::Text; + let notice_enabled = self.binary_notice_enabled; + + let mut count: u64 = 0; + let mut matched = false; + let mut binary = false; + // Number of terminators in all previously consumed chunks (for `-n`). + let mut base_lines: u64 = 0; + + 'outer: while let Some((chunk, chunk_off)) = lb.read_chunk(reader)? { + let mut p = 0; + // NUL scanned up to here; terminators counted up to `nl_cursor`. + let mut nul_scanned = 0; + let mut nl_cursor = 0; + let mut nl_before = 0u64; + + while p < chunk.len() { + let Some(rel) = leftmost_match(finders, &chunk[p..]) else { + break; + }; + if max.is_some_and(|mx| count >= mx) { + break 'outer; + } + let (line_beg, line_end) = line_bounds(chunk, p + rel); + + // A NUL anywhere up to this line marks the file binary, as does + // an invalid-UTF-8 matching line. + if detect_binary && !binary { + if memchr(0, &chunk[nul_scanned..line_end]).is_some() { + binary = true; + } + nul_scanned = line_end; + } + + let line = &chunk[line_beg..line_end]; + #[cfg(windows)] + let line = if self.config.strip_cr && line.last() == Some(&b'\r') { + &line[..line.len() - 1] + } else { + line + }; + + if detect_binary && !binary && std::str::from_utf8(line).is_err() { + binary = true; + } + + if binary { + // First match in a binary file: stop and emit the notice + // once at the end instead of dumping the line. 
+ matched = true; + break 'outer; + } + + let line_number = if want_lineno { + nl_before += count_terminators(&chunk[nl_cursor..line_beg]); + nl_cursor = line_beg; + base_lines + nl_before + 1 + } else { + 0 + }; + self.writer.write_line( + &LineView { + line, + line_number, + byte_offset: chunk_off + line_beg as u64, + is_match: true, + match_positions: &[], + }, + path, + )?; + count += 1; + matched = true; + p = line_end + 1; + } + + // Carry NUL detection and the line tally across the chunk boundary. + if detect_binary && !binary && memchr(0, &chunk[nul_scanned..]).is_some() { + binary = true; + } + if want_lineno { + base_lines += nl_before + count_terminators(&chunk[nl_cursor..]); + } + } + + if binary && notice_enabled && matched { + self.writer.report_binary_match(path); + } + Ok(matched) + } + fn session_run( &mut self, lb: &mut LineBuffer, path: &Path, reader: &mut File, ) -> io::Result { + if self.eligible_for_fast_path() { + return self.session_run_fast(lb, path, reader); + } + // Reset all session (per-file) state. self.session_context_buf.clear(); self.session_match_count = 0; @@ -470,3 +680,31 @@ impl<'a> Searcher<'a> { } } } + +/// Offset of the earliest occurrence of any needle in `hay`, or `None`. +fn leftmost_match(finders: &[Finder<'static>], hay: &[u8]) -> Option { + let mut best: Option = None; + for finder in finders { + if let Some(pos) = finder.find(hay) { + best = Some(best.map_or(pos, |b| b.min(pos))); + if best == Some(0) { + break; // Can't start any earlier. + } + } + } + best +} + +/// Count line terminators in `bytes`. +fn count_terminators(bytes: &[u8]) -> u64 { + memchr_iter(b'\n', bytes).count() as u64 +} + +/// Byte range `[start, end)` of the line containing `pos` in `buf`, excluding +/// the trailing terminator. `start` follows the previous terminator (or 0); +/// `end` is the next terminator (or end of buffer). 
+fn line_bounds(buf: &[u8], pos: usize) -> (usize, usize) { + let start = memrchr(b'\n', &buf[..pos]).map_or(0, |i| i + 1); + let end = memchr(b'\n', &buf[pos..]).map_or(buf.len(), |i| pos + i); + (start, end) +} diff --git a/tests/test_grep.rs b/tests/test_grep.rs index d061918..87bc292 100644 --- a/tests/test_grep.rs +++ b/tests/test_grep.rs @@ -1272,3 +1272,70 @@ fn repeated_options_are_accepted() { .succeeds() .stdout_only("a\nb\n"); } + +#[test] +fn literal_buffer_path_prefixes_and_max() { + // Plain literals are served by the buffer-at-a-time engine; the line/byte + // prefixes and -m must still be byte-identical to the line-at-a-time path. + + // -n and -b together: "lineno:byteoffset:line". + let (_s, mut c) = ucmd(); + c.args(&["-nb", "foo"]) + .pipe_in("foo\nbar\nfoobar\n") + .succeeds() + .stdout_only("1:0:foo\n3:8:foobar\n"); + + // A line matched more than once is still emitted once. + let (_s, mut c) = ucmd(); + c.args(&["-c", "oo"]) + .pipe_in("oooo\nbar\noo\n") + .succeeds() + .stdout_only("2\n"); + + // -m caps printed matches. + let (_s, mut c) = ucmd(); + c.args(&["-m", "2", "x"]) + .pipe_in("x\ny\nx\nz\nx\n") + .succeeds() + .stdout_only("x\nx\n"); + + // Final line without a trailing terminator still matches and is printed + // with an added newline. + let (_s, mut c) = ucmd(); + c.args(&["foo"]) + .pipe_in("bar\nfoo") + .succeeds() + .stdout_only("foo\n"); +} + +#[test] +fn literal_buffer_path_spans_many_chunks() { + // Build an input far larger than the read buffer so the buffer-at-a-time + // engine crosses several chunk boundaries, and check that line numbers and + // counts stay correct across them. 
+ let mut input = String::new(); + let mut expected_n = String::new(); + let mut count = 0u32; + for i in 1..=100_000u32 { + if i % 7 == 0 { + input.push_str("needle\n"); + expected_n.push_str(&format!("{i}:needle\n")); + count += 1; + } else { + input.push_str("some filler text\n"); + } + } + assert!(input.len() > 512 * 1024, "input must exceed several chunks"); + + let (_s, mut c) = ucmd(); + c.args(&["-c", "needle"]) + .pipe_in(input.clone()) + .succeeds() + .stdout_only(format!("{count}\n")); + + let (_s, mut c) = ucmd(); + c.args(&["-n", "needle"]) + .pipe_in(input) + .succeeds() + .stdout_only(expected_n); +} From 56d774f576bd8e3a04027d555b9cdfc471cb923f Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Sun, 31 May 2026 11:41:16 +0200 Subject: [PATCH 2/2] test: cover slow-path modes that literal tests no longer reach The buffer-at-a-time fast path now serves the literal patterns that the existing -l/-L/-q and binary tests used, leaving the line-at-a-time engine's equivalents uncovered. Add bracket-class (non-literal) tests for -l/-L/-q and binary handling (notice, -a text, without-match bail, and the finalize-time notice), plus a fast-path test for a NUL that is only discovered after a line was already printed. No dead code was found: the remaining uncovered lines are writer I/O error-propagation arms and pre-existing filesystem error handlers. --- tests/test_grep.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/tests/test_grep.rs b/tests/test_grep.rs index 87bc292..560dc12 100644 --- a/tests/test_grep.rs +++ b/tests/test_grep.rs @@ -1339,3 +1339,103 @@ fn literal_buffer_path_spans_many_chunks() { .succeeds() .stdout_only(expected_n); } + +// Plain literals run on the buffer-at-a-time fast path, so the following tests +// use bracket-class patterns (non-literal) to keep the line-at-a-time engine's +// `-l` / `-L` / `-q` and binary-handling paths exercised too. 
+ +#[test] +fn slow_path_list_and_quiet_modes() { + let (scene, _) = ucmd(); + scene.fixtures.write("hit", "yes\n"); + scene.fixtures.write("miss", "no\n"); + + // -l: list matching files. + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-l", "[y]es", "hit", "miss"]) + .succeeds() + .stdout_is("hit\n"); + + // -L with a match in one file: only the non-matching file is listed. + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-L", "[y]es", "hit", "miss"]) + .succeeds() + .stdout_is("miss\n"); + + // -L with no match anywhere: both files listed, exit 1. + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-L", "[z]z", "hit", "miss"]) + .fails_with_code(1) + .stdout_is("hit\nmiss\n"); + + // -q stops at the first match (exit 0) or reports no match (exit 1). + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-q", "[y]es", "hit"]) + .succeeds() + .no_output(); + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-q", "[z]z", "hit"]) + .fails_with_code(1) + .no_output(); +} + +#[test] +fn slow_path_binary_handling() { + let (scene, _) = ucmd(); + // NOTE: avoid the name "nul" here — it's a reserved device name on Windows, + // so writing/reading it hits the null device instead of a real file. + scene.fixtures.write_bytes("nulbin", b"hit\0\n"); + scene.fixtures.write_bytes("bad", b"a\x9d\n"); + + // Binary notice on the line-at-a-time engine (regex pattern). + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["[h]it", "nulbin"]) + .succeeds() + .no_stdout() + .stderr_contains("binary file matches"); + + // -a forces text mode: the NUL line is printed verbatim. + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["-a", "[h]it", "nulbin"]) + .succeeds() + .stdout_is_bytes(b"hit\0\n"); + + // --binary-files=without-match bails out on an invalid-UTF-8 match. 
+ scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["--binary-files=without-match", "[a]", "bad"]) + .fails_with_code(1) + .no_output(); + + // A NUL after the matched line means binariness is discovered at EOF, so + // the line is printed first and the notice is emitted during finalization. + scene.fixtures.write_bytes("late", b"hit\nno\0\n"); + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["[h]it", "late"]) + .succeeds() + .stdout_is("hit\n") + .stderr_contains("binary file matches"); +} + +#[test] +fn fast_path_binary_detected_after_a_printed_line() { + // A NUL that appears only after the last match in the buffer marks the file + // binary on the fast path *after* an earlier match was already printed: the + // printed line stays and the trailing notice is still emitted. + let (scene, _) = ucmd(); + scene.fixtures.write_bytes("b", b"hit\nno\0\n"); + scene + .cmd(env!("CARGO_BIN_EXE_grep")) + .args(&["hit", "b"]) + .succeeds() + .stdout_is("hit\n") + .stderr_contains("binary file matches"); +}