Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions src/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use onig::{
SyntaxOperator,
};
use onig_sys::{OnigEncCtype_ONIGENC_CTYPE_WORD, OnigEncodingUTF8};
use std::borrow::Cow;
use uucore::error::{UResult, USimpleError};

pub struct Matcher<'a> {
Expand Down Expand Up @@ -259,6 +260,8 @@ struct CompiledPattern {

impl CompiledPattern {
fn compile(pattern: &str, config: &Config) -> UResult<Self> {
let pattern = normalize_posix_equivalence_classes(pattern, config.regex_mode);
let pattern = pattern.as_ref();
let mut syntax = *match config.regex_mode {
RegexMode::Fixed => Syntax::asis(),
RegexMode::Basic => Syntax::grep(),
Expand Down Expand Up @@ -344,6 +347,144 @@ impl CompiledPattern {
}
}

fn normalize_posix_equivalence_classes(pattern: &str, regex_mode: RegexMode) -> Cow<'_, str> {
if !matches!(regex_mode, RegexMode::Basic | RegexMode::Extended) || !pattern.contains("[=") {
return Cow::Borrowed(pattern);
}

let mut normalized = String::with_capacity(pattern.len());
let mut changed = false;
let mut i = 0;
while i < pattern.len() {
let Some((ch, next)) = next_char(pattern, i) else {
break;
};

if ch == '\\' {
normalized.push(ch);
i = next;
if let Some((escaped, escaped_next)) = next_char(pattern, i) {
normalized.push(escaped);
i = escaped_next;
}
continue;
}

if ch == '['
&& let Some((class, next, class_changed)) = normalize_bracket_class(pattern, i)
{
normalized.push_str(&class);
i = next;
changed |= class_changed;
continue;
}

normalized.push(ch);
i = next;
}

if changed {
Cow::Owned(normalized)
} else {
Cow::Borrowed(pattern)
}
}

// uu_grep does not implement locale collation yet, so single-character POSIX
// equivalence classes can only be made C-locale-compatible here. Multi-character
// collating elements are left for the regex engine to handle as before.
fn normalize_bracket_class(pattern: &str, start: usize) -> Option<(String, usize, bool)> {
let mut class = String::from("[");
let mut changed = false;
let mut first_item = true;
let mut i = start + 1;

if let Some(('^', next)) = next_char(pattern, i) {
class.push('^');
i = next;
}

while i < pattern.len() {
if let Some((ch, next)) = next_char(pattern, i)
&& ch == ']'
&& !first_item
{
class.push(ch);
return Some((class, next, changed));
}

if pattern[i..].starts_with("[=")
&& let Some((content, next)) = bracket_token_content(pattern, i, '=')
{
if let Some(ch) = single_char(content) {
push_bracket_literal(&mut class, ch);
changed = true;
} else {
class.push_str(&pattern[i..next]);
}
i = next;
first_item = false;
continue;
}

if pattern[i..].starts_with("[:")
&& let Some((_, next)) = bracket_token_content(pattern, i, ':')
{
class.push_str(&pattern[i..next]);
i = next;
first_item = false;
continue;
}

if pattern[i..].starts_with("[.")
&& let Some((_, next)) = bracket_token_content(pattern, i, '.')
{
class.push_str(&pattern[i..next]);
i = next;
first_item = false;
continue;
}

let (ch, next) = next_char(pattern, i)?;
class.push(ch);
i = next;
first_item = false;
}

None
}

/// Extracts the body of a POSIX bracket token such as `[=a=]`, `[:alpha:]` or
/// `[.ch.]`.
///
/// `start` is the byte offset of the token's opening `[` within `pattern` and
/// `delimiter` is the character that follows it (`=`, `:` or `.`). The body
/// begins two bytes after `start` and runs up to the first `delimiter` that is
/// immediately followed by `]`.
///
/// Returns the body together with the byte offset just past the closing `]`.
/// Returns `None` when no terminator is found, or when `start + 2` is past the
/// end of `pattern` or not on a character boundary.
fn bracket_token_content(pattern: &str, start: usize, delimiter: char) -> Option<(&str, usize)> {
    let content_start = start + 2;
    // `get` turns an out-of-range or mid-character offset into `None` instead
    // of a slicing panic.
    let tail = pattern.get(content_start..)?;

    tail.match_indices(delimiter).find_map(|(offset, matched)| {
        let delimiter_at = content_start + offset;
        let after_delimiter = delimiter_at + matched.len();
        // Only a delimiter directly followed by `]` terminates the token;
        // any other occurrence is part of the body.
        pattern[after_delimiter..]
            .starts_with(']')
            .then(|| (&pattern[content_start..delimiter_at], after_delimiter + 1))
    })
}

/// Returns the sole character of `value`, or `None` when `value` is empty or
/// holds more than one character.
fn single_char(value: &str) -> Option<char> {
    let mut rest = value.chars();
    match (rest.next(), rest.next()) {
        (Some(only), None) => Some(only),
        _ => None,
    }
}

/// Appends `ch` to the bracket expression being built in `class`, prefixing it
/// with a backslash when it is one of `\`, `]`, `-` or `^`.
///
/// NOTE(review): this presumes the regex syntax in use honors backslash
/// escapes inside bracket expressions — confirm against the syntax options
/// configured where the pattern is compiled.
fn push_bracket_literal(class: &mut String, ch: char) {
    const NEEDS_ESCAPE: [char; 4] = ['\\', ']', '-', '^'];

    if NEEDS_ESCAPE.contains(&ch) {
        class.push('\\');
    }
    class.push(ch);
}

/// Decodes the character that starts at byte offset `index` of `value`.
///
/// Returns the character together with the byte offset of the character that
/// follows it. Returns `None` when `index` is at or past the end of `value`,
/// or when it does not fall on a character boundary.
fn next_char(value: &str, index: usize) -> Option<(char, usize)> {
    // `get` yields `None` for an out-of-range or mid-character offset, where
    // plain slicing would panic.
    let ch = value.get(index..)?.chars().next()?;
    Some((ch, index + ch.len_utf8()))
}

#[cfg(test)]
mod tests {
use super::plain_literal;
Expand Down
33 changes: 33 additions & 0 deletions tests/test_grep.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,39 @@ fn bre_gnu_extensions() {
.stdout_only("*foo\n**foo\n");
}

#[test]
fn posix_equivalence_classes_in_brackets() {
    // Each case is (command-line arguments, stdin, expected stdout).
    let cases: &[(&[&str], &str, &str)] = &[
        // A lone equivalence class matches its character.
        (&["[[=a=]]"], "a\nb\n", "a\n"),
        // An equivalence class next to an ordinary bracket member.
        (&["[[=a=]b]"], "a\nb\nc\n", "a\nb\n"),
        // A character class followed by an equivalence class.
        (&["[[:alpha:]][[=1=]]"], "a1\na2\n11\n", "a1\n"),
        // The same handling under -E.
        (&["-E", "[[=a=]]"], "a\nb\n", "a\n"),
        // Under -F the pattern is matched as literal text.
        (&["-F", "[[=a=]]"], "[[=a=]]\na\n", "[[=a=]]\n"),
    ];

    for &(args, input, expected) in cases {
        let (_s, mut c) = ucmd();
        c.args(args).pipe_in(input).succeeds().stdout_only(expected);
    }
}

#[test]
fn ere_metacharacters() {
let cases: &[(&[&str], &str, &str)] = &[
Expand Down
Loading