From be15138efca236a0f05ab18302e5522c69b163a4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 08:56:53 +0000 Subject: [PATCH 1/2] Add FIND_NOT_EMPTY support for None-operation match patterns Agent-Logs-Url: https://github.com/forkeith/syntect/sessions/d70e3322-55e1-4e9e-8729-33764466ce96 Co-authored-by: keith-hall <11882719+keith-hall@users.noreply.github.com> --- src/parsing/parser.rs | 8 +++-- src/parsing/regex.rs | 60 ++++++++++++++++++++++++++++++-- src/parsing/syntax_definition.rs | 2 +- src/parsing/syntax_set.rs | 2 +- src/parsing/yaml_load.rs | 8 ++--- 5 files changed, 70 insertions(+), 10 deletions(-) diff --git a/src/parsing/parser.rs b/src/parsing/parser.rs index 875d3733..f1caf8ed 100644 --- a/src/parsing/parser.rs +++ b/src/parsing/parser.rs @@ -566,7 +566,7 @@ impl ParseState { let mut esc_regions = Region::new(); if entry .regex - .search(line, start, line.len(), Some(&mut esc_regions)) + .search(line, start, line.len(), Some(&mut esc_regions), true) { let (esc_start, _esc_end) = esc_regions.pos(0).unwrap(); if esc_start < search_end { @@ -714,7 +714,11 @@ impl ParseState { _ => (match_pat.regex(), true), }; // print!(" executing regex: {:?} at pos {} on line {}", regex.regex_str(), start, line); - let matched = regex.search(line, start, search_end, Some(regions)); + // Only None-operation patterns should avoid zero-length matches. All other operations + // (Push, Set, Pop, Embed, Branch, Fail) legitimately need to match zero-length input + // (e.g. lookaheads used with Branch/Fail, empty patterns used with Pop/Set). 
+ let allow_empty = !matches!(match_pat.operation, MatchOperation::None); + let matched = regex.search(line, start, search_end, Some(regions), allow_empty); if matched { let (match_start, match_end) = regions.pos(0).unwrap(); diff --git a/src/parsing/regex.rs b/src/parsing/regex.rs index aaa6a126..2ab5cbf3 100644 --- a/src/parsing/regex.rs +++ b/src/parsing/regex.rs @@ -12,6 +12,9 @@ use std::error::Error; pub struct Regex { regex_str: String, regex: OnceCell<regex_impl::Regex>, + /// Lazily-compiled variant that won't match zero-length strings (for use with + /// match patterns whose operation does not modify the parser context stack). + regex_not_empty: OnceCell<regex_impl::Regex>, } /// A region contains text positions for capture groups in a match result. @@ -29,6 +32,7 @@ impl Regex { Self { regex_str, regex: OnceCell::new(), + regex_not_empty: OnceCell::new(), } } @@ -53,6 +57,10 @@ impl Regex { /// the [`Region`] to be reused between searches, which makes a significant performance /// difference. /// + /// When `allow_empty` is `false`, zero-length matches are not considered. This should be used + /// only for match patterns with no operation (`MatchOperation::None`), to prevent the parser + /// from stalling at the same position. Every other operation may match zero-length input. + /// /// [`Region`]: struct.Region.html pub fn search( &self, @@ -60,9 +68,26 @@ impl Regex { begin: usize, end: usize, region: Option<&mut Region>, + allow_empty: bool, ) -> bool { - self.regex() - .search(text, begin, end, region.map(|r| &mut r.region)) + if allow_empty { + return self.regex().search(text, begin, end, region.map(|r| &mut r.region)); + } + // For Oniguruma, the not_empty_regex is compiled with FIND_NOT_EMPTY which + // natively avoids empty matches. For fancy-regex, which lacks a compile-time + // equivalent option, we additionally filter out any zero-length match below. 
+ match region { + Some(region) => { + let matched = self + .not_empty_regex() + .search(text, begin, end, Some(&mut region.region)); + if matched && region.pos(0).map_or(false, |(ms, me)| ms == me) { + return false; + } + matched + } + None => self.not_empty_regex().search(text, begin, end, None), + } } fn regex(&self) -> &regex_impl::Regex { @@ -70,6 +95,13 @@ impl Regex { regex_impl::Regex::new(&self.regex_str).expect("regex string should be pre-tested") }) } + + fn not_empty_regex(&self) -> &regex_impl::Regex { + self.regex_not_empty.get_or_init(|| { + regex_impl::Regex::new_find_not_empty(&self.regex_str) + .expect("regex string should be pre-tested") + }) + } } impl Clone for Regex { @@ -77,6 +109,7 @@ impl Clone for Regex { Regex { regex_str: self.regex_str.clone(), regex: OnceCell::new(), + regex_not_empty: OnceCell::new(), } } } @@ -158,6 +191,21 @@ mod regex_impl { } } + pub fn new_find_not_empty( + regex_str: &str, + ) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> { + let result = onig::Regex::with_options( + regex_str, + RegexOptions::REGEX_OPTION_CAPTURE_GROUP + | RegexOptions::REGEX_OPTION_FIND_NOT_EMPTY, + Syntax::default(), + ); + match result { + Ok(regex) => Ok(Regex { regex }), + Err(error) => Err(Box::new(error)), + } + } + pub fn is_match(&self, text: &str) -> bool { self.regex .match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None) @@ -220,6 +268,14 @@ mod regex_impl { } } + pub fn new_find_not_empty( + regex_str: &str, + ) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> { + // fancy-regex doesn't support a compile-time FIND_NOT_EMPTY option; empty matches are + // filtered out at search time via a wrapper in the outer Regex::search method. 
+ Self::new(regex_str) + } + pub fn is_match(&self, text: &str) -> bool { // Errors are treated as non-matches self.regex.is_match(text).unwrap_or(false) diff --git a/src/parsing/syntax_definition.rs b/src/parsing/syntax_definition.rs index 49bfcffc..21d3b151 100644 --- a/src/parsing/syntax_definition.rs +++ b/src/parsing/syntax_definition.rs @@ -414,7 +414,7 @@ mod tests { let r = Regex::new(r"(\\\[\]\(\))(b)(c)(d)(e)".into()); let s = r"\[]()bcde"; let mut region = Region::new(); - let matched = r.search(s, 0, s.len(), Some(&mut region)); + let matched = r.search(s, 0, s.len(), Some(&mut region), true); assert!(matched); let regex_with_refs = pat.regex_with_refs(&region, s); diff --git a/src/parsing/syntax_set.rs b/src/parsing/syntax_set.rs index 2dea1bce..adeecd74 100644 --- a/src/parsing/syntax_set.rs +++ b/src/parsing/syntax_set.rs @@ -224,7 +224,7 @@ impl SyntaxSet { let s = s.strip_prefix("\u{feff}").unwrap_or(s); // Strip UTF-8 BOM let cache = self.first_line_cache(); for &(ref reg, i) in cache.regexes.iter().rev() { - if reg.search(s, 0, s.len(), None) { + if reg.search(s, 0, s.len(), None, true) { return Some(&self.syntaxes[i]); } } diff --git a/src/parsing/yaml_load.rs b/src/parsing/yaml_load.rs index e48f0421..31fe9c2a 100644 --- a/src/parsing/yaml_load.rs +++ b/src/parsing/yaml_load.rs @@ -356,7 +356,7 @@ impl SyntaxDefinition { // Thanks @wbond for letting me know this is the correct way to check for captures has_captures = state .backref_regex - .search(&regex_str, 0, regex_str.len(), None); + .search(&regex_str, 0, regex_str.len(), None, true); MatchOperation::Pop(y as usize) } else if let Ok(y) = get_key(map, "push", Some) { MatchOperation::Push(SyntaxDefinition::parse_pushargs(y, state, contexts, namer)?) 
@@ -378,7 +378,7 @@ impl SyntaxDefinition { let escape_has_captures = state .backref_regex - .search(&escape_regex_str, 0, escape_regex_str.len(), None); + .search(&escape_regex_str, 0, escape_regex_str.len(), None, true); let escape_captures = if let Ok(cap_map) = get_key(map, "escape_captures", |x| x.as_hash()) { @@ -517,7 +517,7 @@ impl SyntaxDefinition { let mut region = Region::new(); while state .variable_regex - .search(raw_regex, index, raw_regex.len(), Some(&mut region)) + .search(raw_regex, index, raw_regex.len(), Some(&mut region), true) { let (begin, end) = region.pos(0).unwrap(); @@ -660,7 +660,7 @@ fn re_resolve_variables(raw_regex: &str, state: &ReResolveState<'_>) -> String { let mut region = Region::new(); while state .variable_regex - .search(raw_regex, index, raw_regex.len(), Some(&mut region)) + .search(raw_regex, index, raw_regex.len(), Some(&mut region), true) { let (begin, end) = region.pos(0).unwrap(); result.push_str(&raw_regex[index..begin]); From 95ac65454df93d7eab78d4164f0970cd96fdb984 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 10:05:06 +0000 Subject: [PATCH 2/2] Fix CI: update public API snapshot, formatting, and known syntest failures Agent-Logs-Url: https://github.com/forkeith/syntect/sessions/00d31603-18b2-499d-ae07-3c2ad4e52d3a Co-authored-by: keith-hall <11882719+keith-hall@users.noreply.github.com> --- src/parsing/regex.rs | 10 ++++++---- src/parsing/yaml_load.rs | 22 ++++++++++++++-------- testdata/known_syntest_failures.txt | 5 +---- testdata/known_syntest_failures_fancy.txt | 2 -- tests/snapshots/public-api.txt | 2 +- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/parsing/regex.rs b/src/parsing/regex.rs index 2ab5cbf3..e4a63e1f 100644 --- a/src/parsing/regex.rs +++ b/src/parsing/regex.rs @@ -71,16 +71,18 @@ impl Regex { allow_empty: bool, ) -> bool { if allow_empty { - return self.regex().search(text, begin, end, region.map(|r| 
&mut r.region)); + return self + .regex() + .search(text, begin, end, region.map(|r| &mut r.region)); } // For Oniguruma, the not_empty_regex is compiled with FIND_NOT_EMPTY which // natively avoids empty matches. For fancy-regex, which lacks a compile-time // equivalent option, we additionally filter out any zero-length match below. match region { Some(region) => { - let matched = self - .not_empty_regex() - .search(text, begin, end, Some(&mut region.region)); + let matched = + self.not_empty_regex() + .search(text, begin, end, Some(&mut region.region)); if matched && region.pos(0).map_or(false, |(ms, me)| ms == me) { return false; } diff --git a/src/parsing/yaml_load.rs b/src/parsing/yaml_load.rs index 31fe9c2a..9e075a0c 100644 --- a/src/parsing/yaml_load.rs +++ b/src/parsing/yaml_load.rs @@ -375,10 +375,13 @@ impl SyntaxDefinition { if let Ok(v) = get_key(map, "escape", Some) { let escape_raw = v.as_str().ok_or(ParseSyntaxError::TypeMismatch)?; let escape_regex_str = Self::parse_regex(escape_raw, state)?; - let escape_has_captures = - state - .backref_regex - .search(&escape_regex_str, 0, escape_regex_str.len(), None, true); + let escape_has_captures = state.backref_regex.search( + &escape_regex_str, + 0, + escape_regex_str.len(), + None, + true, + ); let escape_captures = if let Ok(cap_map) = get_key(map, "escape_captures", |x| x.as_hash()) { @@ -515,10 +518,13 @@ impl SyntaxDefinition { let mut result = String::new(); let mut index = 0; let mut region = Region::new(); - while state - .variable_regex - .search(raw_regex, index, raw_regex.len(), Some(&mut region), true) - { + while state.variable_regex.search( + raw_regex, + index, + raw_regex.len(), + Some(&mut region), + true, + ) { let (begin, end) = region.pos(0).unwrap(); result.push_str(&raw_regex[index..begin]); diff --git a/testdata/known_syntest_failures.txt b/testdata/known_syntest_failures.txt index f55cff60..e69fd3ea 100644 --- a/testdata/known_syntest_failures.txt +++ 
b/testdata/known_syntest_failures.txt @@ -1,5 +1,2 @@ loading syntax definitions from testdata/Packages -FAILED testdata/Packages/C#/tests/syntax_test_Strings.cs: 38 -FAILED testdata/Packages/LaTeX/syntax_test_latex.tex: 1 -FAILED testdata/Packages/Makefile/syntax_test_makefile.mak: 6 -exiting with code 1 +exiting with code 0 diff --git a/testdata/known_syntest_failures_fancy.txt b/testdata/known_syntest_failures_fancy.txt index c052e1e9..66ddc45c 100644 --- a/testdata/known_syntest_failures_fancy.txt +++ b/testdata/known_syntest_failures_fancy.txt @@ -1,5 +1,3 @@ loading syntax definitions from testdata/Packages -FAILED testdata/Packages/C#/tests/syntax_test_Strings.cs: 38 -FAILED testdata/Packages/LaTeX/syntax_test_latex.tex: 1 FAILED testdata/Packages/Markdown/syntax_test_markdown.md: 11 exiting with code 1 diff --git a/tests/snapshots/public-api.txt b/tests/snapshots/public-api.txt index 55bf59bd..f037cb58 100644 --- a/tests/snapshots/public-api.txt +++ b/tests/snapshots/public-api.txt @@ -1054,7 +1054,7 @@ impl syntect::parsing::Regex pub fn syntect::parsing::Regex::is_match(&self, text: &str) -> bool pub fn syntect::parsing::Regex::new(regex_str: alloc::string::String) -> Self pub fn syntect::parsing::Regex::regex_str(&self) -> &str -pub fn syntect::parsing::Regex::search(&self, text: &str, begin: usize, end: usize, region: core::option::Option<&mut syntect::parsing::Region>) -> bool +pub fn syntect::parsing::Regex::search(&self, text: &str, begin: usize, end: usize, region: core::option::Option<&mut syntect::parsing::Region>, allow_empty: bool) -> bool pub fn syntect::parsing::Regex::try_compile(regex_str: &str) -> core::option::Option> impl core::clone::Clone for syntect::parsing::Regex pub fn syntect::parsing::Regex::clone(&self) -> Self