From 6e2a8d533c736ca2c645b934acc04e430589894f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Tue, 16 Jun 2026 20:18:29 +0200 Subject: [PATCH 1/2] fix(regex): stop false-rejecting valid char classes & bounded quantifiers (semver/joi/winston) Three independent false-rejections in JS->Rust regex translation made valid patterns throw "Invalid regular expression: ...: invalid pattern": 1. Compiled-size cap (semver): the regex crate caps a compiled program at 10 MiB and rejects larger ones as CompiledTooBig. semver's ReDoS-hardened safeRe rewrites (\d{1,256}, [...]{0,250}, ...) exceed that. JS has no such limit, so raise the budget to 64 MiB for both the regex crate and the fancy-regex delegate (build_std_regex / build_fancy_regex helpers). 2. Class hyphen adjacent to a shorthand (joi): inside a class, a '-' next to a \d/\w/\s shorthand or \p{...} property is a LITERAL hyphen in JS (a shorthand can't bound a range), but the regex crate reads it as a range and errors with ClassRangeLiteral. Escape such hyphens to \- during translation. 3. Trivial char classes / ANSI patterns (winston via @colors/colors): covered by the above plus a regression test for [0m] and escapeStringRegexp output. Adds unit regression tests for all three. No version/CHANGELOG/Cargo.lock edits (maintainer folds version + changelog at merge). --- crates/perry-runtime/src/regex.rs | 41 +++++- crates/perry-runtime/src/regex/compile.rs | 8 +- crates/perry-runtime/src/regex/grammar.rs | 147 ++++++++++++++++++++++ 3 files changed, 188 insertions(+), 8 deletions(-) diff --git a/crates/perry-runtime/src/regex.rs b/crates/perry-runtime/src/regex.rs index e5e909dfd9..37b6d51b6b 100644 --- a/crates/perry-runtime/src/regex.rs +++ b/crates/perry-runtime/src/regex.rs @@ -120,6 +120,40 @@ thread_local! { static FANCY_CACHE: RefCell>> = RefCell::new(HashMap::new()); } +/// Compiled-program size budget handed to both regex engines. 
+/// +/// The `regex` crate (and the `regex-automata` backend `fancy-regex` +/// delegates to) caps a compiled program at 10 MiB by default and rejects +/// anything larger with `CompiledTooBig` / `ExceededSizeLimit` — which our +/// callers surface as a bogus `SyntaxError: invalid pattern`. JS itself has +/// no such limit, so a *valid* pattern with large bounded repetitions is +/// wrongly rejected. semver's ReDoS-hardened `safeRe` rewrites (`\s{0,1}`, +/// `\d{1,256}`, `[…]{0,250}`, …) blow well past 10 MiB; raise the budget so +/// these legitimate patterns compile. 64 MiB comfortably fits semver's full +/// range regex while still bounding pathological input. +#[cfg(feature = "regex-engine")] +const REGEX_SIZE_LIMIT: usize = 64 * 1024 * 1024; + +/// Build a `regex` crate `Regex` with the raised [`REGEX_SIZE_LIMIT`] so that +/// large-but-valid bounded-quantifier patterns aren't rejected as +/// `CompiledTooBig`. Drop-in replacement for `regex::Regex::new`. +#[cfg(feature = "regex-engine")] +pub(crate) fn build_std_regex(pattern: &str) -> Result<Regex, regex::Error> { + regex::RegexBuilder::new(pattern) + .size_limit(REGEX_SIZE_LIMIT) + .build() +} + +/// Build a `fancy_regex` `Regex` with the raised delegate size limit (see +/// [`REGEX_SIZE_LIMIT`]). `fancy-regex` delegates non-fancy subpatterns to the +/// `regex` crate, so the same 10 MiB cap applies there; raise it in lockstep. 
+#[cfg(feature = "regex-engine")] +pub(crate) fn build_fancy_regex(pattern: &str) -> Result<fancy_regex::Regex, fancy_regex::Error> { + fancy_regex::RegexBuilder::new(pattern) + .delegate_size_limit(REGEX_SIZE_LIMIT) + .build() +} + #[cfg(feature = "regex-engine")] fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> { REGEX_CACHE.with(|cache| { @@ -150,7 +184,7 @@ fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> { } else { translated }; - let regex = match Regex::new(&regex_pattern) { + let regex = match build_std_regex(&regex_pattern) { Ok(re) => re, Err(_) => { // Pattern has features regex crate doesn't support @@ -161,7 +195,7 @@ fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> { // existing callers don't crash — the fancy-regex fallback // is handled in js_regexp_exec_fancy below. FANCY_CACHE.with(|fc| { - if let Ok(fre) = fancy_regex::Regex::new(&regex_pattern) { + if let Ok(fre) = build_fancy_regex(&regex_pattern) { fc.borrow_mut().insert( (pattern.to_string(), flags.to_string()), std::sync::Arc::new(fre), @@ -399,8 +433,7 @@ pub extern "C" fn js_regexp_new( )); } let translated = js_regex_to_rust(pattern_str); - if regex::Regex::new(&translated).is_err() && fancy_regex::Regex::new(&translated).is_err() - { + if build_std_regex(&translated).is_err() && build_fancy_regex(&translated).is_err() { throw_regexp_syntax_error(&format!( "Invalid regular expression: /{}/: invalid pattern", pattern_str diff --git a/crates/perry-runtime/src/regex/compile.rs b/crates/perry-runtime/src/regex/compile.rs index b0dd22e073..9a3710f703 100644 --- a/crates/perry-runtime/src/regex/compile.rs +++ b/crates/perry-runtime/src/regex/compile.rs @@ -8,9 +8,9 @@ use regex::Regex; use super::grammar::{has_invalid_repeated_quantifier, js_regex_to_rust}; use super::{ - get_or_compile_regex, is_regex_pointer, is_valid_ptr, is_valid_regex_ptr, js_regexp_get_flags, - js_regexp_get_source, js_string_from_str, string_as_str, throw_regexp_syntax_error, - validate_and_canonicalize_flags, RegExpHeader, + 
build_fancy_regex, build_std_regex, get_or_compile_regex, is_regex_pointer, is_valid_ptr, + is_valid_regex_ptr, js_regexp_get_flags, js_regexp_get_source, js_string_from_str, + string_as_str, throw_regexp_syntax_error, validate_and_canonicalize_flags, RegExpHeader, }; /// `RegExp.prototype.compile(pattern, flags)`. Re-initializes the receiver @@ -90,7 +90,7 @@ pub extern "C" fn js_regexp_compile_value( )); } let translated = js_regex_to_rust(pattern_str); - if regex::Regex::new(&translated).is_err() && fancy_regex::Regex::new(&translated).is_err() { + if build_std_regex(&translated).is_err() && build_fancy_regex(&translated).is_err() { throw_regexp_syntax_error(&format!( "Invalid regular expression: /{}/: invalid pattern", pattern_str diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs index 0b6d2a1713..7f377668a7 100644 --- a/crates/perry-runtime/src/regex/grammar.rs +++ b/crates/perry-runtime/src/regex/grammar.rs @@ -551,6 +551,62 @@ fn emoji_string_property_expansion(value: &str) -> Option { }) } +/// Does the already-emitted translation `out` end in a character-class +/// shorthand (`\d`/`\w`/`\s` and their negations, or a `\p{…}`/`\P{…}` Unicode +/// property)? Such an element cannot be the bound of a range, so a `-` +/// immediately after it is a *literal* hyphen in JS, not a range operator. +fn out_ends_with_class_shorthand(out: &str) -> bool { + let b = out.as_bytes(); + // `\p{…}` / `\P{…}` property: ends with `}` preceded by a `{…` opened by + // an unescaped `\p` / `\P`. 
+ if b.last() == Some(&b'}') { + if let Some(open) = out.rfind('{') { + let pre = &out[..open]; + let pb = pre.as_bytes(); + if pb.len() >= 2 + && matches!(pb[pb.len() - 1], b'p' | b'P') + && pb[pb.len() - 2] == b'\\' + && !is_escaped_backslash(pb, pb.len() - 2) + { + return true; + } + } + } + if b.len() < 2 { + return false; + } + let last = b[b.len() - 1]; + b[b.len() - 2] == b'\\' + && !is_escaped_backslash(b, b.len() - 2) + && matches!(last, b'd' | b'D' | b'w' | b'W' | b's' | b'S') +} + +/// Is the backslash at `b[bs]` itself escaped (i.e. preceded by an odd run of +/// backslashes)? Used so `\\d` (literal backslash + `d`) isn't mistaken for the +/// `\d` shorthand. +fn is_escaped_backslash(b: &[u8], bs: usize) -> bool { + let mut count = 0usize; + let mut k = bs; + while k > 0 && b[k - 1] == b'\\' { + count += 1; + k -= 1; + } + count % 2 == 1 +} + +/// Will the next class member at `chars[i..]` (where `chars[i]` is a `\`) be a +/// shorthand class (`\d`/`\w`/`\s` & negations, or `\p{…}`/`\P{…}`)? A `-` +/// directly before such an element is a literal hyphen in JS. +fn next_is_class_shorthand(chars: &[char], i: usize) -> bool { + if chars.get(i) != Some(&'\\') { + return false; + } + matches!( + chars.get(i + 1), + Some('d' | 'D' | 'w' | 'W' | 's' | 'S' | 'p' | 'P') + ) +} + /// Translate a JavaScript regex pattern to a Rust regex-crate compatible pattern. /// Handles JS-specific escape sequences not supported by the Rust regex crate. /// Also converts JS-style named groups `(?<name>...)` to Rust-style `(?P<name>...)`. @@ -732,6 +788,21 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String { result.push(chars[i]); i += 1; } + } else if in_class + && chars[i] == '-' + && (out_ends_with_class_shorthand(&result) + || next_is_class_shorthand(&chars, i + 1)) + { + // Inside a class, a `-` adjacent to a shorthand class (`\d`, `\w`, + // `\s`, …, or a `\p{…}` property) is a *literal* hyphen in JS — a + // shorthand can't be a range bound. 
The Rust `regex` crate instead + // tries to read `\w-\.` / `\d-z` as a range and rejects it with a + // `ClassRangeLiteral` parse error. Escape the hyphen so it stays a + // literal. joi's URI validator (`[\w-\.~%\dA-Fa-f…]`, IPv6 host) + // and many other real-world classes rely on this. + result.push('\\'); + result.push('-'); + i += 1; } else { result.push(chars[i]); i += 1; @@ -890,4 +961,80 @@ mod tests { assert!(regex::Regex::new(r"\P{RGI_Emoji}").is_err()); assert!(fancy_regex::Regex::new(r"\P{RGI_Emoji}").is_err()); } + + #[test] + fn trivial_char_class_compiles_and_matches() { + // Regression for the winston/`@colors/colors` tail: a trivial, valid + // character class must translate and compile (and *not* be rejected by + // the invalid-pattern guard). `[0m]` matches `0` or `m`. + let translated = js_regex_to_rust("[0m]"); + let re = regex::Regex::new(&translated).expect("[0m] must compile"); + assert!(re.is_match("0")); + assert!(re.is_match("m")); + assert!(!re.is_match("x")); + // The `@colors/colors` ANSI-strip literal `\x1B\[\d+m` and the + // escaped-bracket form `\x1B\[0m` (`escapeStringRegexp` output) compile. + for pat in [r"\x1B\[\d+m", r"\x1B\[0m"] { + assert!( + regex::Regex::new(&js_regex_to_rust(pat)).is_ok(), + "ANSI pattern must compile: {pat}" + ); + } + // Neither trips the bounded-quantifier false-positive guard. + assert!(!super::has_invalid_repeated_quantifier("[0m]")); + assert!(!super::has_invalid_repeated_quantifier(r"\x1B\[\d+m")); + } + + #[test] + fn bounded_quantifier_in_class_not_rejected() { + // semver's ReDoS-hardened `safeRe` rewrites `\d+`→`\d{1,N}`, + // `\s*`→`\s{0,1}`, `[…]*`→`[…]{0,N}`. These bounded quantifiers are + // valid and must NOT be flagged by `has_invalid_repeated_quantifier`. 
+ for pat in [ + r"\d{1,16}", + r"\s{0,1}", + r"\d{0,256}", + r"[a-zA-Z0-9-]{0,250}", + r"(?:<|>)?=?", + r"a{0,1}", + ] { + assert!( + !super::has_invalid_repeated_quantifier(pat), + "valid bounded quantifier wrongly rejected: {pat}" + ); + } + // A genuinely-dangling quantifier (no preceding atom) is still caught. + assert!(super::has_invalid_repeated_quantifier("{0,1}")); + assert!(super::has_invalid_repeated_quantifier("*abc")); + } + + #[test] + fn class_hyphen_adjacent_to_shorthand_is_literal() { + // joi's URI validator builds classes like `[\w-\.~%\dA-Fa-f…]` where a + // `-` sits next to a `\w`/`\d` shorthand. In JS that `-` is a literal + // hyphen (a shorthand can't bound a range); the Rust `regex` crate + // would otherwise reject `\w-` as `ClassRangeLiteral`. The hyphen must + // be escaped to `\-`. + for (src, expect) in [ + (r"[\w-\.]", r"[\w\-\.]"), + (r"[\d-z]", r"[\d\-z]"), + (r"[a\w-]", r"[a\w\-]"), + (r"[a-\d]", r"[a\-\d]"), + (r"[\p{L}-x]", r"[\p{L}\-x]"), + ] { + assert_eq!(js_regex_to_rust(src), expect, "src={src}"); + assert!( + regex::Regex::new(&js_regex_to_rust(src)).is_ok(), + "must compile: {src}" + ); + } + // An ordinary `a-z` range between two single literals is untouched. + assert_eq!(js_regex_to_rust("[a-z]"), "[a-z]"); + // Outside a class, `-` is never escaped. + assert_eq!(js_regex_to_rust(r"\d-\w"), r"\d-\w"); + // A `\w-\.` member must match `\w`, a literal `-`, and `.`. 
+ let re = regex::Regex::new(&js_regex_to_rust(r"^[\w-\.]+$")).unwrap(); + assert!(re.is_match("a-b.c_d")); + assert!(!re.is_match("a b")); + } } From c6543c02d81a7536cb2f1d47458b11899b165d2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Wed, 17 Jun 2026 04:26:27 +0200 Subject: [PATCH 2/2] style: rustfmt --- crates/perry-runtime/src/regex/grammar.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs index 7f377668a7..1909c931ce 100644 --- a/crates/perry-runtime/src/regex/grammar.rs +++ b/crates/perry-runtime/src/regex/grammar.rs @@ -790,8 +790,7 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String { } } else if in_class && chars[i] == '-' - && (out_ends_with_class_shorthand(&result) - || next_is_class_shorthand(&chars, i + 1)) + && (out_ends_with_class_shorthand(&result) || next_is_class_shorthand(&chars, i + 1)) { // Inside a class, a `-` adjacent to a shorthand class (`\d`, `\w`, // `\s`, …, or a `\p{…}` property) is a *literal* hyphen in JS — a