Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions crates/perry-runtime/src/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,40 @@ thread_local! {
static FANCY_CACHE: RefCell<HashMap<(String, String), Arc<fancy_regex::Regex>>> = RefCell::new(HashMap::new());
}

/// Size budget, in bytes, for the compiled program of either regex engine.
///
/// By default the `regex` crate — and the `regex-automata` backend that
/// `fancy-regex` delegates to — refuses to compile any program larger than
/// 10 MiB, failing with `CompiledTooBig` / `ExceededSizeLimit`. Our callers
/// report that failure as `SyntaxError: invalid pattern`, which is misleading:
/// JS imposes no such cap, so a perfectly *valid* pattern containing large
/// bounded repetitions ends up rejected. The ReDoS-hardened `safeRe` rewrites
/// in semver (`\s{0,1}`, `\d{1,256}`, `[…]{0,250}`, …) exceed 10 MiB by a wide
/// margin. Lifting the budget to 64 MiB lets semver's full range regex compile
/// while pathological input is still bounded.
#[cfg(feature = "regex-engine")]
const REGEX_SIZE_LIMIT: usize = 64 << 20; // 64 MiB

/// Compile `pattern` with the `regex` crate under the raised
/// [`REGEX_SIZE_LIMIT`], so a large-but-valid bounded-quantifier pattern is
/// not turned away as `CompiledTooBig`.
///
/// Intended as a drop-in replacement for `regex::Regex::new`: same input,
/// same `Result` shape, only the size limit differs.
#[cfg(feature = "regex-engine")]
pub(crate) fn build_std_regex(pattern: &str) -> Result<Regex, regex::Error> {
    let mut builder = regex::RegexBuilder::new(pattern);
    builder.size_limit(REGEX_SIZE_LIMIT);
    builder.build()
}

/// Compile `pattern` with `fancy_regex`, raising the delegate size limit to
/// [`REGEX_SIZE_LIMIT`].
///
/// `fancy-regex` hands non-fancy subpatterns to the `regex` crate, so the
/// default 10 MiB cap applies to those delegates too; it is raised here in
/// lockstep with the plain-`regex` builder.
#[cfg(feature = "regex-engine")]
pub(crate) fn build_fancy_regex(pattern: &str) -> Result<fancy_regex::Regex, fancy_regex::Error> {
    let mut builder = fancy_regex::RegexBuilder::new(pattern);
    builder.delegate_size_limit(REGEX_SIZE_LIMIT);
    builder.build()
}

#[cfg(feature = "regex-engine")]
fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> {
REGEX_CACHE.with(|cache| {
Expand Down Expand Up @@ -150,7 +184,7 @@ fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> {
} else {
translated
};
let regex = match Regex::new(&regex_pattern) {
let regex = match build_std_regex(&regex_pattern) {
Ok(re) => re,
Err(_) => {
// Pattern has features regex crate doesn't support
Expand All @@ -161,7 +195,7 @@ fn get_or_compile_regex(pattern: &str, flags: &str) -> Arc<Regex> {
// existing callers don't crash — the fancy-regex fallback
// is handled in js_regexp_exec_fancy below.
FANCY_CACHE.with(|fc| {
if let Ok(fre) = fancy_regex::Regex::new(&regex_pattern) {
if let Ok(fre) = build_fancy_regex(&regex_pattern) {
fc.borrow_mut().insert(
(pattern.to_string(), flags.to_string()),
std::sync::Arc::new(fre),
Expand Down Expand Up @@ -399,8 +433,7 @@ pub extern "C" fn js_regexp_new(
));
}
let translated = js_regex_to_rust(pattern_str);
if regex::Regex::new(&translated).is_err() && fancy_regex::Regex::new(&translated).is_err()
{
if build_std_regex(&translated).is_err() && build_fancy_regex(&translated).is_err() {
throw_regexp_syntax_error(&format!(
"Invalid regular expression: /{}/: invalid pattern",
pattern_str
Expand Down
8 changes: 4 additions & 4 deletions crates/perry-runtime/src/regex/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ use regex::Regex;

use super::grammar::{has_invalid_repeated_quantifier, js_regex_to_rust};
use super::{
get_or_compile_regex, is_regex_pointer, is_valid_ptr, is_valid_regex_ptr, js_regexp_get_flags,
js_regexp_get_source, js_string_from_str, string_as_str, throw_regexp_syntax_error,
validate_and_canonicalize_flags, RegExpHeader,
build_fancy_regex, build_std_regex, get_or_compile_regex, is_regex_pointer, is_valid_ptr,
is_valid_regex_ptr, js_regexp_get_flags, js_regexp_get_source, js_string_from_str,
string_as_str, throw_regexp_syntax_error, validate_and_canonicalize_flags, RegExpHeader,
};

/// `RegExp.prototype.compile(pattern, flags)`. Re-initializes the receiver
Expand Down Expand Up @@ -90,7 +90,7 @@ pub extern "C" fn js_regexp_compile_value(
));
}
let translated = js_regex_to_rust(pattern_str);
if regex::Regex::new(&translated).is_err() && fancy_regex::Regex::new(&translated).is_err() {
if build_std_regex(&translated).is_err() && build_fancy_regex(&translated).is_err() {
throw_regexp_syntax_error(&format!(
"Invalid regular expression: /{}/: invalid pattern",
pattern_str
Expand Down
146 changes: 146 additions & 0 deletions crates/perry-runtime/src/regex/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,62 @@ fn emoji_string_property_expansion(value: &str) -> Option<String> {
})
}

/// Does the already-emitted translation `out` end in a character-class
/// shorthand (`\d`/`\w`/`\s` and their negations, or a `\p{…}`/`\P{…}` Unicode
/// property)? Such an element cannot be the bound of a range, so a `-`
/// immediately after it is a *literal* hyphen in JS, not a range operator.
///
/// Returns `false` for empty or single-byte input, for a shorthand whose
/// backslash is itself escaped (`\\d`), and for a trailing `}` that does not
/// actually close a `\p{` / `\P{` (for example the literal `}` in
/// `[\p{L}x}`).
fn out_ends_with_class_shorthand(out: &str) -> bool {
    let bytes = out.as_bytes();

    // `\p{…}` / `\P{…}` property: the text ends with `}`, and the nearest `{`
    // before it is introduced by an unescaped `\p` / `\P`.
    if let Some(body) = out.strip_suffix('}') {
        if let Some(open) = body.rfind('{') {
            // The trailing `}` only terminates the property when it is the
            // first `}` after that `{`. Otherwise the property was already
            // closed earlier and this `}` is an ordinary class member, which
            // *can* bound a range.
            let closes_property = !body[open + 1..].contains('}');
            // The slice pattern guarantees at least two bytes before `{`, so
            // `open - 2` (the backslash position) cannot underflow.
            if closes_property
                && matches!(&bytes[..open], [.., b'\\', b'p' | b'P'])
                && !is_escaped_backslash(bytes, open - 2)
            {
                return true;
            }
        }
    }

    // Two-character shorthand: an unescaped backslash followed by one of
    // `d D w W s S`.
    match bytes {
        [.., b'\\', b'd' | b'D' | b'w' | b'W' | b's' | b'S'] => {
            !is_escaped_backslash(bytes, bytes.len() - 2)
        }
        _ => false,
    }
}

/// Is the backslash at `b[bs]` itself escaped (i.e. preceded by an odd run of
/// backslashes)? Used so `\\d` (literal backslash + `d`) isn't mistaken for the
/// `\d` shorthand.
///
/// Only the bytes before index `bs` are inspected; `b[bs]` itself is not read.
fn is_escaped_backslash(b: &[u8], bs: usize) -> bool {
    // Count the contiguous backslashes directly before `bs`, walking backwards.
    let preceding = b[..bs]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    preceding % 2 == 1
}

/// Reports whether the class member starting at `chars[i]` is a shorthand
/// class: a `\` followed by one of `d D w W s S` or by `p` / `P` (the start
/// of a `\p{…}` / `\P{…}` property). In JS a `-` placed directly before such
/// a member is a literal hyphen.
///
/// Returns `false` when `i` is out of range, when `chars[i]` is not a
/// backslash, or when nothing follows the backslash.
fn next_is_class_shorthand(chars: &[char], i: usize) -> bool {
    matches!(
        chars.get(i..),
        Some(['\\', 'd' | 'D' | 'w' | 'W' | 's' | 'S' | 'p' | 'P', ..])
    )
}

/// Translate a JavaScript regex pattern to a Rust regex-crate compatible pattern.
/// Handles JS-specific escape sequences not supported by the Rust regex crate.
/// Also converts JS-style named groups `(?<name>...)` to Rust-style `(?P<name>...)`.
Expand Down Expand Up @@ -732,6 +788,20 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String {
result.push(chars[i]);
i += 1;
}
} else if in_class
&& chars[i] == '-'
&& (out_ends_with_class_shorthand(&result) || next_is_class_shorthand(&chars, i + 1))
{
// Inside a class, a `-` adjacent to a shorthand class (`\d`, `\w`,
// `\s`, …, or a `\p{…}` property) is a *literal* hyphen in JS — a
// shorthand can't be a range bound. The Rust `regex` crate instead
// tries to read `\w-\.` / `\d-z` as a range and rejects it with a
// `ClassRangeLiteral` parse error. Escape the hyphen so it stays a
// literal. joi's URI validator (`[\w-\.~%\dA-Fa-f…]`, IPv6 host)
// and many other real-world classes rely on this.
result.push('\\');
result.push('-');
i += 1;
} else {
result.push(chars[i]);
i += 1;
Expand Down Expand Up @@ -890,4 +960,80 @@ mod tests {
assert!(regex::Regex::new(r"\P{RGI_Emoji}").is_err());
assert!(fancy_regex::Regex::new(r"\P{RGI_Emoji}").is_err());
}

#[test]
fn trivial_char_class_compiles_and_matches() {
    // Regression guard for the winston / `@colors/colors` tail. The class
    // `[0m]` is trivial and valid: it has to survive translation, compile,
    // and stay clear of the invalid-pattern guard. It matches `0` or `m`
    // and nothing else.
    let class_src = js_regex_to_rust("[0m]");
    let class_re = regex::Regex::new(&class_src).expect("[0m] must compile");
    assert!(class_re.is_match("0"));
    assert!(class_re.is_match("m"));
    assert!(!class_re.is_match("x"));

    // Both ANSI-strip spellings used by `@colors/colors` must compile: the
    // literal `\x1B\[\d+m` and the escaped-bracket `\x1B\[0m` that
    // `escapeStringRegexp` produces.
    let ansi_patterns = [r"\x1B\[\d+m", r"\x1B\[0m"];
    for pat in ansi_patterns {
        let compiled = regex::Regex::new(&js_regex_to_rust(pat));
        assert!(compiled.is_ok(), "ANSI pattern must compile: {pat}");
    }

    // And neither pattern is a false positive for the bounded-quantifier guard.
    assert!(!super::has_invalid_repeated_quantifier("[0m]"));
    assert!(!super::has_invalid_repeated_quantifier(r"\x1B\[\d+m"));
}

#[test]
fn bounded_quantifier_in_class_not_rejected() {
    // semver hardens its regexes against ReDoS by rewriting unbounded
    // quantifiers into bounded ones in `safeRe`: `\d+` becomes `\d{1,N}`,
    // `\s*` becomes `\s{0,1}`, `[…]*` becomes `[…]{0,N}`. All of these are
    // valid, so `has_invalid_repeated_quantifier` must let them through.
    let valid_patterns = [
        r"\d{1,16}",
        r"\s{0,1}",
        r"\d{0,256}",
        r"[a-zA-Z0-9-]{0,250}",
        r"(?:<|>)?=?",
        r"a{0,1}",
    ];
    for pat in valid_patterns {
        let flagged = super::has_invalid_repeated_quantifier(pat);
        assert!(
            !flagged,
            "valid bounded quantifier wrongly rejected: {pat}"
        );
    }

    // A quantifier with no atom in front of it is genuinely dangling and
    // must still be reported.
    assert!(super::has_invalid_repeated_quantifier("{0,1}"));
    assert!(super::has_invalid_repeated_quantifier("*abc"));
}

#[test]
fn class_hyphen_adjacent_to_shorthand_is_literal() {
    // joi's URI validator assembles classes such as `[\w-\.~%\dA-Fa-f…]`,
    // with a `-` right beside a `\w` / `\d` shorthand. JS reads that `-` as
    // a literal hyphen, because a shorthand cannot be a range bound. Left
    // alone, the Rust `regex` crate rejects `\w-` with `ClassRangeLiteral`,
    // so the translator has to emit `\-` instead.
    let cases = [
        (r"[\w-\.]", r"[\w\-\.]"),
        (r"[\d-z]", r"[\d\-z]"),
        (r"[a\w-]", r"[a\w\-]"),
        (r"[a-\d]", r"[a\-\d]"),
        (r"[\p{L}-x]", r"[\p{L}\-x]"),
    ];
    for (src, expect) in cases {
        assert_eq!(js_regex_to_rust(src), expect, "src={src}");
        let compiled = regex::Regex::new(&js_regex_to_rust(src));
        assert!(compiled.is_ok(), "must compile: {src}");
    }

    // A plain range between two single literals passes through unchanged.
    assert_eq!(js_regex_to_rust("[a-z]"), "[a-z]");
    // A `-` that is not inside a class is left alone.
    assert_eq!(js_regex_to_rust(r"\d-\w"), r"\d-\w");

    // End to end: the `\w-\.` members accept word characters, a literal
    // `-`, and `.`, but not a space.
    let anchored = js_regex_to_rust(r"^[\w-\.]+$");
    let re = regex::Regex::new(&anchored).unwrap();
    assert!(re.is_match("a-b.c_d"));
    assert!(!re.is_match("a b"));
}
}
Loading