diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 037eed6eb19..58c20803d58 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -1136,6 +1136,24 @@ impl RegExp { // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. let full_unicode = flags.contains(b'u') || flags.contains(b'v'); + // When the /u or /v flag is active, the input string is modeled as a sequence + // of Unicode code points (§22.2.2). Since `last_index` is a UTF-16 code unit + // index, it may point to the trailing half of a surrogate pair, which is not + // a valid code point boundary. In that case, we adjust the matcher start + // position to the preceding lead surrogate so matching begins at a valid + // code point boundary. + // Ref: https://tc39.es/ecma262/#sec-pattern-semantics + let mut start_index = last_index; + if full_unicode + && start_index > 0 + && let Some(cu) = input.code_unit_at(start_index as usize) + && (0xDC00..=0xDFFF).contains(&cu) + && let Some(prev_cu) = input.code_unit_at(start_index as usize - 1) + && (0xD800..=0xDBFF).contains(&prev_cu) + { + start_index -= 1; + } + // NOTE: The following steps are take care of by regress: // // SKIP: 10. Let matchSucceeded be false. @@ -1163,13 +1181,13 @@ impl RegExp { let input = input.to_vec(); // NOTE: We can use the faster ucs2 variant since there will never be two byte unicode. 
- matcher.find_from_ucs2(&input, last_index as usize).next() + matcher.find_from_ucs2(&input, start_index as usize).next() } (true, JsStrVariant::Utf16(input)) => { - matcher.find_from_utf16(input, last_index as usize).next() + matcher.find_from_utf16(input, start_index as usize).next() } (false, JsStrVariant::Utf16(input)) => { - matcher.find_from_ucs2(input, last_index as usize).next() + matcher.find_from_ucs2(input, start_index as usize).next() } }; diff --git a/core/engine/src/builtins/regexp/tests.rs b/core/engine/src/builtins/regexp/tests.rs index 0897b682720..1a6ede5db36 100644 --- a/core/engine/src/builtins/regexp/tests.rs +++ b/core/engine/src/builtins/regexp/tests.rs @@ -262,3 +262,51 @@ fn regexp_no_panic_on_empty_class_quantifier() { // It should return null without panicking. run_test_actions([TestAction::assert_eq("/[]*1/u.exec()", JsValue::null())]); } + +#[test] +fn regexp_exec_coercion_order() { + // ECMAScript §21.2.5.2.1 — RegExpExec + // Ensures ToString(input) happens before accessing lastIndex + run_test_actions([TestAction::assert_eq( + indoc! {r#" + let log = []; + let re = /a/g; + + re.lastIndex = { + valueOf() { log.push("lastIndex"); return 0; } + }; + + let str = { + toString() { log.push("string"); return "a"; } + }; + + re.exec(str); + log.join(","); + "#}, + js_str!("string,lastIndex"), + )]); +} + +#[test] +fn regexp_unicode_lastindex_surrogate_boundary() { + run_test_actions([TestAction::assert_eq( + indoc! {r#" + let re = /./gu; + re.lastIndex = 1; + re.exec("💩")[0]; + "#}, + js_str!("💩"), + )]); +} + +#[test] +fn regexp_unicode_lastindex_no_adjustment() { + run_test_actions([TestAction::assert_eq( + indoc! {r#" + let re = /./gu; + re.lastIndex = 0; + re.exec("💩")[0]; + "#}, + js_str!("💩"), + )]); +}