From e7f5610ed99ed7d3e976de9d072238ae00db3ce2 Mon Sep 17 00:00:00 2001 From: mrhapile Date: Sun, 5 Apr 2026 17:19:52 +0530 Subject: [PATCH 1/3] fix(regexp): align lastIndex with code point boundary under /u Signed-off-by: mrhapile --- core/engine/src/builtins/regexp/mod.rs | 26 ++++++++++-- core/engine/src/builtins/regexp/tests.rs | 54 ++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 037eed6eb19..38ce09ae0ef 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -1136,6 +1136,26 @@ impl RegExp { // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. let full_unicode = flags.contains(b'u') || flags.contains(b'v'); + // When the /u or /v flag is active, the input string is modeled as a sequence + // of Unicode code points (§22.2.2). Since `last_index` is a UTF-16 code unit + // index, it may point to the trailing half of a surrogate pair, which is not + // a valid code point boundary. In that case, we adjust the matcher start + // position to the preceding lead surrogate so matching begins at a valid + // code point boundary. + // Ref: https://tc39.es/ecma262/#sec-pattern-semantics + let mut start_index = last_index; + if full_unicode && start_index > 0 { + if let Some(cu) = input.code_unit_at(start_index as usize) { + if (0xDC00..=0xDFFF).contains(&cu) { + if let Some(prev_cu) = input.code_unit_at(start_index as usize - 1) { + if (0xD800..=0xDBFF).contains(&prev_cu) { + start_index -= 1; + } + } + } + } + } + // NOTE: The following steps are take care of by regress: // // SKIP: 10. Let matchSucceeded be false. @@ -1163,13 +1183,13 @@ impl RegExp { let input = input.to_vec(); // NOTE: We can use the faster ucs2 variant since there will never be two byte unicode. 
- matcher.find_from_ucs2(&input, last_index as usize).next() + matcher.find_from_ucs2(&input, start_index as usize).next() } (true, JsStrVariant::Utf16(input)) => { - matcher.find_from_utf16(input, last_index as usize).next() + matcher.find_from_utf16(input, start_index as usize).next() } (false, JsStrVariant::Utf16(input)) => { - matcher.find_from_ucs2(input, last_index as usize).next() + matcher.find_from_ucs2(input, start_index as usize).next() } }; diff --git a/core/engine/src/builtins/regexp/tests.rs b/core/engine/src/builtins/regexp/tests.rs index 0897b682720..c89a225098d 100644 --- a/core/engine/src/builtins/regexp/tests.rs +++ b/core/engine/src/builtins/regexp/tests.rs @@ -262,3 +262,57 @@ fn regexp_no_panic_on_empty_class_quantifier() { // It should return null without panicking. run_test_actions([TestAction::assert_eq("/[]*1/u.exec()", JsValue::null())]); } + +#[test] +fn regexp_exec_coercion_order() { + // ECMAScript §21.2.5.2.1 — RegExpExec + // Ensures ToString(input) happens before accessing lastIndex + run_test_actions([ + TestAction::assert_eq( + indoc! {r#" + let log = []; + let re = /a/g; + + re.lastIndex = { + valueOf() { log.push("lastIndex"); return 0; } + }; + + let str = { + toString() { log.push("string"); return "a"; } + }; + + re.exec(str); + log.join(","); + "#}, + js_str!("string,lastIndex"), + ), + ]); +} + +#[test] +fn regexp_unicode_lastindex_surrogate_boundary() { + run_test_actions([ + TestAction::assert_eq( + indoc! {r#" + let re = /./gu; + re.lastIndex = 1; + re.exec("💩")[0]; + "#}, + js_str!("💩"), + ), + ]); +} + +#[test] +fn regexp_unicode_lastindex_no_adjustment() { + run_test_actions([ + TestAction::assert_eq( + indoc! 
{r#" + let re = /./gu; + re.lastIndex = 0; + re.exec("💩")[0]; + "#}, + js_str!("💩"), + ), + ]); +} From bc1bb77c176a4ecb39c91fdf769fca125631c096 Mon Sep 17 00:00:00 2001 From: mrhapile Date: Sun, 5 Apr 2026 17:21:32 +0530 Subject: [PATCH 2/3] style: apply rustfmt Signed-off-by: mrhapile --- core/engine/src/builtins/regexp/tests.rs | 30 ++++++++++-------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/core/engine/src/builtins/regexp/tests.rs b/core/engine/src/builtins/regexp/tests.rs index c89a225098d..1a6ede5db36 100644 --- a/core/engine/src/builtins/regexp/tests.rs +++ b/core/engine/src/builtins/regexp/tests.rs @@ -267,9 +267,8 @@ fn regexp_no_panic_on_empty_class_quantifier() { fn regexp_exec_coercion_order() { // ECMAScript §21.2.5.2.1 — RegExpExec // Ensures ToString(input) happens before accessing lastIndex - run_test_actions([ - TestAction::assert_eq( - indoc! {r#" + run_test_actions([TestAction::assert_eq( + indoc! {r#" let log = []; let re = /a/g; @@ -284,35 +283,30 @@ fn regexp_exec_coercion_order() { re.exec(str); log.join(","); "#}, - js_str!("string,lastIndex"), - ), - ]); + js_str!("string,lastIndex"), + )]); } #[test] fn regexp_unicode_lastindex_surrogate_boundary() { - run_test_actions([ - TestAction::assert_eq( - indoc! {r#" + run_test_actions([TestAction::assert_eq( + indoc! {r#" let re = /./gu; re.lastIndex = 1; re.exec("💩")[0]; "#}, - js_str!("💩"), - ), - ]); + js_str!("💩"), + )]); } #[test] fn regexp_unicode_lastindex_no_adjustment() { - run_test_actions([ - TestAction::assert_eq( - indoc! {r#" + run_test_actions([TestAction::assert_eq( + indoc! 
{r#" let re = /./gu; re.lastIndex = 0; re.exec("💩")[0]; "#}, - js_str!("💩"), - ), - ]); + js_str!("💩"), + )]); } From da29940ba354f9f771305484ac14a14f8e111a42 Mon Sep 17 00:00:00 2001 From: mrhapile Date: Sun, 5 Apr 2026 17:24:06 +0530 Subject: [PATCH 3/3] fix: resolve clippy collapsible-if lint --- core/engine/src/builtins/regexp/mod.rs | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/core/engine/src/builtins/regexp/mod.rs b/core/engine/src/builtins/regexp/mod.rs index 38ce09ae0ef..58c20803d58 100644 --- a/core/engine/src/builtins/regexp/mod.rs +++ b/core/engine/src/builtins/regexp/mod.rs @@ -1144,16 +1144,14 @@ impl RegExp { // code point boundary. // Ref: https://tc39.es/ecma262/#sec-pattern-semantics let mut start_index = last_index; - if full_unicode && start_index > 0 { - if let Some(cu) = input.code_unit_at(start_index as usize) { - if (0xDC00..=0xDFFF).contains(&cu) { - if let Some(prev_cu) = input.code_unit_at(start_index as usize - 1) { - if (0xD800..=0xDBFF).contains(&prev_cu) { - start_index -= 1; - } - } - } - } + if full_unicode + && start_index > 0 + && let Some(cu) = input.code_unit_at(start_index as usize) + && (0xDC00..=0xDFFF).contains(&cu) + && let Some(prev_cu) = input.code_unit_at(start_index as usize - 1) + && (0xD800..=0xDBFF).contains(&prev_cu) + { + start_index -= 1; } // NOTE: The following steps are take care of by regress: