-
-
Notifications
You must be signed in to change notification settings - Fork 620
fix(regexp): align lastIndex with code point boundary under /u and /v #5303
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1136,6 +1136,24 @@ impl RegExp { | |
| // 9. If flags contains "u" or flags contains "v", let fullUnicode be true; else let fullUnicode be false. | ||
| let full_unicode = flags.contains(b'u') || flags.contains(b'v'); | ||
|
|
||
| // When the /u or /v flag is active, the input string is modeled as a sequence | ||
| // of Unicode code points (§22.2.2). Since `last_index` is a UTF-16 code unit | ||
| // index, it may point to the trailing half of a surrogate pair, which is not | ||
| // a valid code point boundary. In that case, we adjust the matcher start | ||
| // position to the preceding lead surrogate so matching begins at a valid | ||
| // code point boundary. | ||
| // Ref: https://tc39.es/ecma262/#sec-pattern-semantics | ||
| let mut start_index = last_index; | ||
| if full_unicode | ||
| && start_index > 0 | ||
| && let Some(cu) = input.code_unit_at(start_index as usize) | ||
| && (0xDC00..=0xDFFF).contains(&cu) | ||
| && let Some(prev_cu) = input.code_unit_at(start_index as usize - 1) | ||
| && (0xD800..=0xDBFF).contains(&prev_cu) | ||
| { | ||
| start_index -= 1; | ||
| } | ||
|
|
||
| // NOTE: The following steps are taken care of by regress: | ||
| // | ||
| // SKIP: 10. Let matchSucceeded be false. | ||
|
|
@@ -1163,13 +1181,13 @@ impl RegExp { | |
| let input = input.to_vec(); | ||
|
|
||
| // NOTE: We can use the faster ucs2 variant since there will never be two byte unicode. | ||
| matcher.find_from_ucs2(&input, last_index as usize).next() | ||
| matcher.find_from_ucs2(&input, start_index as usize).next() | ||
| } | ||
| (true, JsStrVariant::Utf16(input)) => { | ||
| matcher.find_from_utf16(input, last_index as usize).next() | ||
| matcher.find_from_utf16(input, start_index as usize).next() | ||
| } | ||
| (false, JsStrVariant::Utf16(input)) => { | ||
| matcher.find_from_ucs2(input, last_index as usize).next() | ||
| matcher.find_from_ucs2(input, start_index as usize).next() | ||
|
Comment on lines
1183
to
+1190
|
||
| } | ||
| }; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`start_index` is derived from `last_index` (`u64`) and cast to `usize` in `code_unit_at` before validating `last_index <= length`. On 32-bit targets (or with very large `lastIndex` values), the `as usize` cast can truncate and accidentally make the index appear in-bounds, leading to incorrect surrogate-boundary adjustment (and potentially incorrect match results). Consider moving the surrogate-boundary adjustment block to after the `if last_index > length { ... return }` early-exit, and/or using a checked conversion (`usize::try_from`) before calling `code_unit_at`.