Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 65 additions & 42 deletions core/runtime/src/text/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,70 +12,93 @@ pub(crate) mod utf8 {
.collect()
}

pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
if strip_bom {
input = input.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(input);
}
let string = String::from_utf8_lossy(input);
JsString::from(string.as_ref())
if fatal {
let s = std::str::from_utf8(input).map_err(|_| ())?;
Ok(JsString::from(s))
} else {
let string = String::from_utf8_lossy(input);
Ok(JsString::from(string.as_ref()))
}
}
}

/// Builds a well-formed `JsString` from a stream of UTF-16 code units.
///
/// Every unpaired surrogate in `code_units` is substituted with U+FFFD.
///
/// `dangling_byte` signals that the caller dropped a truncated trailing byte
/// from the original input. It yields one extra U+FFFD, except when the final
/// code unit was a high surrogate: that unit is necessarily unpaired and has
/// already been substituted during decoding, so no second replacement is added.
///
/// # Errors
///
/// With `fatal` set, nothing is substituted: the first unpaired surrogate, or
/// the presence of a dangling byte, makes the function return `Err(())`.
fn decode_utf16_units(
    code_units: impl IntoIterator<Item = u16>,
    dangling_byte: bool,
    fatal: bool,
) -> Result<boa_engine::JsString, ()> {
    const REPLACEMENT: char = '\u{FFFD}';

    // Track the most recent unit pulled from the input so that, once decoding
    // is done, we know whether the stream ended on a high surrogate.
    let mut final_unit: Option<u16> = None;
    let units = code_units
        .into_iter()
        .inspect(|&unit| final_unit = Some(unit));

    // Collecting into `Result` stops at the first error when `fatal` is set;
    // otherwise each error is mapped to the replacement character.
    let mut decoded = std::char::decode_utf16(units)
        .map(|item| item.or_else(|_| if fatal { Err(()) } else { Ok(REPLACEMENT) }))
        .collect::<Result<String, ()>>()?;

    match (dangling_byte, fatal) {
        // No truncated byte: nothing more to do.
        (false, _) => {}
        // A truncated byte is a decoder error in fatal mode.
        (true, true) => return Err(()),
        // Replacement mode: account for the truncated byte unless the trailing
        // high surrogate already produced the replacement character.
        (true, false) => {
            if !matches!(final_unit, Some(0xD800..=0xDBFF)) {
                decoded.push(REPLACEMENT);
            }
        }
    }

    Ok(boa_engine::JsString::from(decoded))
}

pub(crate) mod utf16le {
use boa_engine::{JsString, js_string};
use boa_engine::JsString;

pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
if strip_bom {
input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
}

// After this point, input is of even length.
let dangling = if input.len().is_multiple_of(2) {
false
} else {
let dangling_byte = !input.len().is_multiple_of(2);
if dangling_byte {
input = &input[0..input.len() - 1];
true
};

let input: &[u16] = bytemuck::cast_slice(input);

if dangling {
JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
} else {
JsString::from(input)
}

let code_units: &[u16] = bytemuck::cast_slice(input);
super::decode_utf16_units(code_units.iter().copied(), dangling_byte, fatal)
}
}

pub(crate) mod utf16be {
use boa_engine::{JsString, js_string};
use boa_engine::JsString;

pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
input.drain(..2);
pub(crate) fn decode(mut input: &[u8], strip_bom: bool, fatal: bool) -> Result<JsString, ()> {
if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
input = rest;
}

let mut input = input.as_mut_slice();
// After this point, input is of even length.
let dangling = if input.len().is_multiple_of(2) {
false
} else {
let new_len = input.len() - 1;
input = &mut input[0..new_len];
true
};

let input: &mut [u16] = bytemuck::cast_slice_mut(input);

// Swap the bytes.
for b in &mut *input {
*b = b.swap_bytes();
let dangling_byte = !input.len().is_multiple_of(2);
if dangling_byte {
input = &input[0..input.len() - 1];
}

if dangling {
JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
} else {
JsString::from(&*input)
}
let code_units = input
.chunks_exact(2)
.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
super::decode_utf16_units(code_units, dangling_byte, fatal)
}
}
31 changes: 23 additions & 8 deletions core/runtime/src/text/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod encodings;
pub struct TextDecoderOptions {
#[boa(rename = "ignoreBOM")]
ignore_bom: Option<bool>,
fatal: Option<bool>,
}

/// The character encoding used by [`TextDecoder`].
Expand Down Expand Up @@ -73,6 +74,8 @@ pub struct TextDecoder {
encoding: Encoding,
#[unsafe_ignore_trace]
ignore_bom: bool,
#[unsafe_ignore_trace]
fatal: bool,
}

#[boa_class]
Expand All @@ -89,6 +92,7 @@ impl TextDecoder {
options: Option<TextDecoderOptions>,
) -> JsResult<Self> {
let ignore_bom = options.and_then(|o| o.ignore_bom).unwrap_or(false);
let fatal = options.and_then(|o| o.fatal).unwrap_or(false);

let encoding = match encoding {
Some(enc) => {
Expand All @@ -103,6 +107,7 @@ impl TextDecoder {
Ok(Self {
encoding,
ignore_bom,
fatal,
})
}

Expand Down Expand Up @@ -131,6 +136,17 @@ impl TextDecoder {
self.ignore_bom
}

/// The [`TextDecoder.fatal`][mdn] read-only property is a `bool` indicating whether
/// the error mode of the decoder is fatal, i.e. whether invalid input throws a
/// `TypeError` instead of being replaced with U+FFFD.
///
/// The value is fixed when the decoder is constructed: it is taken from the
/// `fatal` member of the constructor options and is `false` when that member
/// (or the options object itself) is not supplied.
///
/// [mdn]: https://developer.mozilla.org/en-US/docs/Web/API/TextDecoder/fatal
#[boa(getter)]
#[must_use]
pub fn fatal(&self) -> bool {
self.fatal
}

/// The [`TextDecoder.decode()`][mdn] method returns a string containing text decoded from the
/// buffer passed as a parameter.
///
Expand Down Expand Up @@ -197,14 +213,13 @@ impl TextDecoder {
&full_data
};

Ok(match self.encoding {
Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
Encoding::Utf16Be => {
let owned = data.to_vec();
encodings::utf16be::decode(owned, strip_bom)
}
})
let result = match self.encoding {
Encoding::Utf8 => encodings::utf8::decode(data, strip_bom, self.fatal),
Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom, self.fatal),
Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom, self.fatal),
};

result.map_err(|()| js_error!(TypeError: "The encoded data was not valid."))
}
}

Expand Down
1 change: 1 addition & 0 deletions tests/wpt/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ fn encoding(
#[base_dir = "${WPT_ROOT}"]
#[files("encoding/api-*.any.js")]
#[files("encoding/textencoder-constructor-non-utf.any.js")]
#[files("encoding/textdecoder-utf16-surrogates.any.js")]
// TODO: re-enable those when better encoding and options are supported.
// #[files("encoding/textdecoder-*.any.js")]
// #[files("encoding/textencoder-*.any.js")]
Expand Down
Loading