From 9b72970a3166b4cb0fecaa013b34060d900abdbc Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 13:40:27 +0000 Subject: [PATCH 01/13] Don't panic in release on unsupported selectors --- src/selectors_vm/ast.rs | 10 +++++++--- src/selectors_vm/compiler.rs | 12 ++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/selectors_vm/ast.rs b/src/selectors_vm/ast.rs index 783c0797..7344e925 100644 --- a/src/selectors_vm/ast.rs +++ b/src/selectors_vm/ast.rs @@ -150,9 +150,13 @@ impl From<&Component> for Condition { // pseudo class-related. Ideally none of them should appear in // the parsed selector as we should bail earlier in the parser. // Otherwise, we'll have AST in invalid state in case of error. - bad_selector => unreachable!( - "Unsupported selector components should be filtered out by the parser: {bad_selector:?}" - ), + bad_selector => { + debug_assert!( + false, + "Unsupported selector components should be filtered out by the parser: {bad_selector:?}" + ); + Self::OnTagName(OnTagNameExpr::Unmatchable) + } } } } diff --git a/src/selectors_vm/compiler.rs b/src/selectors_vm/compiler.rs index fd71eddb..1b8d17d8 100644 --- a/src/selectors_vm/compiler.rs +++ b/src/selectors_vm/compiler.rs @@ -354,7 +354,7 @@ mod tests { }}; } - fn compile( + fn test_compile( selectors: &[&str], encoding: &'static Encoding, expected_entry_point_count: usize, @@ -433,7 +433,7 @@ mod tests { encoding: &'static Encoding, test_cases: &[(&str, bool)], ) { - let program = compile(&[selector], encoding, 1); + let program = test_compile(&[selector], encoding, 1); let instr = &program.instructions[program.entry_points.start]; for_each_test_case( @@ -463,7 +463,7 @@ mod tests { test_cases: &[(&str, bool)], ) { for (selector, test_cases) in with_negated(selector, test_cases) { - let program = compile(&[&selector], encoding, 1); + let program = test_compile(&[&selector], encoding, 1); let instr = &program.instructions[program.entry_points.start]; for_each_test_case( @@ -522,7 +522,7 @@ mod tests { encoding: &'static Encoding, test_cases: &[(&str, bool)], ) { - let program = compile(&[selector], encoding, 1); + let program = test_compile(&[selector], encoding, 1); let instr = &program.instructions[program.entry_points.start]; for_each_test_case( @@ -589,7 +589,7 @@ mod tests { expected_entry_point_count: usize, test_cases: &[(&str, Vec)], ) { - let program = compile(selectors, UTF_8, expected_entry_point_count); + let program = test_compile(selectors, UTF_8, expected_entry_point_count); // NOTE: encoding of the individual components is tested by other tests, // so we use only UTF-8 here. @@ -975,7 +975,7 @@ mod tests { "[foo=bar] #id1 > #id2", ]; - let program = compile(&selectors, UTF_8, 2); + let program = test_compile(&selectors, UTF_8, 2); macro_rules! exec { ($html:expr, $add_range:expr, $expected_payload:expr) => {{ From 8cac9b26e34fa6282f0b65713902933daebfbb9f Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 13:21:08 +0000 Subject: [PATCH 02/13] Remove runtime panics from TextEncoder --- src/rewritable_units/text_encoder.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index e366df56..7f992546 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -86,10 +86,10 @@ impl TextEncoder { if written > 0 && written <= buffer.len() { (output_handler)(&buffer[..written]); } - if read >= content.len() { - return; - } - content = &content[read..]; + content = match content.get(read..) { + Some(rest) if !rest.is_empty() => rest, + _ => return, + }; match result { CoderResult::InputEmpty => { @@ -99,11 +99,11 @@ impl TextEncoder { // we've made progress, and can try again without growing the buffer CoderResult::OutputFull if written > 0 => {} CoderResult::OutputFull => { - // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug. - assert!( - buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE, - "encoding_rs infinite loop" - ); + // this never happens, encoding_rs only needs a dozen bytes. + if buffer.len() >= Buffer::DEFAULT_HEAP_BUFFER_SIZE { + debug_assert!(false, "encoding_rs stalled"); + return; + } self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]); } } From ca68250cab67345fe928272c8ad04ba5586fca77 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 14:45:11 +0000 Subject: [PATCH 03/13] Avoid (impossible) runtime panic in encoding_to_index --- src/base/encoding.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/base/encoding.rs b/src/base/encoding.rs index 0227b308..848a5092 100644 --- a/src/base/encoding.rs +++ b/src/base/encoding.rs @@ -49,13 +49,16 @@ static ALL_ENCODINGS: [&Encoding; 40] = [ &encoding_rs::ISO_2022_JP_INIT, ]; +#[cfg_attr(debug_assertions, track_caller)] fn encoding_to_index(encoding: AsciiCompatibleEncoding) -> usize { let encoding: &'static Encoding = encoding.into(); - ALL_ENCODINGS - .iter() - .position(|&e| e == encoding) - .expect("the ALL_ENCODINGS is not complete and needs to be updated") + let index = ALL_ENCODINGS.iter().position(|&e| e == encoding); + debug_assert!( + index.is_some(), + "the ALL_ENCODINGS is not complete and needs to be updated" + ); + index.unwrap_or(0) } /// A charset encoding that can be shared and modified. @@ -71,6 +74,7 @@ pub struct SharedEncoding { impl SharedEncoding { #[must_use] + #[cfg_attr(debug_assertions, track_caller)] pub fn new(encoding: AsciiCompatibleEncoding) -> Self { Self { encoding: Arc::new(AtomicUsize::new(encoding_to_index(encoding))), @@ -84,6 +88,7 @@ impl SharedEncoding { ALL_ENCODINGS.get(encoding).unwrap_or(&ALL_ENCODINGS[0]) } + #[cfg_attr(debug_assertions, track_caller)] pub fn set(&self, encoding: AsciiCompatibleEncoding) { self.encoding .store(encoding_to_index(encoding), Ordering::Relaxed); From a217d673930fbf5d51fde2d588705b72ef3caeb5 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 14:47:46 +0000 Subject: [PATCH 04/13] Don't take address of `encoding_rs::*_INIT` --- src/base/encoding.rs | 80 ++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/src/base/encoding.rs b/src/base/encoding.rs index 848a5092..e38132da 100644 --- a/src/base/encoding.rs +++ b/src/base/encoding.rs @@ -6,47 +6,47 @@ use std::sync::atomic::{AtomicUsize, Ordering}; /// This serves as a map from integer to [`Encoding`], which allows more efficient /// sets/gets of the [`SharedEncoding`]. static ALL_ENCODINGS: [&Encoding; 40] = [ - &encoding_rs::UTF_8_INIT, - &encoding_rs::SHIFT_JIS_INIT, - &encoding_rs::BIG5_INIT, - &encoding_rs::EUC_JP_INIT, - &encoding_rs::EUC_KR_INIT, - &encoding_rs::GB18030_INIT, - &encoding_rs::GBK_INIT, - &encoding_rs::IBM866_INIT, - &encoding_rs::ISO_8859_2_INIT, - &encoding_rs::ISO_8859_3_INIT, - &encoding_rs::ISO_8859_4_INIT, - &encoding_rs::ISO_8859_5_INIT, - &encoding_rs::ISO_8859_6_INIT, - &encoding_rs::ISO_8859_7_INIT, - &encoding_rs::ISO_8859_8_I_INIT, - &encoding_rs::ISO_8859_8_INIT, - &encoding_rs::ISO_8859_10_INIT, - &encoding_rs::ISO_8859_13_INIT, - &encoding_rs::ISO_8859_14_INIT, - &encoding_rs::ISO_8859_15_INIT, - &encoding_rs::ISO_8859_16_INIT, - &encoding_rs::KOI8_R_INIT, - &encoding_rs::KOI8_U_INIT, - &encoding_rs::MACINTOSH_INIT, - &encoding_rs::WINDOWS_1250_INIT, - &encoding_rs::WINDOWS_1251_INIT, - &encoding_rs::WINDOWS_1252_INIT, - &encoding_rs::WINDOWS_1253_INIT, - &encoding_rs::WINDOWS_1254_INIT, - &encoding_rs::WINDOWS_1255_INIT, - &encoding_rs::WINDOWS_1256_INIT, - &encoding_rs::WINDOWS_1257_INIT, - &encoding_rs::WINDOWS_1258_INIT, - &encoding_rs::WINDOWS_874_INIT, - &encoding_rs::X_MAC_CYRILLIC_INIT, - &encoding_rs::X_USER_DEFINED_INIT, + encoding_rs::UTF_8, + encoding_rs::SHIFT_JIS, + encoding_rs::BIG5, + encoding_rs::EUC_JP, + encoding_rs::EUC_KR, + encoding_rs::GB18030, + encoding_rs::GBK, + encoding_rs::IBM866, + encoding_rs::ISO_8859_2, + encoding_rs::ISO_8859_3, + encoding_rs::ISO_8859_4, + encoding_rs::ISO_8859_5, + encoding_rs::ISO_8859_6, + encoding_rs::ISO_8859_7, + encoding_rs::ISO_8859_8_I, + encoding_rs::ISO_8859_8, + encoding_rs::ISO_8859_10, + encoding_rs::ISO_8859_13, + encoding_rs::ISO_8859_14, + encoding_rs::ISO_8859_15, + encoding_rs::ISO_8859_16, + encoding_rs::KOI8_R, + encoding_rs::KOI8_U, + encoding_rs::MACINTOSH, + encoding_rs::WINDOWS_1250, + encoding_rs::WINDOWS_1251, + encoding_rs::WINDOWS_1252, + encoding_rs::WINDOWS_1253, + encoding_rs::WINDOWS_1254, + encoding_rs::WINDOWS_1255, + encoding_rs::WINDOWS_1256, + encoding_rs::WINDOWS_1257, + encoding_rs::WINDOWS_1258, + encoding_rs::WINDOWS_874, + encoding_rs::X_MAC_CYRILLIC, + encoding_rs::X_USER_DEFINED, // non-ASCII-compatible - &encoding_rs::REPLACEMENT_INIT, - &encoding_rs::UTF_16BE_INIT, - &encoding_rs::UTF_16LE_INIT, - &encoding_rs::ISO_2022_JP_INIT, + encoding_rs::REPLACEMENT, + encoding_rs::UTF_16BE, + encoding_rs::UTF_16LE, + encoding_rs::ISO_2022_JP, ]; #[cfg_attr(debug_assertions, track_caller)] From fe22da6af563b8c06ff4e7079d2b0a657a671c90 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 12:00:13 +0000 Subject: [PATCH 05/13] DRY --- c-api/src/rewriter.rs | 54 +++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/c-api/src/rewriter.rs b/c-api/src/rewriter.rs index cdcbbd82..6fa9bd8d 100644 --- a/c-api/src/rewriter.rs +++ b/c-api/src/rewriter.rs @@ -34,8 +34,7 @@ impl OutputSink for ExternOutputSink { } } -#[unsafe(no_mangle)] -pub unsafe extern "C" fn lol_html_rewriter_build( +fn lol_html_rewriter_build_inner( builder: *mut HtmlRewriterBuilder, encoding: *const c_char, encoding_len: size_t, @@ -43,31 +42,34 @@ pub unsafe extern "C" fn lol_html_rewriter_build( output_sink: unsafe extern "C" fn(*const c_char, size_t, *mut c_void), output_sink_user_data: *mut c_void, strict: bool, -) -> *mut HtmlRewriter { + enable_esi_tags: bool, +) -> Result> { let builder = to_ref!(builder); let handlers = builder.get_safe_handlers(); let maybe_encoding = encoding_rs::Encoding::for_label_no_replacement(to_bytes!(encoding, encoding_len)); - let encoding = unwrap_or_ret_null! { maybe_encoding.ok_or(EncodingError::UnknownEncoding) }; + let encoding = maybe_encoding.ok_or(EncodingError::UnknownEncoding)?; let settings = Settings { element_content_handlers: handlers.element, document_content_handlers: handlers.document, - encoding: unwrap_or_ret_null! { encoding.try_into().or(Err(EncodingError::NonAsciiCompatibleEncoding)) }, + encoding: encoding + .try_into() + .or(Err(EncodingError::NonAsciiCompatibleEncoding))?, memory_settings, strict, - enable_esi_tags: false, + enable_esi_tags, adjust_charset_on_meta_tag: false, }; let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data); let rewriter = lol_html::HtmlRewriter::new(settings, output_sink); - to_ptr_mut(HtmlRewriter(Some(rewriter))) + Ok(HtmlRewriter(Some(rewriter))) } #[unsafe(no_mangle)] -pub unsafe extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags( +pub unsafe extern "C" fn lol_html_rewriter_build( builder: *mut HtmlRewriterBuilder, encoding: *const c_char, encoding_len: size_t, @@ -76,26 +78,24 @@ pub unsafe extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags( output_sink_user_data: *mut c_void, strict: bool, ) -> *mut HtmlRewriter { - let builder = to_ref!(builder); - let handlers = builder.get_safe_handlers(); - - let maybe_encoding = - encoding_rs::Encoding::for_label_no_replacement(to_bytes!(encoding, encoding_len)); - let encoding = unwrap_or_ret_null! { maybe_encoding.ok_or(EncodingError::UnknownEncoding) }; - let settings = Settings { - element_content_handlers: handlers.element, - document_content_handlers: handlers.document, - encoding: unwrap_or_ret_null! { encoding.try_into().or(Err(EncodingError::NonAsciiCompatibleEncoding)) }, - memory_settings, - strict, - enable_esi_tags: true, - adjust_charset_on_meta_tag: false, - }; - - let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data); - let rewriter = lol_html::HtmlRewriter::new(settings, output_sink); + to_ptr_mut(unwrap_or_ret_null! { + lol_html_rewriter_build_inner(builder, encoding, encoding_len, memory_settings, output_sink, output_sink_user_data, strict, false) + }) +} - to_ptr_mut(HtmlRewriter(Some(rewriter))) +#[unsafe(no_mangle)] +pub unsafe extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags( + builder: *mut HtmlRewriterBuilder, + encoding: *const c_char, + encoding_len: size_t, + memory_settings: MemorySettings, + output_sink: unsafe extern "C" fn(*const c_char, size_t, *mut c_void), + output_sink_user_data: *mut c_void, + strict: bool, +) -> *mut HtmlRewriter { + to_ptr_mut(unwrap_or_ret_null! { + lol_html_rewriter_build_inner(builder, encoding, encoding_len, memory_settings, output_sink, output_sink_user_data, strict, true) + }) } #[unsafe(no_mangle)] From 2e88578bdd6c457088978b9bd1b7655b38f86536 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 15:20:04 +0000 Subject: [PATCH 06/13] Don't panic at FFI boundary --- c-api/src/lib.rs | 20 ++++++++++++++++++++ c-api/src/rewriter.rs | 12 ++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/c-api/src/lib.rs b/c-api/src/lib.rs index 9bffe20e..790ee041 100644 --- a/c-api/src/lib.rs +++ b/c-api/src/lib.rs @@ -74,6 +74,26 @@ macro_rules! unwrap_or_ret_null { }; } +#[cold] +fn panic_err(payload: Box) -> Box { + if let Some(&s) = payload.downcast_ref::<&str>() { + Box::from(s) + } else if let Ok(s) = payload.downcast::() { + Box::from(*s) + } else { + Box::from("panic") // never happens + } +} + +fn catch_panic( + callback: impl FnOnce() -> Result, +) -> Result> +where + Box: From, +{ + Ok(std::panic::catch_unwind(std::panic::AssertUnwindSafe(callback)).map_err(panic_err)??) +} + macro_rules! impl_content_mutation_handlers { ($name:ident: $typ:ty [ $($(#[$meta:meta])* $(@$kind:ident)? $fn_name:ident => $method:ident),+$(,)? ]) => { $( diff --git a/c-api/src/rewriter.rs b/c-api/src/rewriter.rs index 6fa9bd8d..6f1d0467 100644 --- a/c-api/src/rewriter.rs +++ b/c-api/src/rewriter.rs @@ -78,9 +78,9 @@ pub unsafe extern "C" fn lol_html_rewriter_build( output_sink_user_data: *mut c_void, strict: bool, ) -> *mut HtmlRewriter { - to_ptr_mut(unwrap_or_ret_null! { + to_ptr_mut(unwrap_or_ret_null! { catch_panic(move || { lol_html_rewriter_build_inner(builder, encoding, encoding_len, memory_settings, output_sink, output_sink_user_data, strict, false) - }) + })}) } #[unsafe(no_mangle)] @@ -93,9 +93,9 @@ pub unsafe extern "C" fn unstable_lol_html_rewriter_build_with_esi_tags( output_sink_user_data: *mut c_void, strict: bool, ) -> *mut HtmlRewriter { - to_ptr_mut(unwrap_or_ret_null! { + to_ptr_mut(unwrap_or_ret_null! { catch_panic(move || { lol_html_rewriter_build_inner(builder, encoding, encoding_len, memory_settings, output_sink, output_sink_user_data, strict, true) - }) + })}) } #[unsafe(no_mangle)] @@ -110,7 +110,7 @@ pub unsafe extern "C" fn lol_html_rewriter_write( .as_mut() .expect("cannot call `lol_html_rewriter_write` after calling `end()`"); - unwrap_or_ret_err_code! { rewriter.write(chunk) }; + unwrap_or_ret_err_code! { catch_panic(move || rewriter.write(chunk)) }; 0 } @@ -122,7 +122,7 @@ pub unsafe extern "C" fn lol_html_rewriter_end(rewriter: *mut HtmlRewriter) -> c .take() // Using `take()` allows calling `free()` afterwards (it will be a no-op). .expect("cannot call `lol_html_rewriter_end` after calling `end()`"); - unwrap_or_ret_err_code! { rewriter.end() }; + unwrap_or_ret_err_code! { catch_panic(move || rewriter.end()) }; 0 } From e39f16ec3d98d9865f70c25f34351c89e4cecdfc Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 18:26:42 +0000 Subject: [PATCH 07/13] Panic-free take_last_error --- c-api/src/element.rs | 2 +- c-api/src/errors.rs | 17 +++++++++++++---- c-api/src/lib.rs | 2 +- c-api/src/string.rs | 12 +++++++++--- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/c-api/src/element.rs b/c-api/src/element.rs index f652e949..b8909e9b 100644 --- a/c-api/src/element.rs +++ b/c-api/src/element.rs @@ -132,7 +132,7 @@ pub unsafe extern "C" fn lol_html_element_get_attribute( name_len: size_t, ) -> Str { let element = to_ref!(element); - let name = unwrap_or_ret!(to_str!(name, name_len), Str::from_opt(None)); + let name = unwrap_or_ret!(to_str!(name, name_len), Str::EMPTY); Str::from_opt(element.get_attribute(name)) } diff --git a/c-api/src/errors.rs b/c-api/src/errors.rs index a06f9929..c1a4ac0e 100644 --- a/c-api/src/errors.rs +++ b/c-api/src/errors.rs @@ -1,15 +1,24 @@ use super::*; -use std::error::Error; thread_local! { - pub static LAST_ERROR: RefCell>> = RefCell::new(None); + pub static LAST_ERROR: RefCell>> = RefCell::new(None); } #[unsafe(no_mangle)] pub extern "C" fn lol_html_take_last_error() -> Str { - let err = LAST_ERROR.with(|e| e.borrow_mut().take()); + Str::from_opt( + LAST_ERROR + .try_with(|e| e.try_borrow_mut().ok()?.take()) + .ok() + .flatten(), + ) +} - Str::from_opt(err.map(|e| e.to_string())) +#[cold] +#[inline(never)] +pub(crate) fn save_last_error(err: String) { + let err = Some(err.into_boxed_str()); + let _ = crate::errors::LAST_ERROR.try_with(|e| e.try_borrow_mut().map(|mut v| *v = err)); } #[derive(Error, Debug, Eq, PartialEq, Copy, Clone)] diff --git a/c-api/src/lib.rs b/c-api/src/lib.rs index 790ee041..f806cc6d 100644 --- a/c-api/src/lib.rs +++ b/c-api/src/lib.rs @@ -55,7 +55,7 @@ macro_rules! unwrap_or_ret { match $expr { Ok(v) => v, Err(err) => { - crate::errors::LAST_ERROR.with(|e| *e.borrow_mut() = Some(err.into())); + crate::errors::save_last_error(err.to_string()); return $ret_val; } } diff --git a/c-api/src/string.rs b/c-api/src/string.rs index 3511a7f5..18696f69 100644 --- a/c-api/src/string.rs +++ b/c-api/src/string.rs @@ -9,11 +9,17 @@ pub struct Str { } impl Str { + pub const EMPTY: Self = Self { + data: std::ptr::null(), + len: 0, + }; + #[must_use] - pub fn new(string: String) -> Self { + pub fn new(string: impl Into>) -> Self { + let string = string.into(); Self { len: string.len(), - data: Box::into_raw(string.into_boxed_str()) as *const c_char, + data: Box::into_raw(string).cast::(), } } @@ -22,7 +28,7 @@ impl Str { /// If `string` is `None`, `data` will be set to `NULL`. #[inline] #[must_use] - pub fn from_opt(string: Option) -> Self { + pub fn from_opt(string: Option>>) -> Self { match string { Some(string) => Self::new(string), None => Self { From 6c4af5709b42ca181c54282a375bd97ab3605acc Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 15:20:32 +0000 Subject: [PATCH 08/13] Debug assert handler refcounts --- src/rewriter/handlers_dispatcher.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/rewriter/handlers_dispatcher.rs b/src/rewriter/handlers_dispatcher.rs index f859dc94..4b96d207 100644 --- a/src/rewriter/handlers_dispatcher.rs +++ b/src/rewriter/handlers_dispatcher.rs @@ -48,13 +48,23 @@ impl HandlerVec { #[inline] pub fn inc_user_count(&mut self, idx: usize) { - self.items[idx].user_count += 1; + let Some(item) = self.items.get_mut(idx) else { + debug_assert!(false); + return; + }; + item.user_count += 1; self.user_count += 1; } #[inline] pub fn dec_user_count(&mut self, idx: usize) { - self.items[idx].user_count -= 1; + let Some(item) = self.items.get_mut(idx) else { + debug_assert!(false); + return; + }; + debug_assert!(item.user_count > 0); + debug_assert!(self.user_count > 0); + item.user_count -= 1; self.user_count -= 1; } From ea20a3c8464758fb8ccbd30b2c794c9540f6c15a Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 15:51:13 +0000 Subject: [PATCH 09/13] Panic-free AttributeMatcher --- src/selectors_vm/attribute_matcher.rs | 76 +++++++++++++-------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/src/selectors_vm/attribute_matcher.rs b/src/selectors_vm/attribute_matcher.rs index 2cf46b5c..4ae2f4e5 100644 --- a/src/selectors_vm/attribute_matcher.rs +++ b/src/selectors_vm/attribute_matcher.rs @@ -62,19 +62,16 @@ impl<'i> AttributeMatcher<'i> { self.attributes .iter() .find(|&a| { - if lowercased_name.len() != a.name.end - a.name.start { + let Some(attr_name) = self.input.opt_slice(Some(a.name)) else { + return false; + }; + if attr_name.len() != lowercased_name.len() { return false; } - - let attr_name = self.input.slice(a.name); - - for i in 0..attr_name.len() { - if attr_name[i].to_ascii_lowercase() != lowercased_name[i] { - return false; - } - } - - true + attr_name + .iter() + .map(|c| c.to_ascii_lowercase()) + .eq(lowercased_name.iter().copied()) }) .copied() } @@ -144,7 +141,9 @@ impl<'i> AttributeMatcher<'i> { !actual_value.is_empty() && actual_value.len() >= prefix_len - && case_sensitivity.eq(&actual_value[..prefix_len], &operand.value) + && actual_value + .get(..prefix_len) + .is_some_and(|prefix| case_sensitivity.eq(prefix, &operand.value)) }) } @@ -160,7 +159,9 @@ impl<'i> AttributeMatcher<'i> { let prefix_len = operand.value.len(); actual_value.get(prefix_len) == Some(&b'-') - && case_sensitivity.eq(&actual_value[..prefix_len], &operand.value) + && actual_value + .get(..prefix_len) + .is_some_and(|prefix| case_sensitivity.eq(prefix, &operand.value)) }) } @@ -174,45 +175,42 @@ impl<'i> AttributeMatcher<'i> { !actual_value.is_empty() && value_len >= suffix_len - && case_sensitivity.eq(&actual_value[value_len - suffix_len..], &operand.value) + && actual_value + .get(value_len - suffix_len..) + .is_some_and(|prefix| case_sensitivity.eq(prefix, &operand.value)) }) } #[inline] pub fn has_attr_with_substring(&self, operand: &AttrExprOperands) -> bool { self.value_matches(&operand.name, |actual_value| { - let case_sensitivity = to_unconditional(operand.case_sensitivity, self.is_html_element); - let Some((&first_byte, rest)) = operand.value.split_first() else { return false; }; - let first_byte_searcher: &dyn Fn(_) -> _ = match case_sensitivity { - CaseSensitivity::CaseSensitive => &move |h| memchr(first_byte, h), - CaseSensitivity::AsciiCaseInsensitive => { - let lo = first_byte.to_ascii_lowercase(); - let up = first_byte.to_ascii_uppercase(); - - &move |h| memchr2(lo, up, h) + fn search( + mut haystack: &[u8], + rest: &[u8], + case_sensitivity: CaseSensitivity, + first_byte_searcher: impl Fn(&[u8]) -> Option, + ) -> Option<()> { + loop { + haystack = haystack.get(first_byte_searcher(haystack)? + 1..)?; + if case_sensitivity.eq(haystack.get(..rest.len())?, rest) { + return Some(()); + } } - }; - - let mut haystack = actual_value; - - loop { - match first_byte_searcher(haystack) { - Some(pos) => { - haystack = &haystack[pos + 1..]; + } - if haystack.len() < rest.len() { - return false; - } + match to_unconditional(operand.case_sensitivity, self.is_html_element) { + case @ CaseSensitivity::CaseSensitive => { + search(actual_value, rest, case, move |h| memchr(first_byte, h)).is_some() + } + case @ CaseSensitivity::AsciiCaseInsensitive => { + let lo = first_byte.to_ascii_lowercase(); + let up = first_byte.to_ascii_uppercase(); - if case_sensitivity.eq(&haystack[..rest.len()], rest) { - return true; - } - } - None => return false, + search(actual_value, rest, case, move |h| memchr2(lo, up, h)).is_some() } } }) From c935bfc5b9fb536bee08e6671df80769ed33c929 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 16:34:43 +0000 Subject: [PATCH 10/13] Handle preallocated_size without panics --- src/memory/arena.rs | 20 ++++++++++++++------ src/memory/limiter.rs | 20 -------------------- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/src/memory/arena.rs b/src/memory/arena.rs index 5ebe2286..e589395d 100644 --- a/src/memory/arena.rs +++ b/src/memory/arena.rs @@ -10,12 +10,20 @@ pub(crate) struct Arena { impl Arena { pub fn new(limiter: SharedMemoryLimiter, preallocated_size: usize) -> Self { - limiter.preallocate(preallocated_size); - - Self { - limiter, - data: Vec::with_capacity(preallocated_size), - } + let mut data = Vec::new(); + + let preallocated = limiter + .increase_usage(preallocated_size) + .ok() + .and_then(|_| data.try_reserve_exact(preallocated_size).ok()) + .is_some(); + // HtmlRewriter::new() has no way to report this + debug_assert!( + preallocated, + "Total preallocated memory size should be less than `MemorySettings::max_allowed_memory_usage`." + ); + + Self { limiter, data } } pub fn append(&mut self, slice: &[u8]) -> Result<(), MemoryLimitExceededError> { diff --git a/src/memory/limiter.rs b/src/memory/limiter.rs index bb831620..7ef3af89 100644 --- a/src/memory/limiter.rs +++ b/src/memory/limiter.rs @@ -44,13 +44,6 @@ impl SharedMemoryLimiter { } } - #[inline] - pub fn preallocate(&self, byte_count: usize) { - self.increase_usage(byte_count).expect( - "Total preallocated memory size should be less than `MemorySettings::max_allowed_memory_usage`.", - ); - } - #[inline] pub fn decrease_usage(&self, byte_count: usize) { self.current_usage.fetch_sub(byte_count, Ordering::Relaxed); @@ -80,17 +73,4 @@ mod tests { assert_eq!(err, MemoryLimitExceededError); } - - #[test] - #[should_panic( - expected = "Total preallocated memory size should be less than `MemorySettings::max_allowed_memory_usage`." - )] - fn preallocate() { - let limiter = SharedMemoryLimiter::new(10); - - limiter.preallocate(8); - assert_eq!(limiter.current_usage(), 8); - - limiter.preallocate(10); - } } From a2d25b12e62eb20d1e49cfe60207fbeedae6eb81 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 16:42:15 +0000 Subject: [PATCH 11/13] Avoid panicking OnceCell init --- src/base/bytes.rs | 2 +- src/rewritable_units/tokens/attributes.rs | 29 ++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/base/bytes.rs b/src/base/bytes.rs index 8d4ef661..971e8cf3 100644 --- a/src/base/bytes.rs +++ b/src/base/bytes.rs @@ -11,7 +11,7 @@ use std::str; pub struct HasReplacementsError; /// A thin wrapper around byte slice with handy APIs attached -#[derive(Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, Hash, Default)] #[repr(transparent)] pub(crate) struct Bytes<'b>(&'b [u8]); diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index 35c6b573..03c539a9 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -215,14 +215,27 @@ impl<'i> Attributes<'i> { false } + #[inline(never)] fn init_items(&self) -> Vec> { + let cant_fail = || { + debug_assert!(false); + Bytes::default() + }; self.attribute_buffer .iter() .map(|a| { Attribute::new( - self.input.slice(a.name).into(), - self.input.slice(a.value).into(), - self.input.slice(a.raw_range), + self.input + .opt_slice(Some(a.name)) + .unwrap_or_else(cant_fail) + .into(), + self.input + .opt_slice(Some(a.value)) + .unwrap_or_else(cant_fail) + .into(), + self.input + .opt_slice(Some(a.raw_range)) + .unwrap_or_else(cant_fail), self.encoding, ) }) @@ -231,9 +244,13 @@ impl<'i> Attributes<'i> { #[inline] fn as_mut_vec(&mut self) -> &mut Vec> { - let _ = self.items.get_or_init(|| self.init_items()); - - self.items.get_mut().expect("Items should be initialized") + if self.items.get().is_none() { + // `get_mut_or_init` is unstable and has a pointless re-entrancy check + let cell = OnceCell::new(); + let _ = cell.set(self.init_items()); + self.items = cell; // optimizes out get_mut() + } + self.items.get_mut().unwrap_or_else(|| unreachable!()) } #[cfg(test)] From 7e8828e1c5c0ef8e5a88baf17b36c64d1d64fce6 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 20 Feb 2026 18:39:58 +0000 Subject: [PATCH 12/13] Simplify attribute serializer --- src/rewritable_units/tokens/attributes.rs | 37 ++++++++++------------- src/rewritable_units/tokens/start_tag.rs | 9 +++--- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index 03c539a9..e0f62740 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -6,7 +6,6 @@ use crate::rewritable_units::Serialize; use encoding_rs::Encoding; use std::cell::OnceCell; use std::fmt::{self, Debug}; -use std::ops::Deref; use thiserror::Error; /// An error that occurs when invalid value is provided for the attribute name. @@ -215,6 +214,14 @@ impl<'i> Attributes<'i> { false } + pub fn is_empty(&self) -> bool { + // check without materializing items + self.items + .get() + .map(|items| items.is_empty()) + .unwrap_or(self.attribute_buffer.is_empty()) + } + #[inline(never)] fn init_items(&self) -> Vec> { let cant_fail = || { @@ -242,6 +249,10 @@ impl<'i> Attributes<'i> { .collect() } + pub(crate) fn to_slice(&self) -> &[Attribute<'i>] { + self.items.get_or_init(|| self.init_items()) + } + #[inline] fn as_mut_vec(&mut self) -> &mut Vec> { if self.items.get().is_none() { @@ -259,28 +270,12 @@ impl<'i> Attributes<'i> { } } -impl<'i> Deref for Attributes<'i> { - type Target = [Attribute<'i>]; - - #[inline] - fn deref(&self) -> &[Attribute<'i>] { - self.items.get_or_init(|| self.init_items()) - } -} - -impl Serialize for &Attributes<'_> { +impl Serialize for &mut Attributes<'_> { #[inline] fn into_bytes(self, output_handler: &mut dyn FnMut(&[u8])) -> Result<(), RewritingError> { - if !self.is_empty() { - let last = self.len() - 1; - - for (idx, attr) in self.iter().enumerate() { - attr.into_bytes(output_handler)?; - - if idx != last { - output_handler(b" "); - } - } + for attr in self.as_mut_vec() { + output_handler(b" "); + attr.into_bytes(output_handler)?; } Ok(()) } diff --git a/src/rewritable_units/tokens/start_tag.rs b/src/rewritable_units/tokens/start_tag.rs index c7ba6bd8..aef9b6da 100644 --- a/src/rewritable_units/tokens/start_tag.rs +++ b/src/rewritable_units/tokens/start_tag.rs @@ -94,7 +94,7 @@ impl<'input_token> StartTag<'input_token> { /// Returns an immutable collection of tag's attributes. #[inline] pub fn attributes(&self) -> &[Attribute<'input_token>] { - &self.attributes + self.attributes.to_slice() } /// Sets `value` of tag's attribute with `name`. The value may have HTML/XML entities. @@ -215,7 +215,10 @@ impl<'input_token> StartTag<'input_token> { self.mutations.mutate().remove(); } - fn serialize_self(&self, sink: &mut StreamingHandlerSink<'_>) -> Result<(), RewritingError> { + fn serialize_self( + &mut self, + sink: &mut StreamingHandlerSink<'_>, + ) -> Result<(), RewritingError> { let output_handler = sink.output_handler(); if let Some(raw) = self.raw.original() { @@ -227,8 +230,6 @@ impl<'input_token> StartTag<'input_token> { output_handler(&self.name); if !self.attributes.is_empty() { - output_handler(b" "); - self.attributes.into_bytes(output_handler)?; // NOTE: attributes can be modified the way that From f0452f8165de68ddac30a25e1ac568c3f733eb9e Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 21 Feb 2026 00:03:24 +0000 Subject: [PATCH 13/13] Clippy --- c-api/src/errors.rs | 2 +- c-api/src/rewriter.rs | 1 + src/memory/arena.rs | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/c-api/src/errors.rs b/c-api/src/errors.rs index c1a4ac0e..67941406 100644 --- a/c-api/src/errors.rs +++ b/c-api/src/errors.rs @@ -1,7 +1,7 @@ use super::*; thread_local! { - pub static LAST_ERROR: RefCell>> = RefCell::new(None); + pub static LAST_ERROR: RefCell>> = const { RefCell::new(None) }; } #[unsafe(no_mangle)] diff --git a/c-api/src/rewriter.rs b/c-api/src/rewriter.rs index 6f1d0467..402a5ea7 100644 --- a/c-api/src/rewriter.rs +++ b/c-api/src/rewriter.rs @@ -34,6 +34,7 @@ impl OutputSink for ExternOutputSink { } } +#[allow(clippy::too_many_arguments)] fn lol_html_rewriter_build_inner( builder: *mut HtmlRewriterBuilder, encoding: *const c_char, diff --git a/src/memory/arena.rs b/src/memory/arena.rs index e589395d..7adfcc7b 100644 --- a/src/memory/arena.rs +++ b/src/memory/arena.rs @@ -15,7 +15,7 @@ impl Arena { let preallocated = limiter .increase_usage(preallocated_size) .ok() - .and_then(|_| data.try_reserve_exact(preallocated_size).ok()) + .and_then(|()| data.try_reserve_exact(preallocated_size).ok()) .is_some(); // HtmlRewriter::new() has no way to report this debug_assert!(