From 5a2c676041fee738d3d7796ebb2d55ef4072f42d Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 21:47:45 +0000 Subject: [PATCH 1/7] parse: Make PAX/GNU extension handling truly zero-copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace PendingMetadata (which stored extension data as Vec) with a Copy struct holding borrowed &'a [u8] slices from the input buffer. Extension data is now threaded through parse_header → handle_extension → emit_entry as function parameters instead of being stored in the Parser struct. This eliminates all heap allocations during extension processing. PAX- derived paths, link targets, user/group names, and xattrs are now Cow::Borrowed directly from the input, and ParsedEntry.pax is &'a [u8] instead of Vec. The save/restore pattern on NeedData is also gone since no state is written to self during extension handling. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 344 ++++++++++++++++++++++++--------------------------- 1 file changed, 165 insertions(+), 179 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index ff54de1..6e9794e 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -388,7 +388,7 @@ pub enum ParseEvent<'a> { /// Number of bytes consumed from the input (header + padded content). consumed: usize, /// The raw PAX key-value data from the global header. - pax_data: Vec, + pax_data: &'a [u8], }, /// Archive end marker reached (two consecutive zero blocks, or clean EOF). @@ -497,7 +497,7 @@ pub struct ParsedEntry<'a> { /// so that callers can iterate all PAX key-value pairs (not just the ones /// tar-core resolves into struct fields). Parse it with /// [`PaxExtensions::new`](crate::PaxExtensions::new). - pub pax: Option>, + pub pax: Option<&'a [u8]>, } impl<'a> ParsedEntry<'a> { @@ -563,12 +563,17 @@ enum ExtensionKind { Pax, } -/// Pending metadata from GNU/PAX extension entries. -#[derive(Debug, Default, Clone)] -struct PendingMetadata { - gnu_long_name: Option>, - gnu_long_link: Option>, - pax_extensions: Option>, +/// Borrowed extension data accumulated during recursive extension processing. +/// +/// This is threaded through `parse_header` → `handle_extension` → `parse_header` +/// calls within a single `parse()` invocation. Since extension chains are always +/// fully resolved within one call (or discarded on `NeedData`), we can borrow +/// directly from the input slice — no allocation needed. +#[derive(Debug, Default, Clone, Copy)] +struct PendingMetadata<'a> { + gnu_long_name: Option<&'a [u8]>, + gnu_long_link: Option<&'a [u8]>, + pax_extensions: Option<&'a [u8]>, count: usize, } @@ -582,18 +587,84 @@ struct SparseContext { ext_consumed: usize, } -impl PendingMetadata { +impl PendingMetadata<'_> { fn is_empty(&self) -> bool { self.gnu_long_name.is_none() && self.gnu_long_link.is_none() && self.pax_extensions.is_none() } +} - fn clear(&mut self) { - self.gnu_long_name = None; - self.gnu_long_link = None; - self.pax_extensions = None; - self.count = 0; +/// Check PAX extensions for GNU sparse version. +/// +/// Returns `Some((major, minor))` if `GNU.sparse.major` and +/// `GNU.sparse.minor` are both present and parseable, `None` if +/// the keys are absent. In strict mode, malformed values produce +/// errors instead of being silently ignored. +fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { + let mut major = None; + let mut minor = None; + for ext in PaxExtensions::new(pax) { + let ext = ext?; + let key = match ext.key() { + Ok(k) => k, + Err(_) if !strict => continue, + Err(e) => return Err(ParseError::from(e)), + }; + match key { + PAX_GNU_SPARSE_MAJOR => { + let s = match ext.value() { + Ok(s) => s, + Err(_) if !strict => continue, + Err(_) => { + return Err(ParseError::InvalidPaxValue { + key: PAX_GNU_SPARSE_MAJOR, + value: String::from_utf8_lossy(ext.value_bytes()).into(), + }) + } + }; + match s.parse::() { + Ok(v) => major = Some(v), + Err(_) if !strict => {} + Err(_) => { + return Err(ParseError::InvalidPaxValue { + key: PAX_GNU_SPARSE_MAJOR, + value: s.into(), + }) + } + } + } + PAX_GNU_SPARSE_MINOR => { + let s = match ext.value() { + Ok(s) => s, + Err(_) if !strict => continue, + Err(_) => { + return Err(ParseError::InvalidPaxValue { + key: PAX_GNU_SPARSE_MINOR, + value: String::from_utf8_lossy(ext.value_bytes()).into(), + }) + } + }; + match s.parse::() { + Ok(v) => minor = Some(v), + Err(_) if !strict => {} + Err(_) => { + return Err(ParseError::InvalidPaxValue { + key: PAX_GNU_SPARSE_MINOR, + value: s.into(), + }) + } + } + } + _ => {} + } + if major.is_some() && minor.is_some() { + break; + } + } + match (major, minor) { + (Some(maj), Some(min)) => Ok(Some((maj, min))), + _ => Ok(None), } } @@ -640,7 +711,6 @@ impl PendingMetadata { pub struct Parser { limits: Limits, state: State, - pending: PendingMetadata, /// When true, entries with empty paths are allowed through instead of /// returning [`ParseError::EmptyPath`]. allow_empty_path: bool, @@ -659,7 +729,6 @@ impl Parser { Self { limits, state: State::ReadHeader, - pending: PendingMetadata::default(), allow_empty_path: false, verify_checksums: true, } @@ -720,12 +789,16 @@ impl Parser { pub fn parse<'a>(&mut self, input: &'a [u8]) -> Result> { match self.state { State::Done => Ok(ParseEvent::End { consumed: 0 }), - State::ReadHeader => self.parse_header(input), + State::ReadHeader => self.parse_header(input, PendingMetadata::default()), } } /// Parse a header from the input. - fn parse_header<'a>(&mut self, input: &'a [u8]) -> Result> { + fn parse_header<'a>( + &mut self, + input: &'a [u8], + slices: PendingMetadata<'a>, + ) -> Result> { // Need at least one header block if input.len() < HEADER_SIZE { return Ok(ParseEvent::NeedData { @@ -752,7 +825,7 @@ impl Parser { let second_block = &input[HEADER_SIZE..2 * HEADER_SIZE]; if second_block.iter().all(|&b| b == 0) { self.state = State::Done; - if !self.pending.is_empty() { + if !slices.is_empty() { return Err(ParseError::OrphanedMetadata); } return Ok(ParseEvent::End { @@ -762,14 +835,14 @@ impl Parser { // Not end of archive — single stray zero block; skip it and // continue with the next block as a header. return self - .parse_header(&input[HEADER_SIZE..]) + .parse_header(&input[HEADER_SIZE..], slices) .map(|e| e.add_consumed(HEADER_SIZE)); } // Check pending entry limit - if self.pending.count > self.limits.max_pending_entries { + if slices.count > self.limits.max_pending_entries { return Err(ParseError::TooManyPendingEntries { - count: self.pending.count, + count: slices.count, limit: self.limits.max_pending_entries, }); } @@ -794,13 +867,13 @@ impl Parser { let is_extension_format = header.is_gnu() || header.is_ustar(); match entry_type { EntryType::GnuLongName if is_extension_format => { - self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName) + self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName, slices) } EntryType::GnuLongLink if is_extension_format => { - self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink) + self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink, slices) } EntryType::XHeader if is_extension_format => { - self.handle_extension(input, size, padded_size, ExtensionKind::Pax) + self.handle_extension(input, size, padded_size, ExtensionKind::Pax, slices) } // Global PAX headers (type 'g') are defined by POSIX // independently of the UStar/GNU magic, so we always handle @@ -826,7 +899,7 @@ impl Parser { let content_start = HEADER_SIZE; let content_end = content_start + size as usize; - let pax_data = input[content_start..content_end].to_vec(); + let pax_data = &input[content_start..content_end]; Ok(ParseEvent::GlobalExtensions { consumed: total_size as usize, @@ -834,17 +907,22 @@ impl Parser { }) } EntryType::GnuSparse if is_extension_format => { - self.handle_gnu_sparse(input, header, size) + self.handle_gnu_sparse(input, header, size, slices) } _ => { // Check for PAX v1.0 sparse before emitting — it requires // reading the sparse map from the data stream. - if self.pending_pax_sparse_version()? == Some((1, 0)) { - self.handle_pax_sparse_v1(input, header, size) + let sparse_version = if let Some(pax) = slices.pax_extensions { + pax_sparse_version(pax, self.limits.strict)? + } else { + None + }; + if sparse_version == Some((1, 0)) { + self.handle_pax_sparse_v1(input, header, size, slices) } else { // Actual entry — emit_entry handles v0.0/v0.1 PAX sparse // inline during PAX extension processing. - self.emit_entry(header, size, None) + self.emit_entry(header, size, None, slices) } } } @@ -852,25 +930,25 @@ impl Parser { /// Process a GNU long name/link or PAX extension entry. /// - /// Extracts the extension data, adds it to pending metadata, and - /// recurses to parse the next header. If the recursive call returns - /// NeedData (not enough input for the following header), the pending - /// state is restored so the caller can retry with more data and - /// re-parse the entire extension chain from scratch. + /// Extracts the extension data as a borrowed slice (zero-copy), adds it + /// to `slices`, and recurses to parse the next header. No state is stored + /// in `self`, so on `NeedData` the recursion simply unwinds — the caller + /// retries from scratch, re-parsing the extension chain. fn handle_extension<'a>( &mut self, input: &'a [u8], size: u64, padded_size: u64, kind: ExtensionKind, + slices: PendingMetadata<'a>, ) -> Result> { // Check for duplicate - let slot = match kind { - ExtensionKind::GnuLongName => &self.pending.gnu_long_name, - ExtensionKind::GnuLongLink => &self.pending.gnu_long_link, - ExtensionKind::Pax => &self.pending.pax_extensions, + let has_dup = match kind { + ExtensionKind::GnuLongName => slices.gnu_long_name.is_some(), + ExtensionKind::GnuLongLink => slices.gnu_long_link.is_some(), + ExtensionKind::Pax => slices.pax_extensions.is_some(), }; - if slot.is_some() { + if has_dup { return Err(match kind { ExtensionKind::GnuLongName => ParseError::DuplicateGnuLongName, ExtensionKind::GnuLongLink => ParseError::DuplicateGnuLongLink, @@ -909,18 +987,18 @@ impl Parser { }); } - // Extract content + // Extract content as a borrowed slice (zero-copy) let content_start = HEADER_SIZE; let content_end = content_start + size as usize; - let mut data = input[content_start..content_end].to_vec(); + let mut data: &'a [u8] = &input[content_start..content_end]; // Strip trailing null for GNU long name/link if matches!( kind, ExtensionKind::GnuLongName | ExtensionKind::GnuLongLink ) { - if data.last() == Some(&0) { - data.pop(); + if let Some(trimmed) = data.strip_suffix(&[0]) { + data = trimmed; } if data.len() > self.limits.max_path_len { return Err(ParseError::PathTooLong { @@ -930,107 +1008,19 @@ impl Parser { } } - // Save current pending state, apply the new extension data, - // and recurse. If the recursive call needs more data, restore - // the saved state so the caller can retry from scratch. - let saved = core::mem::take(&mut self.pending); - self.pending = PendingMetadata { - count: saved.count + 1, - ..saved.clone() - }; - let slot = match kind { - ExtensionKind::GnuLongName => &mut self.pending.gnu_long_name, - ExtensionKind::GnuLongLink => &mut self.pending.gnu_long_link, - ExtensionKind::Pax => &mut self.pending.pax_extensions, + // Build new pending slices with the added extension data + let mut new_slices = PendingMetadata { + count: slices.count + 1, + ..slices }; - *slot = Some(data); - - let result = self - .parse_header(&input[total_size as usize..]) - .map(|e| e.add_consumed(total_size as usize)); - - if matches!(result, Ok(ParseEvent::NeedData { .. })) { - self.pending = saved; + match kind { + ExtensionKind::GnuLongName => new_slices.gnu_long_name = Some(data), + ExtensionKind::GnuLongLink => new_slices.gnu_long_link = Some(data), + ExtensionKind::Pax => new_slices.pax_extensions = Some(data), } - result - } - /// Check pending PAX extensions for GNU sparse version. - /// - /// Returns `Some((major, minor))` if `GNU.sparse.major` and - /// `GNU.sparse.minor` are both present and parseable, `None` if - /// the keys are absent. In strict mode, malformed values produce - /// errors instead of being silently ignored. - fn pending_pax_sparse_version(&self) -> Result> { - let pax = match self.pending.pax_extensions.as_ref() { - Some(p) => p, - None => return Ok(None), - }; - let strict = self.limits.strict; - let mut major = None; - let mut minor = None; - for ext in PaxExtensions::new(pax) { - let ext = ext?; - let key = match ext.key() { - Ok(k) => k, - Err(_) if !strict => continue, - Err(e) => return Err(ParseError::from(e)), - }; - match key { - PAX_GNU_SPARSE_MAJOR => { - let s = match ext.value() { - Ok(s) => s, - Err(_) if !strict => continue, - Err(_) => { - return Err(ParseError::InvalidPaxValue { - key: PAX_GNU_SPARSE_MAJOR, - value: String::from_utf8_lossy(ext.value_bytes()).into(), - }) - } - }; - match s.parse::() { - Ok(v) => major = Some(v), - Err(_) if !strict => {} - Err(_) => { - return Err(ParseError::InvalidPaxValue { - key: PAX_GNU_SPARSE_MAJOR, - value: s.into(), - }) - } - } - } - PAX_GNU_SPARSE_MINOR => { - let s = match ext.value() { - Ok(s) => s, - Err(_) if !strict => continue, - Err(_) => { - return Err(ParseError::InvalidPaxValue { - key: PAX_GNU_SPARSE_MINOR, - value: String::from_utf8_lossy(ext.value_bytes()).into(), - }) - } - }; - match s.parse::() { - Ok(v) => minor = Some(v), - Err(_) if !strict => {} - Err(_) => { - return Err(ParseError::InvalidPaxValue { - key: PAX_GNU_SPARSE_MINOR, - value: s.into(), - }) - } - } - } - _ => {} - } - if major.is_some() && minor.is_some() { - break; - } - } - match (major, minor) { - (Some(maj), Some(min)) => Ok(Some((maj, min))), - _ => Ok(None), - } + self.parse_header(&input[total_size as usize..], new_slices) + .map(|e| e.add_consumed(total_size as usize)) } /// Handle a PAX v1.0 sparse entry. @@ -1052,12 +1042,11 @@ impl Parser { input: &'a [u8], header: &'a Header, size: u64, + slices: PendingMetadata<'a>, ) -> Result> { // Extract sparse metadata from PAX extensions. - let pax = self - .pending + let pax = slices .pax_extensions - .as_ref() .ok_or_else(|| ParseError::InvalidPaxSparseMap("missing PAX extensions".into()))?; let strict = self.limits.strict; @@ -1094,7 +1083,7 @@ impl Parser { } } PAX_GNU_SPARSE_NAME => { - sparse_name = Some(ext.value_bytes().to_vec()); + sparse_name = Some(ext.value_bytes()); } _ => {} } @@ -1202,18 +1191,18 @@ impl Parser { ext_consumed: map_size, }; - // Override the path with GNU.sparse.name if present. - if let Some(name) = sparse_name { - // Stash in pending so emit_entry picks it up. We don't use - // gnu_long_name because that has different semantics (no - // trailing NUL). Instead we'll handle it after emit_entry - // returns, or we can pass it through the sparse context. - // Actually, the cleanest way: temporarily store it in - // gnu_long_name so emit_entry applies it. - self.pending.gnu_long_name = Some(name); - } + // Override the path with GNU.sparse.name if present by + // stashing it in the slices so emit_entry picks it up. + let slices = if let Some(name) = sparse_name { + PendingMetadata { + gnu_long_name: Some(name), + ..slices + } + } else { + slices + }; - self.emit_entry(header, content_size, Some(sparse_ctx)) + self.emit_entry(header, content_size, Some(sparse_ctx), slices) } /// Handle a GNU sparse entry (type 'S'). @@ -1227,6 +1216,7 @@ impl Parser { input: &'a [u8], header: &'a Header, size: u64, + slices: PendingMetadata<'a>, ) -> Result> { let gnu = header.try_as_gnu().ok_or(ParseError::SparseNotGnu)?; let real_size = gnu.real_size()?; @@ -1298,7 +1288,7 @@ impl Parser { ext_consumed, }; - self.emit_entry(header, size, Some(sparse_ctx)) + self.emit_entry(header, size, Some(sparse_ctx), slices) } fn emit_entry<'a>( @@ -1306,6 +1296,7 @@ impl Parser { header: &'a Header, size: u64, sparse: Option, + slices: PendingMetadata<'a>, ) -> Result> { // Start with header values let mut path: Cow<'a, [u8]> = Cow::Borrowed(header.path_bytes()); @@ -1335,13 +1326,13 @@ impl Parser { } // Apply GNU long name (overrides header + prefix) - if let Some(long_name) = self.pending.gnu_long_name.take() { - path = Cow::Owned(long_name); + if let Some(long_name) = slices.gnu_long_name { + path = Cow::Borrowed(long_name); } // Apply GNU long link - if let Some(long_link) = self.pending.gnu_long_link.take() { - link_target = Some(Cow::Owned(long_link)); + if let Some(long_link) = slices.gnu_long_link { + link_target = Some(Cow::Borrowed(long_link)); } else { let header_link = header.link_name_bytes(); if !header_link.is_empty() { @@ -1350,17 +1341,17 @@ impl Parser { } // Apply PAX extensions (highest priority) - let raw_pax = self.pending.pax_extensions.take(); + let raw_pax = slices.pax_extensions; // PAX sparse v0.0/v0.1 tracking. v0.0 uses repeated offset/numbytes // pairs; v0.1 uses a single comma-separated map string. let mut pax_sparse_map: Option> = None; let mut pax_sparse_real_size: Option = None; - let mut pax_sparse_name: Option> = None; + let mut pax_sparse_name: Option<&'a [u8]> = None; // v0.0: current offset waiting for its numbytes pair let mut pax_sparse_pending_offset: Option = None; - if let Some(ref pax) = raw_pax { + if let Some(pax) = raw_pax { let strict = self.limits.strict; let extensions = PaxExtensions::new(pax); @@ -1401,7 +1392,7 @@ impl Parser { limit: self.limits.max_path_len, }); } - path = Cow::Owned(value.to_vec()); + path = Cow::Borrowed(value); } PAX_LINKPATH => { if value.len() > self.limits.max_path_len { @@ -1410,7 +1401,7 @@ impl Parser { limit: self.limits.max_path_len, }); } - link_target = Some(Cow::Owned(value.to_vec())); + link_target = Some(Cow::Borrowed(value)); } PAX_SIZE => { if let Some(v) = parse_pax_u64(&ext, PAX_SIZE)? { @@ -1453,10 +1444,10 @@ impl Parser { } } PAX_UNAME => { - uname = Some(Cow::Owned(value.to_vec())); + uname = Some(Cow::Borrowed(value)); } PAX_GNAME => { - gname = Some(Cow::Owned(value.to_vec())); + gname = Some(Cow::Borrowed(value)); } // PAX sparse v0.0: repeated offset/numbytes pairs @@ -1535,7 +1526,7 @@ impl Parser { limit: self.limits.max_path_len, }); } - pax_sparse_name = Some(value.to_vec()); + pax_sparse_name = Some(value); } // Skip version fields — already handled in @@ -1544,10 +1535,8 @@ impl Parser { _ => { if let Some(attr_name) = key.strip_prefix(PAX_SCHILY_XATTR) { - xattrs.push(( - Cow::Owned(attr_name.as_bytes().to_vec()), - Cow::Owned(value.to_vec()), - )); + xattrs + .push((Cow::Borrowed(attr_name.as_bytes()), Cow::Borrowed(value))); } } } @@ -1556,12 +1545,9 @@ impl Parser { // Apply PAX sparse name override (highest priority for path). if let Some(name) = pax_sparse_name { - path = Cow::Owned(name); + path = Cow::Borrowed(name); } - // Clear pending metadata - self.pending.clear(); - // Normalize: empty optional byte fields are semantically equivalent to // absent. PAX overrides and GNU long link can set empty values that // would otherwise surface as `Some([])` instead of `None`. @@ -2157,7 +2143,7 @@ mod tests { assert_eq!(entry.xattrs.len(), 1); // Raw PAX data is preserved - let raw = entry.pax.as_ref().expect("pax should be Some"); + let raw = entry.pax.expect("pax should be Some"); let exts = PaxExtensions::new(raw); let keys: Vec<&str> = exts .filter_map(|e| e.ok()) From ea0007eb64d24efd9a0c0d2e03e5cc8ef46e2b92 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 21:48:03 +0000 Subject: [PATCH 2/7] parse: Use Cow<'static, str> in error types to avoid from_utf8_lossy Change InvalidPaxValue.value and InvalidPaxSparseMap from String to Cow<'static, str>. Static error messages use Cow::Borrowed (zero allocation), while the few cases that interpolate values (format!) use Cow::Owned. The from_utf8_lossy calls on non-UTF-8 PAX values are replaced with a static "" placeholder since the key alone identifies the problem. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 80 +++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 6e9794e..5281d9b 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -46,6 +46,7 @@ //! ``` use alloc::borrow::Cow; +use alloc::borrow::ToOwned; use alloc::format; use alloc::string::String; use alloc::vec::Vec; @@ -286,7 +287,7 @@ pub enum ParseError { /// A PAX sparse map field is malformed. #[error("invalid PAX sparse map: {0}")] - InvalidPaxSparseMap(String), + InvalidPaxSparseMap(Cow<'static, str>), /// A PAX extension value failed to parse in strict mode. #[error("invalid PAX {key} value: {value:?}")] @@ -294,7 +295,7 @@ pub enum ParseError { /// The PAX key (e.g. "uid", "size"). key: &'static str, /// The raw value string. - value: String, + value: Cow<'static, str>, }, /// Entry path is empty after applying all overrides (GNU long name, PAX path, etc.). @@ -619,7 +620,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MAJOR, - value: String::from_utf8_lossy(ext.value_bytes()).into(), + value: Cow::Borrowed(""), }) } }; @@ -629,7 +630,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MAJOR, - value: s.into(), + value: s.to_owned().into(), }) } } @@ -641,7 +642,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MINOR, - value: String::from_utf8_lossy(ext.value_bytes()).into(), + value: Cow::Borrowed(""), }) } }; @@ -651,7 +652,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MINOR, - value: s.into(), + value: s.to_owned().into(), }) } } @@ -1047,7 +1048,9 @@ impl Parser { // Extract sparse metadata from PAX extensions. let pax = slices .pax_extensions - .ok_or_else(|| ParseError::InvalidPaxSparseMap("missing PAX extensions".into()))?; + .ok_or(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "missing PAX extensions", + )))?; let strict = self.limits.strict; let mut real_size = None; @@ -1067,7 +1070,7 @@ impl Parser { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_REALSIZE, - value: String::from_utf8_lossy(ext.value_bytes()).into(), + value: Cow::Borrowed(""), }) } }; @@ -1077,7 +1080,7 @@ impl Parser { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_REALSIZE, - value: s.into(), + value: s.to_owned().into(), }) } } @@ -1089,8 +1092,9 @@ impl Parser { } } - let real_size = real_size - .ok_or_else(|| ParseError::InvalidPaxSparseMap("missing GNU.sparse.realsize".into()))?; + let real_size = real_size.ok_or(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "missing GNU.sparse.realsize", + )))?; // The sparse map data starts right after the header (at offset // HEADER_SIZE within the input). We need to parse it without @@ -1113,16 +1117,16 @@ impl Parser { let s = match core::str::from_utf8(line) { Ok(s) => s, Err(_) => { - return Some(Err(ParseError::InvalidPaxSparseMap( - "non-UTF8 in sparse map".into(), - ))) + return Some(Err(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "non-UTF8 in sparse map", + )))) } }; match s.parse::() { Ok(v) => Some(Ok(v)), - Err(_) => Some(Err(ParseError::InvalidPaxSparseMap(format!( - "invalid decimal: {s:?}" - )))), + Err(_) => Some(Err(ParseError::InvalidPaxSparseMap( + format!("invalid decimal: {s:?}").into(), + ))), } }; @@ -1180,9 +1184,11 @@ impl Parser { // The remaining content size is the original size minus the // sparse map prefix (including padding). - let content_size = size.checked_sub(map_size as u64).ok_or_else(|| { - ParseError::InvalidPaxSparseMap("sparse map prefix larger than entry size".into()) - })?; + let content_size = + size.checked_sub(map_size as u64) + .ok_or(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "sparse map prefix larger than entry size", + )))?; let sparse_ctx = SparseContext { sparse_map, @@ -1365,7 +1371,7 @@ impl Parser { Err(_) => { return Err(ParseError::InvalidPaxValue { key, - value: String::from_utf8_lossy(ext.value_bytes()).into(), + value: Cow::Borrowed(""), }) } }; @@ -1374,7 +1380,7 @@ impl Parser { Err(_) if !strict => Ok(None), Err(_) => Err(ParseError::InvalidPaxValue { key, - value: s.into(), + value: s.to_owned().into(), }), } }; @@ -1427,7 +1433,7 @@ impl Parser { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_MTIME, - value: String::from_utf8_lossy(value).into(), + value: Cow::Borrowed(""), }) } }; @@ -1438,7 +1444,7 @@ impl Parser { Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_MTIME, - value: s.into(), + value: s.to_owned().into(), }) } } @@ -1477,17 +1483,17 @@ impl Parser { Ok(s) => s, Err(_) if !strict => continue, Err(_) => { - return Err(ParseError::InvalidPaxSparseMap( - "non-UTF8 sparse map".into(), - )) + return Err(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "non-UTF8 sparse map", + ))) } }; let mut map = Vec::new(); let parts: Vec<&str> = s.split(',').filter(|p| !p.is_empty()).collect(); if parts.len() % 2 != 0 { - return Err(ParseError::InvalidPaxSparseMap( - "odd number of values in GNU.sparse.map".into(), - )); + return Err(ParseError::InvalidPaxSparseMap(Cow::Borrowed( + "odd number of values in GNU.sparse.map", + ))); } for pair in parts.chunks(2) { if map.len() >= self.limits.max_sparse_entries { @@ -1497,16 +1503,14 @@ impl Parser { }); } let offset = pair[0].parse::().map_err(|_| { - ParseError::InvalidPaxSparseMap(format!( - "invalid offset: {:?}", - pair[0] - )) + ParseError::InvalidPaxSparseMap( + format!("invalid offset: {:?}", pair[0]).into(), + ) })?; let length = pair[1].parse::().map_err(|_| { - ParseError::InvalidPaxSparseMap(format!( - "invalid length: {:?}", - pair[1] - )) + ParseError::InvalidPaxSparseMap( + format!("invalid length: {:?}", pair[1]).into(), + ) })?; map.push(SparseEntry { offset, length }); } From 1f04dc6de8261e7bfaf4fdc9d975497305622746 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 21:48:23 +0000 Subject: [PATCH 3/7] parse: Consolidate metadata limits into max_metadata_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace max_pax_size, max_gnu_long_size, and the previous max_path_len with two cleaner limits: - max_metadata_size: u32 — aggregate budget for all extension data (PAX headers + GNU long name/link) per entry. Default 1 MiB. Consolidates PaxTooLarge/GnuLongTooLarge into MetadataTooLarge. - max_path_len: Option — optional filesystem-level path length check. None by default (we're a parser, not a filesystem). Callers extracting to disk should set this to libc::PATH_MAX or equivalent. Also add Limits::check_path_len() helper and track running metadata size in PendingMetadata. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 195 +++++++++++++++++------------------------ tests/stream_parser.rs | 73 ++++++--------- 2 files changed, 109 insertions(+), 159 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 5281d9b..596aaf2 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -82,37 +82,36 @@ use crate::{ /// /// // Customize limits /// let strict_limits = Limits { -/// max_path_len: 1024, -/// max_pax_size: 64 * 1024, +/// max_metadata_size: 64 * 1024, +/// // Set to libc::PATH_MAX when extracting to disk +/// max_path_len: Some(4096), /// ..Default::default() /// }; /// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct Limits { - /// Maximum path length in bytes. + /// Maximum total size of all extension metadata for a single entry, in bytes. /// - /// Applies to both file paths and link targets. Paths exceeding this - /// limit will cause a [`ParseError::PathTooLong`] error. - /// - /// Default: 4096 bytes (Linux PATH_MAX). - pub max_path_len: usize, - - /// Maximum size of PAX extended header data in bytes. - /// - /// This limits the total size of a single PAX 'x' entry's content. - /// PAX headers larger than this will cause a [`ParseError::PaxTooLarge`] error. + /// This is an aggregate budget: the combined size of PAX extended headers, + /// GNU long name, and GNU long link data for one file entry must not exceed + /// this limit. Exceeding it will cause a [`ParseError::MetadataTooLarge`] + /// error. /// /// Default: 1 MiB (1,048,576 bytes). - pub max_pax_size: u64, + pub max_metadata_size: u32, - /// Maximum size of GNU long name/link data in bytes. + /// Optional maximum path length in bytes. + /// + /// When set, paths and link targets exceeding this limit will cause a + /// [`ParseError::PathTooLong`] error. When `None`, no path length check + /// is performed (the default). /// - /// GNU 'L' (long name) and 'K' (long link) entries should only contain - /// a single path. Values exceeding this limit will cause a - /// [`ParseError::GnuLongTooLarge`] error. + /// Callers extracting to a real filesystem should set this to + /// `libc::PATH_MAX` (4096 on Linux, 1024 on macOS) or the appropriate + /// platform constant. /// - /// Default: 4096 bytes. - pub max_gnu_long_size: u64, + /// Default: `None`. + pub max_path_len: Option, /// Maximum number of consecutive metadata entries before an actual entry. /// @@ -146,9 +145,8 @@ pub struct Limits { impl Default for Limits { fn default() -> Self { Self { - max_path_len: 4096, - max_pax_size: 1024 * 1024, // 1 MiB - max_gnu_long_size: 4096, + max_metadata_size: 1024 * 1024, // 1 MiB + max_path_len: None, max_pending_entries: 16, max_sparse_entries: 10_000, strict: true, @@ -170,9 +168,8 @@ impl Limits { #[must_use] pub fn permissive() -> Self { Self { - max_path_len: usize::MAX, - max_pax_size: u64::MAX, - max_gnu_long_size: u64::MAX, + max_metadata_size: u32::MAX, + max_path_len: None, max_pending_entries: usize::MAX, max_sparse_entries: 1_000_000, strict: false, @@ -186,14 +183,26 @@ impl Limits { #[must_use] pub fn strict() -> Self { Self { - max_path_len: 1024, - max_pax_size: 64 * 1024, // 64 KiB - max_gnu_long_size: 1024, + max_metadata_size: 64 * 1024, // 64 KiB + max_path_len: Some(4096), max_pending_entries: 8, max_sparse_entries: 1000, strict: true, } } + + /// Check a path length against the configured limit. + /// + /// Returns `Ok(())` if the path is within the limit (or no limit is set), + /// or `Err(ParseError::PathTooLong)` if it exceeds it. + pub fn check_path_len(&self, len: usize) -> Result<()> { + if let Some(limit) = self.max_path_len { + if len > limit as usize { + return Err(ParseError::PathTooLong { len, limit }); + } + } + Ok(()) + } } // ============================================================================ @@ -226,25 +235,19 @@ pub enum ParseError { /// Actual path length. len: usize, /// Configured limit. - limit: usize, + limit: u32, }, - /// PAX extended header exceeds configured maximum size. - #[error("PAX header exceeds limit: {size} bytes > {limit} bytes")] - PaxTooLarge { - /// Actual PAX header size. - size: u64, - /// Configured limit. - limit: u64, - }, - - /// GNU long name/link exceeds configured maximum size. - #[error("GNU long name/link exceeds limit: {size} bytes > {limit} bytes")] - GnuLongTooLarge { - /// Actual GNU long name/link size. + /// Extension metadata exceeds configured maximum size. + /// + /// The aggregate size of all extension data (PAX headers, GNU long + /// name/link) for a single entry exceeded [`Limits::max_metadata_size`]. + #[error("metadata exceeds limit: {size} bytes > {limit} bytes")] + MetadataTooLarge { + /// Total metadata size that would result. size: u64, /// Configured limit. - limit: u64, + limit: u32, }, /// Duplicate GNU long name entry without an intervening actual entry. @@ -576,6 +579,8 @@ struct PendingMetadata<'a> { gnu_long_link: Option<&'a [u8]>, pax_extensions: Option<&'a [u8]>, count: usize, + /// Running total of all extension data bytes accumulated so far. + metadata_size: u64, } /// Context for GNU sparse entries, passed from `handle_gnu_sparse` to @@ -881,11 +886,11 @@ impl Parser { // them here. Routing through emit_entry would fail because // global headers have arbitrary metadata fields. EntryType::XGlobalHeader => { - // Check size limit (same as local PAX headers) - if size > self.limits.max_pax_size { - return Err(ParseError::PaxTooLarge { + // Check size limit + if size > self.limits.max_metadata_size as u64 { + return Err(ParseError::MetadataTooLarge { size, - limit: self.limits.max_pax_size, + limit: self.limits.max_metadata_size, }); } @@ -957,25 +962,12 @@ impl Parser { }); } - // Check size limit - let max_size = match kind { - ExtensionKind::GnuLongName | ExtensionKind::GnuLongLink => { - self.limits.max_gnu_long_size - } - ExtensionKind::Pax => self.limits.max_pax_size, - }; - if size > max_size { - return Err(match kind { - ExtensionKind::GnuLongName | ExtensionKind::GnuLongLink => { - ParseError::GnuLongTooLarge { - size, - limit: max_size, - } - } - ExtensionKind::Pax => ParseError::PaxTooLarge { - size, - limit: max_size, - }, + // Check aggregate metadata size limit + let new_metadata_size = slices.metadata_size + size; + if new_metadata_size > self.limits.max_metadata_size as u64 { + return Err(ParseError::MetadataTooLarge { + size: new_metadata_size, + limit: self.limits.max_metadata_size, }); } @@ -1001,17 +993,13 @@ impl Parser { if let Some(trimmed) = data.strip_suffix(&[0]) { data = trimmed; } - if data.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: data.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(data.len())?; } - // Build new pending slices with the added extension data + // Build new pending metadata with the added extension data let mut new_slices = PendingMetadata { count: slices.count + 1, + metadata_size: new_metadata_size, ..slices }; match kind { @@ -1392,21 +1380,11 @@ impl Parser { match key { PAX_PATH => { - if value.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: value.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(value.len())?; path = Cow::Borrowed(value); } PAX_LINKPATH => { - if value.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: value.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(value.len())?; link_target = Some(Cow::Borrowed(value)); } PAX_SIZE => { @@ -1524,12 +1502,7 @@ impl Parser { } } PAX_GNU_SPARSE_NAME => { - if value.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: value.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(value.len())?; pax_sparse_name = Some(value); } @@ -1571,12 +1544,7 @@ impl Parser { } // Validate final path length - if path.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: path.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(path.len())?; let entry = ParsedEntry { header, @@ -1633,24 +1601,23 @@ mod tests { #[test] fn test_default_limits() { let limits = Limits::default(); - assert_eq!(limits.max_path_len, 4096); - assert_eq!(limits.max_pax_size, 1024 * 1024); - assert_eq!(limits.max_gnu_long_size, 4096); + assert_eq!(limits.max_metadata_size, 1024 * 1024); + assert_eq!(limits.max_path_len, None); assert_eq!(limits.max_pending_entries, 16); } #[test] fn test_permissive_limits() { let limits = Limits::permissive(); - assert_eq!(limits.max_path_len, usize::MAX); - assert_eq!(limits.max_pax_size, u64::MAX); + assert_eq!(limits.max_metadata_size, u32::MAX); + assert_eq!(limits.max_path_len, None); } #[test] fn test_strict_limits() { let limits = Limits::strict(); - assert!(limits.max_path_len < Limits::default().max_path_len); - assert!(limits.max_pax_size < Limits::default().max_pax_size); + assert_eq!(limits.max_path_len, Some(4096)); + assert!(limits.max_metadata_size < Limits::default().max_metadata_size); } #[test] @@ -2291,7 +2258,7 @@ mod tests { #[test] fn test_parser_global_pax_header_too_large() { - // Global PAX header exceeding max_pax_size should error + // Global PAX header exceeding max_metadata_size should error let large_value = "x".repeat(1000); let mut archive = Vec::new(); @@ -2303,13 +2270,13 @@ mod tests { archive.extend(zeroes(1024)); let limits = Limits { - max_pax_size: 100, + max_metadata_size: 100, ..Default::default() }; let mut parser = Parser::new(limits); let result = parser.parse(&archive); - assert!(matches!(result, Err(ParseError::PaxTooLarge { .. }))); + assert!(matches!(result, Err(ParseError::MetadataTooLarge { .. }))); } #[test] @@ -2607,13 +2574,13 @@ mod tests { archive.extend(zeroes(1024)); let limits = Limits { - max_gnu_long_size: 100, + max_metadata_size: 100, ..Default::default() }; let mut parser = Parser::new(limits); let result = parser.parse(&archive); - assert!(matches!(result, Err(ParseError::GnuLongTooLarge { .. }))); + assert!(matches!(result, Err(ParseError::MetadataTooLarge { .. }))); } #[test] @@ -2626,7 +2593,7 @@ mod tests { archive.extend(zeroes(1024)); let limits = Limits { - max_path_len: 100, + max_path_len: Some(100), ..Default::default() }; let mut parser = Parser::new(limits); @@ -2643,7 +2610,7 @@ mod tests { #[test] fn test_parser_pax_too_large() { - // Create a PAX header that exceeds the size limit + // Create a PAX header that exceeds the metadata size limit let large_value = "x".repeat(1000); let mut archive = Vec::new(); @@ -2652,13 +2619,13 @@ mod tests { archive.extend(zeroes(1024)); let limits = Limits { - max_pax_size: 100, + max_metadata_size: 100, ..Default::default() }; let mut parser = Parser::new(limits); let result = parser.parse(&archive); - assert!(matches!(result, Err(ParseError::PaxTooLarge { .. }))); + assert!(matches!(result, Err(ParseError::MetadataTooLarge { .. }))); } // ========================================================================= diff --git a/tests/stream_parser.rs b/tests/stream_parser.rs index fc437c5..0ed381c 100644 --- a/tests/stream_parser.rs +++ b/tests/stream_parser.rs @@ -24,6 +24,7 @@ struct PendingMetadata { gnu_long_link: Option>, pax_extensions: Option>, count: usize, + metadata_size: u64, } impl PendingMetadata { @@ -187,6 +188,7 @@ impl TarStreamParser { let gnu_long_link = self.pending.gnu_long_link.take(); let pax_extensions = self.pending.pax_extensions.take(); self.pending.count = 0; + self.pending.metadata_size = 0; let entry = self.resolve_entry_with_pending( gnu_long_name, @@ -251,23 +253,20 @@ impl TarStreamParser { if self.pending.gnu_long_name.is_some() { return Err(ParseError::DuplicateGnuLongName); } - if size > self.limits.max_gnu_long_size { - return Err(ParseError::GnuLongTooLarge { - size, - limit: self.limits.max_gnu_long_size, + let new_metadata_size = self.pending.metadata_size + size; + if new_metadata_size > self.limits.max_metadata_size as u64 { + return Err(ParseError::MetadataTooLarge { + size: new_metadata_size, + limit: self.limits.max_metadata_size, }); } let mut data = self.read_vec(size as usize)?; self.skip_bytes(padded_size - size)?; data.pop_if(|&mut x| x == 0); - if data.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: data.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(data.len())?; self.pending.gnu_long_name = Some(data); self.pending.count += 1; + self.pending.metadata_size = new_metadata_size; Ok(()) } @@ -275,23 +274,20 @@ impl TarStreamParser { if self.pending.gnu_long_link.is_some() { return Err(ParseError::DuplicateGnuLongLink); } - if size > self.limits.max_gnu_long_size { - return Err(ParseError::GnuLongTooLarge { - size, - limit: self.limits.max_gnu_long_size, + let new_metadata_size = self.pending.metadata_size + size; + if new_metadata_size > self.limits.max_metadata_size as u64 { + return Err(ParseError::MetadataTooLarge { + size: new_metadata_size, + limit: self.limits.max_metadata_size, }); } let mut data = self.read_vec(size as usize)?; self.skip_bytes(padded_size - size)?; data.pop_if(|&mut x| x == 0); - if data.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: data.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(data.len())?; self.pending.gnu_long_link = Some(data); self.pending.count += 1; + self.pending.metadata_size = new_metadata_size; Ok(()) } @@ -299,16 +295,18 @@ impl TarStreamParser { if self.pending.pax_extensions.is_some() { return Err(ParseError::DuplicatePaxHeader); } - if size > self.limits.max_pax_size { - return Err(ParseError::PaxTooLarge { - size, - limit: self.limits.max_pax_size, + let new_metadata_size = self.pending.metadata_size + size; + if new_metadata_size > self.limits.max_metadata_size as u64 { + return Err(ParseError::MetadataTooLarge { + size: new_metadata_size, + limit: self.limits.max_metadata_size, }); } let data = self.read_vec(size as usize)?; self.skip_bytes(padded_size - size)?; self.pending.pax_extensions = Some(data); self.pending.count += 1; + self.pending.metadata_size = new_metadata_size; Ok(()) } @@ -366,21 +364,11 @@ impl TarStreamParser { match key { "path" => { - if value.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: value.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(value.len())?; path = Cow::Owned(value.to_vec()); } "linkpath" => { - if value.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: value.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(value.len())?; link_target = Some(Cow::Owned(value.to_vec())); } "size" => { @@ -431,12 +419,7 @@ impl TarStreamParser { } } - if path.len() > self.limits.max_path_len { - return Err(ParseError::PathTooLong { - len: path.len(), - limit: self.limits.max_path_len, - }); - } + self.limits.check_path_len(path.len())?; Ok(ParsedEntry { header_bytes: &self.header_buf, @@ -867,7 +850,7 @@ fn test_path_too_long() { }); let limits = Limits { - max_path_len: 100, + max_path_len: Some(100), ..Default::default() }; let mut parser = TarStreamParser::new(Cursor::new(data), limits); @@ -891,13 +874,13 @@ fn test_gnu_long_too_large() { }); let limits = Limits { - max_gnu_long_size: 100, + max_metadata_size: 100, ..Default::default() }; let mut parser = TarStreamParser::new(Cursor::new(data), limits); let err = parser.next_entry().unwrap_err(); - assert!(matches!(err, ParseError::GnuLongTooLarge { .. })); + assert!(matches!(err, ParseError::MetadataTooLarge { .. })); } // ============================================================================= From ed97396615e61a75114aef037ea24fbcf5ee7b1e Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 21:48:46 +0000 Subject: [PATCH 4/7] parse: Remove Limits::strict(), move error leniency to Parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The strict/lenient toggle for PAX parse errors was a parser behavior flag, not a resource limit — it was misplaced on Limits. Move it to Parser as set_ignore_parsing_errors(bool), matching the pattern of set_allow_empty_path and set_verify_checksums. Remove Limits::strict() entirely. The defaults are already safe for untrusted input (1 MiB metadata cap, bounded pending/sparse counts). Callers who want tighter resource limits can set fields directly; callers who want lenient PAX parsing call set_ignore_parsing_errors. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 112 +++++++++++++++++++++++---------------------------- 1 file changed, 51 insertions(+), 61 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index 596aaf2..e5a7c79 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -81,7 +81,7 @@ use crate::{ /// let limits = Limits::default(); /// /// // Customize limits -/// let strict_limits = Limits { +/// let limits = Limits { /// max_metadata_size: 64 * 1024, /// // Set to libc::PATH_MAX when extracting to disk /// max_path_len: Some(4096), @@ -133,13 +133,6 @@ pub struct Limits { /// /// Default: 10000. pub max_sparse_entries: usize, - - /// When true, PAX extension values that fail to parse (invalid UTF-8, - /// invalid integer for numeric fields like `uid`, `gid`, `size`, `mtime`) - /// cause errors instead of being silently ignored. - /// - /// Default: `true`. - pub strict: bool, } impl Default for Limits { @@ -149,7 +142,6 @@ impl Default for Limits { max_path_len: None, max_pending_entries: 16, max_sparse_entries: 10_000, - strict: true, } } } @@ -172,22 +164,6 @@ impl Limits { max_path_len: None, max_pending_entries: usize::MAX, max_sparse_entries: 1_000_000, - strict: false, - } - } - - /// Create strict limits suitable for untrusted archives. - /// - /// This sets conservative limits to minimize resource consumption - /// from potentially malicious archives. - #[must_use] - pub fn strict() -> Self { - Self { - max_metadata_size: 64 * 1024, // 64 KiB - max_path_len: Some(4096), - max_pending_entries: 8, - max_sparse_entries: 1000, - strict: true, } } @@ -292,7 +268,7 @@ pub enum ParseError { #[error("invalid PAX sparse map: {0}")] InvalidPaxSparseMap(Cow<'static, str>), - /// A PAX extension value failed to parse in strict mode. + /// A PAX extension value failed to parse. #[error("invalid PAX {key} value: {value:?}")] InvalidPaxValue { /// The PAX key (e.g. "uid", "size"). @@ -605,23 +581,23 @@ impl PendingMetadata<'_> { /// /// Returns `Some((major, minor))` if `GNU.sparse.major` and /// `GNU.sparse.minor` are both present and parseable, `None` if -/// the keys are absent. In strict mode, malformed values produce -/// errors instead of being silently ignored. -fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { +/// the keys are absent. When `ignore_errors` is true, malformed values +/// are silently skipped instead of producing errors. +fn pax_sparse_version(pax: &[u8], ignore_errors: bool) -> Result> { let mut major = None; let mut minor = None; for ext in PaxExtensions::new(pax) { let ext = ext?; let key = match ext.key() { Ok(k) => k, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(e) => return Err(ParseError::from(e)), }; match key { PAX_GNU_SPARSE_MAJOR => { let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MAJOR, @@ -631,7 +607,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { }; match s.parse::() { Ok(v) => major = Some(v), - Err(_) if !strict => {} + Err(_) if ignore_errors => {} Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MAJOR, @@ -643,7 +619,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { PAX_GNU_SPARSE_MINOR => { let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MINOR, @@ -653,7 +629,7 @@ fn pax_sparse_version(pax: &[u8], strict: bool) -> Result> { }; match s.parse::() { Ok(v) => minor = Some(v), - Err(_) if !strict => {} + Err(_) if ignore_errors => {} Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_MINOR, @@ -726,6 +702,13 @@ pub struct Parser { /// /// Default: `true`. verify_checksums: bool, + /// When true, malformed PAX extension values (invalid UTF-8, unparseable + /// integers for uid/gid/size/mtime) are silently skipped instead of + /// producing errors. This matches the behavior of many real-world tar + /// implementations. + /// + /// Default: `false`. + ignore_pax_errors: bool, } impl Parser { @@ -737,6 +720,7 @@ impl Parser { state: State::ReadHeader, allow_empty_path: false, verify_checksums: true, + ignore_pax_errors: false, } } @@ -759,6 +743,18 @@ impl Parser { self.verify_checksums = verify; } + /// Control whether malformed PAX extension values are silently ignored. + /// + /// When set to `true`, PAX values that fail to parse (invalid UTF-8, + /// unparseable integers for `uid`, `gid`, `size`, `mtime`) are skipped + /// instead of producing [`ParseError::InvalidPaxValue`] errors. This + /// matches the lenient behavior of many real-world tar implementations. + /// + /// Default: `false` (malformed values produce errors). + pub fn set_ignore_pax_errors(&mut self, ignore: bool) { + self.ignore_pax_errors = ignore; + } + /// Create a new parser with default limits. #[must_use] pub fn with_defaults() -> Self { @@ -919,7 +915,7 @@ impl Parser { // Check for PAX v1.0 sparse before emitting — it requires // reading the sparse map from the data stream. let sparse_version = if let Some(pax) = slices.pax_extensions { - pax_sparse_version(pax, self.limits.strict)? + pax_sparse_version(pax, self.ignore_pax_errors)? } else { None }; @@ -1040,21 +1036,21 @@ impl Parser { "missing PAX extensions", )))?; - let strict = self.limits.strict; + let ignore_errors = self.ignore_pax_errors; let mut real_size = None; let mut sparse_name = None; for ext in PaxExtensions::new(pax) { let ext = ext?; let key = match ext.key() { Ok(k) => k, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(e) => return Err(ParseError::from(e)), }; match key { PAX_GNU_SPARSE_REALSIZE | PAX_GNU_SPARSE_SIZE => { let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_REALSIZE, @@ -1064,7 +1060,7 @@ impl Parser { }; match s.parse::() { Ok(v) => real_size = Some(v), - Err(_) if !strict => {} + Err(_) if ignore_errors => {} Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_GNU_SPARSE_REALSIZE, @@ -1346,16 +1342,16 @@ impl Parser { let mut pax_sparse_pending_offset: Option = None; if let Some(pax) = raw_pax { - let strict = self.limits.strict; + let ignore_errors = self.ignore_pax_errors; let extensions = PaxExtensions::new(pax); - // Helper: parse a PAX numeric value, returning Err in strict mode - // or Ok(None) in lenient mode when the value is unparseable. + // Helper: parse a PAX numeric value, returning Ok(None) when + // ignore_pax_errors is set and the value is unparseable. let parse_pax_u64 = |ext: &crate::PaxExtension<'_>, key: &'static str| -> Result> { let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => return Ok(None), + Err(_) if ignore_errors => return Ok(None), Err(_) => { return Err(ParseError::InvalidPaxValue { key, @@ -1365,7 +1361,7 @@ impl Parser { }; match s.parse::() { Ok(v) => Ok(Some(v)), - Err(_) if !strict => Ok(None), + Err(_) if ignore_errors => Ok(None), Err(_) => Err(ParseError::InvalidPaxValue { key, value: s.to_owned().into(), @@ -1407,7 +1403,7 @@ impl Parser { // parse only the integer part. let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_MTIME, @@ -1418,7 +1414,7 @@ impl Parser { let int_part = s.split('.').next().unwrap_or(s); match int_part.parse::() { Ok(v) => mtime = v, - Err(_) if !strict => {} + Err(_) if ignore_errors => {} Err(_) => { return Err(ParseError::InvalidPaxValue { key: PAX_MTIME, @@ -1459,7 +1455,7 @@ impl Parser { PAX_GNU_SPARSE_MAP => { let s = match ext.value() { Ok(s) => s, - Err(_) if !strict => continue, + Err(_) if ignore_errors => continue, Err(_) => { return Err(ParseError::InvalidPaxSparseMap(Cow::Borrowed( "non-UTF8 sparse map", @@ -1614,10 +1610,10 @@ mod tests { } #[test] - fn test_strict_limits() { - let limits = Limits::strict(); - assert_eq!(limits.max_path_len, Some(4096)); - assert!(limits.max_metadata_size < Limits::default().max_metadata_size); + fn test_permissive_limits_relaxed() { + let limits = Limits::permissive(); + assert!(limits.max_metadata_size > Limits::default().max_metadata_size); + assert!(limits.max_pending_entries > Limits::default().max_pending_entries); } #[test] @@ -2861,11 +2857,8 @@ mod tests { #[test] fn test_lenient_ignores_invalid_pax_uid() { let archive = make_archive_with_pax("uid", b"notanumber"); - let limits = Limits { - strict: false, - ..Default::default() - }; - let mut parser = Parser::new(limits); + let mut parser = Parser::new(Limits::default()); + parser.set_ignore_pax_errors(true); let event = parser.parse(&archive).unwrap(); match event { ParseEvent::Entry { entry, .. } => { @@ -2879,11 +2872,8 @@ mod tests { #[test] fn test_lenient_ignores_invalid_pax_size() { let archive = make_archive_with_pax("size", b"xyz"); - let limits = Limits { - strict: false, - ..Default::default() - }; - let mut parser = Parser::new(limits); + let mut parser = Parser::new(Limits::default()); + parser.set_ignore_pax_errors(true); let event = parser.parse(&archive).unwrap(); match event { ParseEvent::Entry { entry, .. } => { From 4fee1e74ca8ca4833cb318243b6a2f61e01eee2d Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 22:15:21 +0000 Subject: [PATCH 5/7] ci: Use Justfile as single source of truth for CI commands Replace inline cargo commands in GHA workflows with `just ci`, `just fuzz-all`, and `just generate-corpus`. This eliminates duplication between the Justfile and the workflow definitions. Also add `cargo test --no-default-features` to the `unit` target and remove `fuzz-all` from the `ci` target (fuzzing runs as a separate GHA job since it requires a nightly toolchain). Assisted-by: OpenCode (Claude Opus) --- .github/workflows/ci.yaml | 29 ++++++++-------------------- .github/workflows/fuzz-extended.yaml | 16 ++++----------- Justfile | 5 +++-- 3 files changed, 15 insertions(+), 35 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index eeaeaf0..857e5e0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,16 +42,12 @@ jobs: uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy + - name: Install just + uses: extractions/setup-just@v3 - name: Cache Dependencies uses: Swatinem/rust-cache@v2 - - name: cargo fmt (check) - run: cargo fmt -- --check -l - - name: Build - run: cargo build - - name: Test - run: cargo test -- --nocapture --quiet - - name: cargo clippy - run: cargo clippy -- -D warnings + - name: CI (check + test) + run: just ci msrv: name: MSRV @@ -92,22 +88,13 @@ jobs: uses: dtolnay/rust-toolchain@nightly - name: Install cargo-fuzz run: cargo install cargo-fuzz --locked + - name: Install just + uses: extractions/setup-just@v3 - name: Cache Dependencies uses: Swatinem/rust-cache@v2 with: workspaces: fuzz - name: Generate seed corpus - run: cargo run --manifest-path fuzz/Cargo.toml --bin generate-corpus + run: just generate-corpus - name: Fuzz all targets (2 minutes each) - run: | - mkdir -p fuzz-logs - for target in $(cargo +nightly fuzz list); do - echo "--- Fuzzing $target (2 min) ---" - cargo +nightly fuzz run "$target" -- -max_total_time=120 \ - > "fuzz-logs/$target.log" 2>&1 \ - && echo " $target: OK" \ - || { echo "::error::Fuzzer $target failed"; cat "fuzz-logs/$target.log"; exit 1; } - # Print final stats line - tail -1 "fuzz-logs/$target.log" - done - working-directory: fuzz + run: just fuzz-all diff --git a/.github/workflows/fuzz-extended.yaml b/.github/workflows/fuzz-extended.yaml index 9a2bb3c..d4600ac 100644 --- a/.github/workflows/fuzz-extended.yaml +++ b/.github/workflows/fuzz-extended.yaml @@ -24,21 +24,13 @@ jobs: uses: dtolnay/rust-toolchain@nightly - name: Install cargo-fuzz run: cargo install cargo-fuzz --locked + - name: Install just + uses: extractions/setup-just@v3 - name: Cache Dependencies uses: Swatinem/rust-cache@v2 with: workspaces: fuzz - name: Generate seed corpus - run: cargo run --manifest-path fuzz/Cargo.toml --bin generate-corpus + run: just generate-corpus - name: Fuzz all targets (15 minutes each) - run: | - mkdir -p fuzz-logs - for target in $(cargo +nightly fuzz list); do - echo "--- Fuzzing $target (15 min) ---" - cargo +nightly fuzz run "$target" -- -max_total_time=900 \ - > "fuzz-logs/$target.log" 2>&1 \ - && echo " $target: OK" \ - || { echo "::error::Fuzzer $target failed"; cat "fuzz-logs/$target.log"; exit 1; } - tail -1 "fuzz-logs/$target.log" - done - working-directory: fuzz + run: just fuzz-all 900 diff --git a/Justfile b/Justfile index 8b65425..6c380a7 100644 --- a/Justfile +++ b/Justfile @@ -17,6 +17,7 @@ unit: else \ cargo test; \ fi + cargo test --no-default-features # Run cross-language interop tests (requires python3, go) interop: @@ -27,8 +28,8 @@ interop: # Run all tests test-all: unit interop -# Full CI check (format, lint, test, fuzz) -ci: check unit fuzz-all +# CI check (format, lint, test); see also fuzz-all +ci: check unit # Run Kani formal verification proofs (install: cargo install --locked kani-verifier && cargo kani setup) kani: From c680a6b6869c87799abda5a1a0b2daa4707fb849 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 22:29:36 +0000 Subject: [PATCH 6/7] parse: Fix addition overflow in add_consumed for chained extensions The fuzzer found that chained extension headers (GNU long name/link, PAX) with large declared sizes can cause an arithmetic overflow in add_consumed() when the recursive parse_header calls unwind. Each layer adds total_size (HEADER_SIZE + padded_size) to the consumed or min_bytes field, and with Limits::permissive() allowing metadata up to u32::MAX, two such additions can overflow usize. Fix by using saturating_add instead of plain +. A saturated value (usize::MAX) will always exceed the input length, so callers correctly treat it as insufficient data. Crash: parse.rs:413 panic_const_add_overflow in GlobalExtensions variant, found by extended fuzzing on commit 518dcfb. Assisted-by: OpenCode (Claude Opus) --- src/parse.rs | 70 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index e5a7c79..a4d3541 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -390,10 +390,10 @@ impl<'a> ParseEvent<'a> { fn add_consumed(self, n: usize) -> Self { match self { ParseEvent::NeedData { min_bytes } => ParseEvent::NeedData { - min_bytes: min_bytes + n, + min_bytes: min_bytes.saturating_add(n), }, ParseEvent::Entry { consumed, entry } => ParseEvent::Entry { - consumed: consumed + n, + consumed: consumed.saturating_add(n), entry, }, ParseEvent::SparseEntry { @@ -402,17 +402,17 @@ impl<'a> ParseEvent<'a> { sparse_map, real_size, } => ParseEvent::SparseEntry { - consumed: consumed + n, + consumed: consumed.saturating_add(n), entry, sparse_map, real_size, }, ParseEvent::GlobalExtensions { consumed, pax_data } => ParseEvent::GlobalExtensions { - consumed: consumed + n, + consumed: consumed.saturating_add(n), pax_data, }, ParseEvent::End { consumed } => ParseEvent::End { - consumed: consumed + n, + consumed: consumed.saturating_add(n), }, } } @@ -4007,4 +4007,64 @@ mod tests { } } } + + /// Regression test: `add_consumed` must not overflow when chained + /// extension headers declare very large sizes. + /// + /// With `Limits::permissive()` (`max_metadata_size = u32::MAX`), + /// extension headers can declare sizes close to `u32::MAX`. When + /// `handle_extension` recurses and the inner call returns `NeedData`, + /// `add_consumed(total_size)` is applied on unwind at each level. + /// Before the fix, `min_bytes + n` used plain `+` and could overflow + /// `usize` (especially on 32-bit targets). The fix uses + /// `saturating_add`. This test verifies the parser returns `NeedData` + /// (or an error) without panicking. + #[test] + fn test_add_consumed_no_overflow() { + // First extension: a complete GNU long name ('L') with a small + // payload so the parser can fully consume it and recurse. + let long_name = b"a]long/path".to_vec(); + let gnu_entry = make_gnu_long_name(&long_name); + let first_entry_size = gnu_entry.len(); // 1024 bytes (header + padded name) + + // Second extension: a PAX ('x') header that declares a size close + // to u32::MAX. We only provide the header—not the content—so the + // recursive call in handle_extension will return NeedData with + // min_bytes ≈ pax_size + 512. On unwind, add_consumed adds + // first_entry_size, giving min_bytes ≈ pax_size + 512 + 1024. + // On 32-bit this would overflow without saturating_add. + let pax_size: u64 = u32::MAX as u64 - long_name.len() as u64 - 512; + let pax_header = make_header(b"PaxHeaders/file", pax_size, b'x'); + + // Build input: complete GNU long name entry + PAX header only (no + // PAX content). + let mut input = Vec::with_capacity(first_entry_size + HEADER_SIZE); + input.extend_from_slice(&gnu_entry); + input.extend_from_slice(&pax_header); + + let mut parser = Parser::new(Limits::permissive()); + let result = parser.parse(&input); + + // The parser must not panic. It should return NeedData (because the + // PAX content is missing) or an error—both are acceptable. + match result { + Ok(ParseEvent::NeedData { min_bytes }) => { + // min_bytes must be at least the PAX entry's total_size + // (header + padded content), and must not have wrapped to + // a small value due to overflow. + assert!( + min_bytes > HEADER_SIZE, + "min_bytes should be large, got {min_bytes}" + ); + } + Err(_) => { + // An error (e.g. metadata too large) is also acceptable; + // the important thing is no panic from arithmetic overflow. + } + other => panic!( + "Expected NeedData or Err for truncated extension chain, got {:?}", + other + ), + } + } } From dda63905a2f3748d30a4d1ddb53aed0576fb1225 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Wed, 4 Mar 2026 22:29:41 +0000 Subject: [PATCH 7/7] ci: Upload fuzz crash artifacts on failure When a fuzzer crashes, the crash-reproducing input and logs are now saved as GitHub Actions artifacts, making it possible to download and reproduce failures locally without re-running long fuzz sessions. Assisted-by: OpenCode (Claude Opus) --- .github/workflows/ci.yaml | 8 ++++++++ .github/workflows/fuzz-extended.yaml | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 857e5e0..3676d87 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -98,3 +98,11 @@ jobs: run: just generate-corpus - name: Fuzz all targets (2 minutes each) run: just fuzz-all + - name: Upload fuzz artifacts + if: failure() + uses: actions/upload-artifact@v4 + with: + name: fuzz-artifacts + path: | + fuzz/artifacts/ + target/fuzz-logs/ diff --git a/.github/workflows/fuzz-extended.yaml b/.github/workflows/fuzz-extended.yaml index d4600ac..fb72b06 100644 --- a/.github/workflows/fuzz-extended.yaml +++ b/.github/workflows/fuzz-extended.yaml @@ -34,3 +34,11 @@ jobs: run: just generate-corpus - name: Fuzz all targets (15 minutes each) run: just fuzz-all 900 + - name: Upload fuzz artifacts + if: failure() + uses: actions/upload-artifact@v4 + with: + name: fuzz-artifacts + path: | + fuzz/artifacts/ + target/fuzz-logs/