From 04eda3a6929c54f73a2add3ffa410f060845c585 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 01:34:36 +0000 Subject: [PATCH 1/7] fuzz: Fix checksum coverage gap in parse and differential fuzzers The parse.rs and differential.rs fuzz targets were getting almost zero coverage of deeper parser logic (PAX extensions, GNU long name/link, sparse files, field parsing, etc.) because random fuzz input almost never has valid tar header checksums. The parser's verify_checksum() call at the top of parse_header() rejects ~100% of random inputs immediately, causing the fuzzer to break on every input before exercising any interesting code paths. For the parse.rs fuzzer, add a Parser::set_verify_checksums(bool) API that allows skipping checksum verification entirely. The fuzzer uses this ~90% of the time (determined by the first byte of input), letting it exercise all the parsing code paths beyond the checksum gate. The remaining 10% keeps checksum validation testing. For the differential fuzzer, since both tar-core and tar-rs must see identical data with valid checksums, use a fixup_checksums() approach that walks through 512-byte blocks and rewrites checksum fields in-place before passing to both parsers. It also parses the size field to skip over content blocks, so subsequent headers get fixed too. The roundtrip.rs fuzzer already uses EntryBuilder which produces valid checksums, so it does not need changes. 
Assisted-by: OpenCode (Claude claude-opus-4-6) --- fuzz/fuzz_targets/differential.rs | 101 ++++++++++++++++++++++++++---- fuzz/fuzz_targets/parse.rs | 17 +++-- src/parse.rs | 24 ++++++- 3 files changed, 123 insertions(+), 19 deletions(-) diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index 5a8f879..9be178d 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -9,16 +9,10 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use tar_core_testutil::{parse_tar_core, parse_tar_rs}; - -fuzz_target!(|data: &[u8]| { - if data.len() > 256 * 1024 { - return; - } - - let tar_rs_entries = parse_tar_rs(data); - let tar_core_entries = parse_tar_core(data); +use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry}; +/// Compare entries parsed by tar-rs and tar-core, asserting equivalence. +fn compare_entries(tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { assert_eq!( tar_core_entries.len(), tar_rs_entries.len(), @@ -27,10 +21,7 @@ fuzz_target!(|data: &[u8]| { tar_rs_entries.len(), ); - for i in 0..tar_rs_entries.len() { - let rs = &tar_rs_entries[i]; - let core = &tar_core_entries[i]; - + for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() { assert_eq!( rs.path, core.path, @@ -68,4 +59,88 @@ fuzz_target!(|data: &[u8]| { ); assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}"); } +} + +/// Preprocess fuzz input to fix up tar header checksums. +/// +/// Walks through 512-byte aligned blocks. For each non-zero block (potential +/// header), computes and sets a valid checksum. Then attempts to parse the +/// size field to skip over content blocks, advancing to the next header. +/// +/// This dramatically improves fuzzing coverage by allowing the parser to get +/// past the checksum verification gate and exercise deeper parsing logic +/// (PAX extensions, GNU long name/link, sparse files, etc.). 
+fn fixup_checksums(data: &mut [u8]) { + let mut offset = 0; + while offset + 512 <= data.len() { + let block = &data[offset..offset + 512]; + + // Skip zero blocks (end-of-archive markers) + if block.iter().all(|&b| b == 0) { + offset += 512; + continue; + } + + // Fill checksum field (bytes 148..156) with spaces + let block = &mut data[offset..offset + 512]; + block[148..156].fill(b' '); + + // Compute checksum: unsigned sum of all 512 bytes + let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum(); + + // Encode as 7 octal digits + NUL terminator + let cksum_str = format!("{:07o}\0", checksum); + let cksum_bytes = cksum_str.as_bytes(); + let copy_len = cksum_bytes.len().min(8); + block[148..148 + copy_len].copy_from_slice(&cksum_bytes[..copy_len]); + + offset += 512; + + // Try to parse the size field (bytes 124..136) to skip content blocks + let size_field = &data[offset - 512 + 124..offset - 512 + 136]; + if let Some(size) = parse_octal_simple(size_field) { + let padded = ((size as usize) + 511) & !511; + if offset + padded <= data.len() { + offset += padded; + } + } + } +} + +/// Simple octal parser for the size field - doesn't need to handle base-256 +/// since we're just trying to skip content. Returns None on any parse failure. +fn parse_octal_simple(bytes: &[u8]) -> Option<u64> { + let trimmed: Vec<u8> = bytes + .iter() + .copied() + .skip_while(|&b| b == b' ') + .take_while(|&b| b != b' ' && b != 0) + .collect(); + if trimmed.is_empty() { + return Some(0); + } + let s = core::str::from_utf8(&trimmed).ok()?; + u64::from_str_radix(s, 8).ok() +} + +fuzz_target!(|data: &[u8]| { + if data.len() > 256 * 1024 { + return; + } + + // 90% of the time, fix up checksums to exercise deeper parser logic. + // 10% of the time, pass raw bytes to test checksum validation itself. 
+ let should_fixup = !data.is_empty() && data[0] % 10 != 0; + + if should_fixup { + let mut data = data.to_vec(); + fixup_checksums(&mut data); + let tar_rs_entries = parse_tar_rs(&data); + let tar_core_entries = parse_tar_core(&data); + compare_entries(&tar_rs_entries, &tar_core_entries); + } else { + let tar_rs_entries = parse_tar_rs(data); + let tar_core_entries = parse_tar_core(data); + compare_entries(&tar_rs_entries, &tar_core_entries); + } }); diff --git a/fuzz/fuzz_targets/parse.rs b/fuzz/fuzz_targets/parse.rs index 5b40afb..0a47457 100644 --- a/fuzz/fuzz_targets/parse.rs +++ b/fuzz/fuzz_targets/parse.rs @@ -14,8 +14,9 @@ use tar_core::HEADER_SIZE; /// Drive a parser to completion over `data`, checking invariants on each entry. /// Returns normally on errors or NeedData — the point is that it must not panic. -fn run_parser(data: &[u8], limits: Limits) { +fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) { let mut parser = Parser::new(limits); + parser.set_verify_checksums(verify_checksums); let mut offset: usize = 0; loop { @@ -99,8 +100,14 @@ fn run_parser(data: &[u8], limits: Limits) { } fuzz_target!(|data: &[u8]| { - // Run with permissive limits (should accept anything that isn't structurally broken). - run_parser(data, Limits::permissive()); - // Run with default limits (stricter — may error on oversized paths/pax, but must not panic). - run_parser(data, Limits::default()); + // 90% of the time, skip checksum verification to exercise deeper parser + // logic (PAX extensions, GNU long name/link, sparse files, field parsing, + // etc.). Random fuzz input almost never has valid checksums, so without + // this the fuzzer would break immediately on every input. + // + // 10% of the time, verify checksums normally to test that code path too. 
+ let skip_checksums = !data.is_empty() && data[0] % 10 != 0; + + run_parser(data, Limits::permissive(), !skip_checksums); + run_parser(data, Limits::default(), !skip_checksums); }); diff --git a/src/parse.rs b/src/parse.rs index c9dcfde..65c0cb3 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -644,6 +644,12 @@ pub struct Parser { /// When true, entries with empty paths are allowed through instead of /// returning [`ParseError::EmptyPath`]. allow_empty_path: bool, + /// When false, header checksum verification is skipped. This is useful + /// for fuzzing, where random input almost never has valid checksums, + /// preventing the fuzzer from exercising deeper parser logic. + /// + /// Default: `true`. + verify_checksums: bool, } impl Parser { @@ -655,6 +661,7 @@ impl Parser { state: State::ReadHeader, pending: PendingMetadata::default(), allow_empty_path: false, + verify_checksums: true, } } @@ -664,6 +671,19 @@ impl Parser { self.allow_empty_path = allow; } + /// Control whether header checksums are verified during parsing. + /// + /// When set to `false`, the parser skips [`Header::verify_checksum`] + /// calls, accepting headers regardless of their checksum field. This + /// is primarily useful for fuzz testing, where random input almost + /// never produces valid checksums, preventing the fuzzer from reaching + /// deeper parser code paths. + /// + /// Default: `true`. + pub fn set_verify_checksums(&mut self, verify: bool) { + self.verify_checksums = verify; + } + /// Create a new parser with default limits. 
#[must_use] pub fn with_defaults() -> Self { @@ -756,7 +776,9 @@ impl Parser { // Parse header let header = Header::from_bytes(header_bytes); - header.verify_checksum()?; + if self.verify_checksums { + header.verify_checksum()?; + } let entry_type = header.entry_type(); let size = header.entry_size()?; From 116b2827294c68862570a3c3ce87d5ec88a1a31f Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 02:04:56 +0000 Subject: [PATCH 2/7] fuzz: Fix differential fuzzer to handle truncation and leniency differences The differential fuzzer was hitting false positives from two sources: 1. Truncated archives: tar-core's test harness pushed entries before verifying the full padded content was present. tar-rs refuses to yield entries whose content can't be fully read. Fix by checking that the entire padded content is available before pushing the entry. 2. Parsing leniency: tar-core intentionally accepts some inputs that tar-rs rejects (e.g. all-null numeric fields are parsed as 0, while tar-rs errors on them). Change the entry count assertion to only require tar-core parses at least as many entries as tar-rs, not exactly the same count. Found by running the improved checksum-aware fuzzers from the previous commit. Assisted-by: OpenCode (Claude claude-opus-4-6) --- fuzz/fuzz_targets/differential.rs | 12 ++++++++---- testutil/src/lib.rs | 28 ++++++++++++---------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index 9be178d..e99d68a 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -12,11 +12,15 @@ use libfuzzer_sys::fuzz_target; use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry}; /// Compare entries parsed by tar-rs and tar-core, asserting equivalence. +/// +/// tar-core is intentionally more lenient than tar-rs in some cases (e.g. 
+/// all-null numeric fields are accepted as 0), so we only require that +/// tar-core parses *at least* as many entries as tar-rs and that those +/// entries match. fn compare_entries(tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { - assert_eq!( - tar_core_entries.len(), - tar_rs_entries.len(), - "entry count mismatch: tar-core={} tar-rs={}", + assert!( + tar_core_entries.len() >= tar_rs_entries.len(), + "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", tar_core_entries.len(), tar_rs_entries.len(), ); diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index 13eb40d..e7f460a 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -65,14 +65,19 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + break; + } + + let read_size = size.min(MAX_CONTENT_READ) as usize; + let content = data[offset..offset + read_size].to_vec(); let xattrs: Vec<(Vec, Vec)> = entry .xattrs @@ -97,15 +102,6 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { - break; - } offset += padded; } Ok(ParseEvent::GlobalExtensions { consumed, .. }) => { From d4e9087e38dbf81fbc1c81281bcc28b900db6962 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 02:11:32 +0000 Subject: [PATCH 3/7] fuzz: Improve differential harness accuracy; fix parse_octal whitespace Multiple fixes found by running the improved checksum-aware fuzzers: parse_octal: Truncate at null first, then trim all ASCII whitespace (not just spaces) from both ends. This matches tar-rs's behavior of calling .trim() after null-truncation, fixing a differential where tar-core rejected fields with tab characters that tar-rs accepted. testutil parse_tar_core: Check that enough content data is present before yielding an entry, matching tar-rs's read_exact semantics. testutil parse_tar_rs: Skip metadata entry types (GNU long name/link, PAX headers) that tar-core handles internally. 
Require numeric fields to parse successfully instead of silently defaulting to 0. Skip entries with empty paths since tar-core rejects those. differential fuzzer: Dump raw header via Header's Debug impl on mismatch for easier diagnosis. Allow tar-core to parse more entries than tar-rs (it's intentionally more lenient for e.g. all-null fields). Assisted-by: OpenCode (Claude claude-opus-4-6) --- fuzz/fuzz_targets/differential.rs | 93 ++++++++++++++++--------------- src/lib.rs | 20 ++++--- testutil/src/lib.rs | 49 +++++++++++----- 3 files changed, 96 insertions(+), 66 deletions(-) diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index e99d68a..e6a83b1 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -11,57 +11,60 @@ use libfuzzer_sys::fuzz_target; use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry}; +/// Dump the raw 512-byte headers from the (post-fixup) data to stderr. +fn dump_headers(data: &[u8]) { + let mut offset = 0; + let mut i = 0; + while offset + 512 <= data.len() { + let block = &data[offset..offset + 512]; + if block.iter().all(|&b| b == 0) { + eprintln!("block[{i}] @{offset}: "); + offset += 512; + i += 1; + continue; + } + let header = tar_core::Header::from_bytes(block.try_into().unwrap()); + eprintln!("block[{i}] @{offset}: {header:?}"); + offset += 512; + i += 1; + } +} + /// Compare entries parsed by tar-rs and tar-core, asserting equivalence. /// /// tar-core is intentionally more lenient than tar-rs in some cases (e.g. /// all-null numeric fields are accepted as 0), so we only require that /// tar-core parses *at least* as many entries as tar-rs and that those /// entries match. 
-fn compare_entries(tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { - assert!( - tar_core_entries.len() >= tar_rs_entries.len(), - "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", - tar_core_entries.len(), - tar_rs_entries.len(), - ); - - for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() { - assert_eq!( - rs.path, - core.path, - "path mismatch at entry {i}: tar-rs={:?} tar-core={:?}", - String::from_utf8_lossy(&rs.path), - String::from_utf8_lossy(&core.path), - ); - assert_eq!(rs.size, core.size, "size mismatch at entry {i}"); - assert_eq!( - rs.entry_type, core.entry_type, - "entry_type mismatch at entry {i}" - ); - assert_eq!(rs.mode, core.mode, "mode mismatch at entry {i}"); - assert_eq!(rs.uid, core.uid, "uid mismatch at entry {i}"); - assert_eq!(rs.gid, core.gid, "gid mismatch at entry {i}"); - assert_eq!(rs.mtime, core.mtime, "mtime mismatch at entry {i}"); - assert_eq!( - rs.link_target, core.link_target, - "link_target mismatch at entry {i}" +fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { + if tar_core_entries.len() < tar_rs_entries.len() { + eprintln!( + "entry count mismatch: tar-core={} tar-rs={}", + tar_core_entries.len(), + tar_rs_entries.len() ); - assert_eq!(rs.uname, core.uname, "uname mismatch at entry {i}"); - assert_eq!(rs.gname, core.gname, "gname mismatch at entry {i}"); - assert_eq!( - rs.dev_major, core.dev_major, - "dev_major mismatch at entry {i}" - ); - assert_eq!( - rs.dev_minor, core.dev_minor, - "dev_minor mismatch at entry {i}" - ); - assert_eq!( - rs.content, core.content, - "content mismatch at entry {i} (size={})", - rs.size, + dump_headers(data); + for (i, e) in tar_rs_entries.iter().enumerate() { + eprintln!("tar-rs [{i}]: {e:?}"); + } + for (i, e) in tar_core_entries.iter().enumerate() { + eprintln!("tar-core[{i}]: {e:?}"); + } + panic!( + "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", + 
tar_core_entries.len(), + tar_rs_entries.len(), ); - assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}"); + } + + for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() { + if rs != core { + eprintln!("mismatch at entry {i}:"); + dump_headers(data); + eprintln!(" tar-rs: {rs:?}"); + eprintln!(" tar-core: {core:?}"); + panic!("entry {i} differs between tar-rs and tar-core"); + } } } @@ -141,10 +144,10 @@ fuzz_target!(|data: &[u8]| { fixup_checksums(&mut data); let tar_rs_entries = parse_tar_rs(&data); let tar_core_entries = parse_tar_core(&data); - compare_entries(&tar_rs_entries, &tar_core_entries); + compare_entries(&data, &tar_rs_entries, &tar_core_entries); } else { let tar_rs_entries = parse_tar_rs(data); let tar_core_entries = parse_tar_core(data); - compare_entries(&tar_rs_entries, &tar_core_entries); + compare_entries(data, &tar_rs_entries, &tar_core_entries); } }); diff --git a/src/lib.rs b/src/lib.rs index 02d4079..f50d93e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1456,17 +1456,23 @@ impl OctU64 { /// Returns [`HeaderError::InvalidOctal`] if the field contains invalid /// characters (anything other than spaces, digits 0-7, or null bytes). pub(crate) fn parse_octal(bytes: &[u8]) -> Result { - // Tar octal fields are padded with leading spaces and terminated by - // spaces or null bytes. Strip both ends to get the digit run. - let trimmed = bytes + // Tar octal fields are padded with leading spaces/nulls and terminated + // by spaces, tabs, or null bytes. We first truncate at the first null + // (matching how C-string fields work in tar), then trim ASCII + // whitespace from both ends to isolate the digit run. 
+ let truncated = match bytes.iter().position(|&b| b == 0) { + Some(i) => &bytes[..i], + None => bytes, + }; + let trimmed = truncated .iter() - .position(|&b| b != b' ') + .position(|&b| !b.is_ascii_whitespace()) .map(|start| { - let rest = &bytes[start..]; + let rest = &truncated[start..]; let end = rest .iter() - .position(|&b| b == b' ' || b == b'\0') - .unwrap_or(rest.len()); + .rposition(|&b| !b.is_ascii_whitespace()) + .map_or(0, |p| p + 1); &rest[..end] }) .unwrap_or(&[]); diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index e7f460a..7bca987 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -66,17 +66,13 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + // Require enough data for the content bytes we'll actually + // read. tar-rs reads `size` bytes (via read_exact) but + // does not require the padding to be present. + let read_size = size.min(MAX_CONTENT_READ) as usize; + if offset.saturating_add(read_size) > data.len() { break; } - - let read_size = size.min(MAX_CONTENT_READ) as usize; let content = data[offset..offset + read_size].to_vec(); let xattrs: Vec<(Vec, Vec)> = entry @@ -102,6 +98,11 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + break; + } offset += padded; } Ok(ParseEvent::GlobalExtensions { consumed, .. }) => { @@ -133,14 +134,34 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec { Err(_) => break, }; let header = entry.header().clone(); + let entry_type = header.entry_type().as_byte(); + + // Skip metadata-only entry types that tar-core handles internally + // (GNU long name/link, PAX headers, global PAX headers). 
+ match header.entry_type() { + tar::EntryType::GNULongName + | tar::EntryType::GNULongLink + | tar::EntryType::XHeader + | tar::EntryType::XGlobalHeader => continue, + _ => {} + } let path = entry.path_bytes().into_owned(); let size = entry.size(); - let entry_type = header.entry_type().as_byte(); - let mode = header.mode().unwrap_or(0); - let uid = header.uid().unwrap_or(0); - let gid = header.gid().unwrap_or(0); - let mtime = header.mtime().unwrap_or(0); + + // tar-core rejects entries with empty paths; skip them here + // to match. + if path.is_empty() { + continue; + } + + // Require that numeric fields parse successfully. tar-core + // treats invalid numeric fields as hard errors, so if tar-rs + // silently defaulted to 0 we'd get false mismatches. + let Ok(mode) = header.mode() else { break }; + let Ok(uid) = header.uid() else { break }; + let Ok(gid) = header.gid() else { break }; + let Ok(mtime) = header.mtime() else { break }; // entry.link_name_bytes() applies PAX linkpath and GNU long link // overrides, unlike header.link_name_bytes() which is raw. let link_target = entry From e657e285b8bfeb382df7be780b0d944d1d85059f Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 02:42:04 +0000 Subject: [PATCH 4/7] fuzz: Fix differential mismatches for V7-style metadata typeflags Fix several sources of differential fuzzer crashes between tar-core and tar-rs: Parser (src/parse.rs): Require recognized magic (is_gnu() || is_ustar()) before treating typeflags L/K/x/g/S as extension entries. A V7 header whose typeflag byte happens to be 'L' should be emitted as a regular entry, not consumed as a GNU long name extension. This matches tar-rs's `is_recognized_header` guard. Octal parsing (src/lib.rs): Add `is_tar_whitespace` helper that treats VT (0x0b) as whitespace, matching real tar implementations. Rust's `u8::is_ascii_whitespace` omits VT, causing fields like "0000000\x0b" to fail. 
Test harness (testutil/src/lib.rs): Three alignment fixes for `parse_tar_core`: - Call `set_allow_empty_path(true)` so parsing continues past entries with empty paths (previously it would error out early). - Skip entries with empty paths after parsing, matching `parse_tar_rs`. - Skip entries with metadata-only typeflags (L/K/x/g) that tar-core emits as regular entries when magic is unrecognized. `parse_tar_rs` already had an equivalent skip via its `continue` on these types. The root cause of the specific crash artifact: a fuzzed header had typeflag 'L' (GnuLongName) but no valid magic bytes. tar-core treated it as an extension and consumed it, while tar-rs treated it as a regular entry and skipped it. After these fixes, both parsers handle V7-style metadata typeflags consistently. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/lib.rs | 24 ++++++++++++++++++++---- src/parse.rs | 19 +++++++++++++------ testutil/src/lib.rs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f50d93e..c09f4d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1444,6 +1444,18 @@ impl OctU64 { } } +/// Test whether a byte is whitespace in the context of tar header fields. +/// +/// This includes all bytes that `u8::is_ascii_whitespace()` recognizes +/// (HT, LF, FF, CR, space) **plus** vertical tab (0x0b). Rust's +/// `is_ascii_whitespace` follows the WHATWG definition which omits VT, +/// but real tar implementations (and Rust's `str::trim()`) treat it as +/// whitespace. Without this, fields like `"0000000\x0b"` would fail to +/// parse. +fn is_tar_whitespace(b: u8) -> bool { + b.is_ascii_whitespace() || b == 0x0b +} + /// Parse an octal ASCII field into a u64. 
/// /// Octal fields in tar headers are ASCII strings with optional leading @@ -1458,20 +1470,24 @@ impl OctU64 { pub(crate) fn parse_octal(bytes: &[u8]) -> Result { // Tar octal fields are padded with leading spaces/nulls and terminated // by spaces, tabs, or null bytes. We first truncate at the first null - // (matching how C-string fields work in tar), then trim ASCII - // whitespace from both ends to isolate the digit run. + // (matching how C-string fields work in tar), then trim whitespace from + // both ends to isolate the digit run. + // + // Note: we use `is_tar_whitespace` rather than `u8::is_ascii_whitespace` + // because the latter omits vertical tab (0x0b), which real tar + // implementations treat as whitespace (and Rust's `str::trim()` strips). let truncated = match bytes.iter().position(|&b| b == 0) { Some(i) => &bytes[..i], None => bytes, }; let trimmed = truncated .iter() - .position(|&b| !b.is_ascii_whitespace()) + .position(|&b| !is_tar_whitespace(b)) .map(|start| { let rest = &truncated[start..]; let end = rest .iter() - .rposition(|&b| !b.is_ascii_whitespace()) + .rposition(|&b| !is_tar_whitespace(b)) .map_or(0, |p| p + 1); &rest[..end] }) diff --git a/src/parse.rs b/src/parse.rs index 65c0cb3..a426687 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -786,18 +786,23 @@ impl Parser { .checked_next_multiple_of(HEADER_SIZE as u64) .ok_or(ParseError::InvalidSize(size))?; - // Handle metadata entry types + // Metadata entry types (GNU long name/link, PAX headers, GNU sparse) + // only make sense in archives that actually use those formats. A V7- + // style header whose type flag byte happens to be 'L' or 'x' should + // be treated as a regular entry, not as a metadata extension. This + // matches tar-rs's `is_recognized_header` guard. 
+ let is_extension_format = header.is_gnu() || header.is_ustar(); match entry_type { - EntryType::GnuLongName => { + EntryType::GnuLongName if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName) } - EntryType::GnuLongLink => { + EntryType::GnuLongLink if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink) } - EntryType::XHeader => { + EntryType::XHeader if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::Pax) } - EntryType::XGlobalHeader => { + EntryType::XGlobalHeader if is_extension_format => { // Check size limit (same as local PAX headers) if size > self.limits.max_pax_size { return Err(ParseError::PaxTooLarge { @@ -822,7 +827,9 @@ impl Parser { pax_data, }) } - EntryType::GnuSparse => self.handle_gnu_sparse(input, header, size), + EntryType::GnuSparse if is_extension_format => { + self.handle_gnu_sparse(input, header, size) + } _ => { // Check for PAX v1.0 sparse before emitting — it requires // reading the sparse map from the data stream. diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index 7bca987..5ebe5a1 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -48,6 +48,9 @@ pub fn parse_tar_core(data: &[u8]) -> Vec { pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec { let mut results = Vec::new(); let mut parser = Parser::new(limits); + // Allow entries with empty paths so we don't stop parsing early. + // We skip them below to match parse_tar_rs's behaviour. + parser.set_allow_empty_path(true); let mut offset = 0; loop { @@ -81,6 +84,36 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + break; + } + offset += padded; + continue; + } + + // Skip entries with empty paths to match parse_tar_rs. 
+ if entry.path.is_empty() { + let padded = (size as usize).next_multiple_of(HEADER_SIZE); + if offset.saturating_add(padded) > data.len() { + break; + } + offset += padded; + continue; + } + results.push(OwnedEntry { entry_type: entry.entry_type.to_byte(), path: entry.path.to_vec(), From 29c762c5d75b5a2a8351003d814f418269ff54ff Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 03:02:38 +0000 Subject: [PATCH 5/7] parse: Filter empty uname/gname to None Headers with a null byte at position 0 in the uname or gname field produce empty slices from username()/groupname(). Treat these as None to match tar-rs behavior, which filters empty values via .filter(|b| !b.is_empty()). Found via differential fuzzing. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/parse.rs b/src/parse.rs index a426687..9d66f33 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1307,8 +1307,14 @@ impl Parser { let mut mtime = header.mtime()?; let mut entry_size = size; let mut xattrs = Vec::new(); - let mut uname: Option> = header.username().map(Cow::Borrowed); - let mut gname: Option> = header.groupname().map(Cow::Borrowed); + let mut uname: Option> = header + .username() + .filter(|b| !b.is_empty()) + .map(Cow::Borrowed); + let mut gname: Option> = header + .groupname() + .filter(|b| !b.is_empty()) + .map(Cow::Borrowed); // Handle UStar prefix for path if let Some(prefix) = header.prefix() { From 2772ee8868802f3a22c3ab31f80d35381c6f2ad2 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 03:15:06 +0000 Subject: [PATCH 6/7] parse: Always handle XGlobalHeader regardless of magic Global PAX headers (type 'g') are defined by POSIX independently of the UStar/GNU magic field. Routing them through emit_entry (the catch-all path for unrecognized-magic extensions) is wrong because global headers may have arbitrary metadata in their numeric fields. 
Remove the is_extension_format guard for XGlobalHeader so these are always emitted as GlobalExtensions events. Found via differential fuzzing. Assisted-by: OpenCode (Claude claude-opus-4-6) --- src/parse.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/parse.rs b/src/parse.rs index 9d66f33..56d4eee 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -802,7 +802,11 @@ impl Parser { EntryType::XHeader if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::Pax) } - EntryType::XGlobalHeader if is_extension_format => { + // Global PAX headers (type 'g') are defined by POSIX + // independently of the UStar/GNU magic, so we always handle + // them here. Routing through emit_entry would fail because + // global headers have arbitrary metadata fields. + EntryType::XGlobalHeader => { // Check size limit (same as local PAX headers) if size > self.limits.max_pax_size { return Err(ParseError::PaxTooLarge { From 918f62f5cf7ee0d3def15318ea8118e9b1b78a27 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Mon, 2 Mar 2026 03:15:14 +0000 Subject: [PATCH 7/7] testutil: Reorder field checks before skip logic in parse_tar_rs Move numeric field validation (mode, uid, gid, mtime, device_major, device_minor) before the metadata-type and empty-path skip logic. Previously, parse_tar_rs would skip metadata-type entries (L/K/x/g) before checking numeric fields, while tar-core's parser validates all fields eagerly in emit_entry. This caused false mismatches when an unrecognized-magic 'x'-typed block had garbage in its numeric fields: tar-rs skipped it, tar-core errored. Also replace the lenient unwrap_or(None) for device fields with strict break-on-error, matching tar-core's behavior. 
Assisted-by: OpenCode (Claude claude-opus-4-6) --- testutil/src/lib.rs | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index 5ebe5a1..a16eb80 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -169,6 +169,28 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec { let header = entry.header().clone(); let entry_type = header.entry_type().as_byte(); + let path = entry.path_bytes().into_owned(); + let size = entry.size(); + + // Require that numeric fields parse successfully. tar-core + // treats invalid numeric fields as hard errors, so if tar-rs + // silently defaulted to 0 we'd get false mismatches. + // + // These checks must come before ALL skip logic (metadata types, + // empty paths) so both parsers stop on the same invalid fields. + let Ok(mode) = header.mode() else { break }; + let Ok(uid) = header.uid() else { break }; + let Ok(gid) = header.gid() else { break }; + let Ok(mtime) = header.mtime() else { break }; + // tar-rs normally uses unwrap_or(None) for device fields, but + // tar-core propagates errors. Break here to match. + let Ok(dev_major) = header.device_major() else { + break; + }; + let Ok(dev_minor) = header.device_minor() else { + break; + }; + // Skip metadata-only entry types that tar-core handles internally // (GNU long name/link, PAX headers, global PAX headers). match header.entry_type() { @@ -179,22 +201,11 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec { _ => {} } - let path = entry.path_bytes().into_owned(); - let size = entry.size(); - // tar-core rejects entries with empty paths; skip them here // to match. if path.is_empty() { continue; } - - // Require that numeric fields parse successfully. tar-core - // treats invalid numeric fields as hard errors, so if tar-rs - // silently defaulted to 0 we'd get false mismatches. 
- let Ok(mode) = header.mode() else { break }; - let Ok(uid) = header.uid() else { break }; - let Ok(gid) = header.gid() else { break }; - let Ok(mtime) = header.mtime() else { break }; // entry.link_name_bytes() applies PAX linkpath and GNU long link // overrides, unlike header.link_name_bytes() which is raw. let link_target = entry @@ -241,9 +252,6 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec { break; } - let dev_major = header.device_major().unwrap_or(None); - let dev_minor = header.device_minor().unwrap_or(None); - results.push(OwnedEntry { entry_type, path,