Skip to content
Closed
186 changes: 134 additions & 52 deletions fuzz/fuzz_targets/differential.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,63 +9,145 @@
#![no_main]

use libfuzzer_sys::fuzz_target;
use tar_core_testutil::{parse_tar_core, parse_tar_rs};
use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry};

/// Dump the raw 512-byte headers from the (post-fixup) data to stderr.
fn dump_headers(data: &[u8]) {
let mut offset = 0;
let mut i = 0;
while offset + 512 <= data.len() {
let block = &data[offset..offset + 512];
if block.iter().all(|&b| b == 0) {
eprintln!("block[{i}] @{offset}: <all zeros>");
offset += 512;
i += 1;
continue;
}
let header = tar_core::Header::from_bytes(block.try_into().unwrap());
eprintln!("block[{i}] @{offset}: {header:?}");
offset += 512;
i += 1;
}
}

/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
///
/// tar-core is intentionally more lenient than tar-rs in some cases (e.g.
/// all-null numeric fields are accepted as 0), so we only require that
/// tar-core parses *at least* as many entries as tar-rs and that those
/// entries match.
/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
///
/// tar-core is intentionally more lenient than tar-rs in some cases (e.g.
/// all-null numeric fields are accepted as 0), so we only require that
/// tar-core parses *at least* as many entries as tar-rs and that those
/// entries match.
fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) {
    let n_rs = tar_rs_entries.len();
    let n_core = tar_core_entries.len();

    // tar-core must never see *fewer* entries than tar-rs; dump full
    // diagnostics before panicking so the failing input is debuggable.
    if n_core < n_rs {
        eprintln!("entry count mismatch: tar-core={n_core} tar-rs={n_rs}");
        dump_headers(data);
        for (i, entry) in tar_rs_entries.iter().enumerate() {
            eprintln!("tar-rs [{i}]: {entry:?}");
        }
        for (i, entry) in tar_core_entries.iter().enumerate() {
            eprintln!("tar-core[{i}]: {entry:?}");
        }
        panic!("tar-core parsed fewer entries than tar-rs: tar-core={n_core} tar-rs={n_rs}");
    }

    // Pairwise-compare the common prefix; extra tar-core entries (from its
    // lenient parsing) are intentionally not checked.
    for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries.iter()).enumerate() {
        if rs == core {
            continue;
        }
        eprintln!("mismatch at entry {i}:");
        dump_headers(data);
        eprintln!(" tar-rs: {rs:?}");
        eprintln!(" tar-core: {core:?}");
        panic!("entry {i} differs between tar-rs and tar-core");
    }
}

/// Preprocess fuzz input to fix up tar header checksums.
///
/// Walks through 512-byte aligned blocks. For each non-zero block (potential
/// header), computes and sets a valid checksum. Then attempts to parse the
/// size field to skip over content blocks, advancing to the next header.
///
/// This dramatically improves fuzzing coverage by allowing the parser to get
/// past the checksum verification gate and exercise deeper parsing logic
/// (PAX extensions, GNU long name/link, sparse files, etc.).
/// Preprocess fuzz input to fix up tar header checksums.
///
/// Walks through 512-byte aligned blocks. For each non-zero block (potential
/// header), computes and sets a valid checksum. Then attempts to parse the
/// size field to skip over content blocks, advancing to the next header.
///
/// This dramatically improves fuzzing coverage by allowing the parser to get
/// past the checksum verification gate and exercise deeper parsing logic
/// (PAX extensions, GNU long name/link, sparse files, etc.).
fn fixup_checksums(data: &mut [u8]) {
    let mut offset = 0;
    while offset + 512 <= data.len() {
        // Skip zero blocks (end-of-archive markers).
        if data[offset..offset + 512].iter().all(|&b| b == 0) {
            offset += 512;
            continue;
        }

        let block = &mut data[offset..offset + 512];

        // The tar checksum is computed with the checksum field itself
        // (bytes 148..156) treated as spaces.
        block[148..156].fill(b' ');

        // Unsigned sum of all 512 bytes.
        let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum();

        // Encode as 7 octal digits + NUL terminator. The maximum possible
        // checksum (512 * 255 = 130560 = 0o376400) fits in 7 octal digits,
        // so the encoding is always exactly 8 bytes.
        let cksum_str = format!("{:07o}\0", checksum);
        block[148..156].copy_from_slice(&cksum_str.as_bytes()[..8]);

        // Try to parse the size field (bytes 124..136) so content blocks
        // are skipped rather than checksum-fixed as if they were headers.
        let size_field = &data[offset + 124..offset + 136];
        offset += 512;
        if let Some(size) = parse_octal_simple(size_field) {
            // Round up to the next 512-byte boundary in u64, using
            // checked_add so adversarial size values near u64::MAX cannot
            // overflow (debug panic) or truncate via an `as usize` cast on
            // 32-bit targets.
            if let Some(padded) = size.checked_add(511).map(|s| s & !511) {
                // Only skip when the content fits inside the input; the
                // cast back to usize is then lossless.
                if padded <= (data.len() - offset) as u64 {
                    offset += padded as usize;
                }
            }
        }
    }
}

/// Simple octal parser for the size field - doesn't need to handle base-256
/// since we're just trying to skip content. Returns None on any parse failure.
/// Simple octal parser for the size field - doesn't need to handle base-256
/// since we're just trying to skip content. Returns None on any parse failure.
///
/// Leading spaces are skipped; the digit run ends at the first space or NUL.
/// An empty digit run parses as 0 (matching all-blank tar size fields).
fn parse_octal_simple(bytes: &[u8]) -> Option<u64> {
    // Borrow subslices instead of collecting into a Vec — this function is
    // called once per header block, so avoiding the allocation is free.
    let start = bytes
        .iter()
        .position(|&b| b != b' ')
        .unwrap_or(bytes.len());
    let rest = &bytes[start..];
    let end = rest
        .iter()
        .position(|&b| b == b' ' || b == 0)
        .unwrap_or(rest.len());
    let digits = &rest[..end];

    if digits.is_empty() {
        return Some(0);
    }
    let s = core::str::from_utf8(digits).ok()?;
    u64::from_str_radix(s, 8).ok()
}

fuzz_target!(|data: &[u8]| {
if data.len() > 256 * 1024 {
return;
}

let tar_rs_entries = parse_tar_rs(data);
let tar_core_entries = parse_tar_core(data);

assert_eq!(
tar_core_entries.len(),
tar_rs_entries.len(),
"entry count mismatch: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len(),
);

for i in 0..tar_rs_entries.len() {
let rs = &tar_rs_entries[i];
let core = &tar_core_entries[i];

assert_eq!(
rs.path,
core.path,
"path mismatch at entry {i}: tar-rs={:?} tar-core={:?}",
String::from_utf8_lossy(&rs.path),
String::from_utf8_lossy(&core.path),
);
assert_eq!(rs.size, core.size, "size mismatch at entry {i}");
assert_eq!(
rs.entry_type, core.entry_type,
"entry_type mismatch at entry {i}"
);
assert_eq!(rs.mode, core.mode, "mode mismatch at entry {i}");
assert_eq!(rs.uid, core.uid, "uid mismatch at entry {i}");
assert_eq!(rs.gid, core.gid, "gid mismatch at entry {i}");
assert_eq!(rs.mtime, core.mtime, "mtime mismatch at entry {i}");
assert_eq!(
rs.link_target, core.link_target,
"link_target mismatch at entry {i}"
);
assert_eq!(rs.uname, core.uname, "uname mismatch at entry {i}");
assert_eq!(rs.gname, core.gname, "gname mismatch at entry {i}");
assert_eq!(
rs.dev_major, core.dev_major,
"dev_major mismatch at entry {i}"
);
assert_eq!(
rs.dev_minor, core.dev_minor,
"dev_minor mismatch at entry {i}"
);
assert_eq!(
rs.content, core.content,
"content mismatch at entry {i} (size={})",
rs.size,
);
assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}");
// 90% of the time, fix up checksums to exercise deeper parser logic.
// 10% of the time, pass raw bytes to test checksum validation itself.
let should_fixup = !data.is_empty() && data[0] % 10 != 0;

if should_fixup {
let mut data = data.to_vec();
fixup_checksums(&mut data);
let tar_rs_entries = parse_tar_rs(&data);
let tar_core_entries = parse_tar_core(&data);
compare_entries(&data, &tar_rs_entries, &tar_core_entries);
} else {
let tar_rs_entries = parse_tar_rs(data);
let tar_core_entries = parse_tar_core(data);
compare_entries(data, &tar_rs_entries, &tar_core_entries);
}
});
17 changes: 12 additions & 5 deletions fuzz/fuzz_targets/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ use tar_core::HEADER_SIZE;

/// Drive a parser to completion over `data`, checking invariants on each entry.
/// Returns normally on errors or NeedData — the point is that it must not panic.
fn run_parser(data: &[u8], limits: Limits) {
fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) {
let mut parser = Parser::new(limits);
parser.set_verify_checksums(verify_checksums);
let mut offset: usize = 0;

loop {
Expand Down Expand Up @@ -99,8 +100,14 @@ fn run_parser(data: &[u8], limits: Limits) {
}

fuzz_target!(|data: &[u8]| {
// Run with permissive limits (should accept anything that isn't structurally broken).
run_parser(data, Limits::permissive());
// Run with default limits (stricter — may error on oversized paths/pax, but must not panic).
run_parser(data, Limits::default());
// 90% of the time, skip checksum verification to exercise deeper parser
// logic (PAX extensions, GNU long name/link, sparse files, field parsing,
// etc.). Random fuzz input almost never has valid checksums, so without
// this the fuzzer would break immediately on every input.
//
// 10% of the time, verify checksums normally to test that code path too.
let skip_checksums = !data.is_empty() && data[0] % 10 != 0;

run_parser(data, Limits::permissive(), !skip_checksums);
run_parser(data, Limits::default(), !skip_checksums);
});
36 changes: 29 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1444,6 +1444,18 @@ impl OctU64 {
}
}

/// Test whether a byte is whitespace in the context of tar header fields.
///
/// This includes all bytes that `u8::is_ascii_whitespace()` recognizes
/// (HT, LF, FF, CR, space) **plus** vertical tab (0x0b). Rust's
/// `is_ascii_whitespace` follows the WHATWG definition which omits VT,
/// but real tar implementations (and Rust's `str::trim()`) treat it as
/// whitespace. Without this, fields like `"0000000\x0b"` would fail to
/// parse.
fn is_tar_whitespace(b: u8) -> bool {
    // HT, LF, VT, FF, CR, space — spelled out so VT (0x0b) is visibly
    // part of the set.
    matches!(b, b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ')
}

/// Parse an octal ASCII field into a u64.
///
/// Octal fields in tar headers are ASCII strings with optional leading
Expand All @@ -1456,17 +1468,27 @@ impl OctU64 {
/// Returns [`HeaderError::InvalidOctal`] if the field contains invalid
/// characters (anything other than spaces, digits 0-7, or null bytes).
pub(crate) fn parse_octal(bytes: &[u8]) -> Result<u64> {
// Tar octal fields are padded with leading spaces and terminated by
// spaces or null bytes. Strip both ends to get the digit run.
let trimmed = bytes
// Tar octal fields are padded with leading spaces/nulls and terminated
// by spaces, tabs, or null bytes. We first truncate at the first null
// (matching how C-string fields work in tar), then trim whitespace from
// both ends to isolate the digit run.
//
// Note: we use `is_tar_whitespace` rather than `u8::is_ascii_whitespace`
// because the latter omits vertical tab (0x0b), which real tar
// implementations treat as whitespace (and Rust's `str::trim()` strips).
let truncated = match bytes.iter().position(|&b| b == 0) {
Some(i) => &bytes[..i],
None => bytes,
};
let trimmed = truncated
.iter()
.position(|&b| b != b' ')
.position(|&b| !is_tar_whitespace(b))
.map(|start| {
let rest = &bytes[start..];
let rest = &truncated[start..];
let end = rest
.iter()
.position(|&b| b == b' ' || b == b'\0')
.unwrap_or(rest.len());
.rposition(|&b| !is_tar_whitespace(b))
.map_or(0, |p| p + 1);
&rest[..end]
})
.unwrap_or(&[]);
Expand Down
55 changes: 47 additions & 8 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,12 @@ pub struct Parser {
/// When true, entries with empty paths are allowed through instead of
/// returning [`ParseError::EmptyPath`].
allow_empty_path: bool,
/// When false, header checksum verification is skipped. This is useful
/// for fuzzing, where random input almost never has valid checksums,
/// preventing the fuzzer from exercising deeper parser logic.
///
/// Default: `true`.
verify_checksums: bool,
}

impl Parser {
Expand All @@ -655,6 +661,7 @@ impl Parser {
state: State::ReadHeader,
pending: PendingMetadata::default(),
allow_empty_path: false,
verify_checksums: true,
}
}

Expand All @@ -664,6 +671,19 @@ impl Parser {
self.allow_empty_path = allow;
}

/// Control whether header checksums are verified during parsing.
///
/// When set to `false`, the parser skips [`Header::verify_checksum`]
/// calls, accepting headers regardless of their checksum field. This
/// is primarily useful for fuzz testing, where random input almost
/// never produces valid checksums, preventing the fuzzer from reaching
/// deeper parser code paths.
///
/// Default: `true`.
// NOTE(review): setter only updates the flag; it takes effect on the
// next header parsed — already-parsed entries are unaffected.
pub fn set_verify_checksums(&mut self, verify: bool) {
self.verify_checksums = verify;
}

/// Create a new parser with default limits.
#[must_use]
pub fn with_defaults() -> Self {
Expand Down Expand Up @@ -756,25 +776,36 @@ impl Parser {

// Parse header
let header = Header::from_bytes(header_bytes);
header.verify_checksum()?;
if self.verify_checksums {
header.verify_checksum()?;
}

let entry_type = header.entry_type();
let size = header.entry_size()?;
let padded_size = size
.checked_next_multiple_of(HEADER_SIZE as u64)
.ok_or(ParseError::InvalidSize(size))?;

// Handle metadata entry types
// Metadata entry types (GNU long name/link, PAX headers, GNU sparse)
// only make sense in archives that actually use those formats. A V7-
// style header whose type flag byte happens to be 'L' or 'x' should
// be treated as a regular entry, not as a metadata extension. This
// matches tar-rs's `is_recognized_header` guard.
let is_extension_format = header.is_gnu() || header.is_ustar();
match entry_type {
EntryType::GnuLongName => {
EntryType::GnuLongName if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName)
}
EntryType::GnuLongLink => {
EntryType::GnuLongLink if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink)
}
EntryType::XHeader => {
EntryType::XHeader if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::Pax)
}
// Global PAX headers (type 'g') are defined by POSIX
// independently of the UStar/GNU magic, so we always handle
// them here. Routing through emit_entry would fail because
// global headers have arbitrary metadata fields.
EntryType::XGlobalHeader => {
// Check size limit (same as local PAX headers)
if size > self.limits.max_pax_size {
Expand All @@ -800,7 +831,9 @@ impl Parser {
pax_data,
})
}
EntryType::GnuSparse => self.handle_gnu_sparse(input, header, size),
EntryType::GnuSparse if is_extension_format => {
self.handle_gnu_sparse(input, header, size)
}
_ => {
// Check for PAX v1.0 sparse before emitting — it requires
// reading the sparse map from the data stream.
Expand Down Expand Up @@ -1278,8 +1311,14 @@ impl Parser {
let mut mtime = header.mtime()?;
let mut entry_size = size;
let mut xattrs = Vec::new();
let mut uname: Option<Cow<'a, [u8]>> = header.username().map(Cow::Borrowed);
let mut gname: Option<Cow<'a, [u8]>> = header.groupname().map(Cow::Borrowed);
let mut uname: Option<Cow<'a, [u8]>> = header
.username()
.filter(|b| !b.is_empty())
.map(Cow::Borrowed);
let mut gname: Option<Cow<'a, [u8]>> = header
.groupname()
.filter(|b| !b.is_empty())
.map(Cow::Borrowed);

// Handle UStar prefix for path
if let Some(prefix) = header.prefix() {
Expand Down
Loading
Loading