diff --git a/fuzz/fuzz_targets/differential.rs b/fuzz/fuzz_targets/differential.rs index 5a8f879..e6a83b1 100644 --- a/fuzz/fuzz_targets/differential.rs +++ b/fuzz/fuzz_targets/differential.rs @@ -9,63 +9,145 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use tar_core_testutil::{parse_tar_core, parse_tar_rs}; +use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry}; + +/// Dump the raw 512-byte headers from the (post-fixup) data to stderr. +fn dump_headers(data: &[u8]) { + let mut offset = 0; + let mut i = 0; + while offset + 512 <= data.len() { + let block = &data[offset..offset + 512]; + if block.iter().all(|&b| b == 0) { + eprintln!("block[{i}] @{offset}: <all-zero block>"); + offset += 512; + i += 1; + continue; + } + let header = tar_core::Header::from_bytes(block.try_into().unwrap()); + eprintln!("block[{i}] @{offset}: {header:?}"); + offset += 512; + i += 1; + } +} + +/// Compare entries parsed by tar-rs and tar-core, asserting equivalence. +/// +/// tar-core is intentionally more lenient than tar-rs in some cases (e.g. +/// all-null numeric fields are accepted as 0), so we only require that +/// tar-core parses *at least* as many entries as tar-rs and that those +/// entries match. 
+fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) { + if tar_core_entries.len() < tar_rs_entries.len() { + eprintln!( + "entry count mismatch: tar-core={} tar-rs={}", + tar_core_entries.len(), + tar_rs_entries.len() + ); + dump_headers(data); + for (i, e) in tar_rs_entries.iter().enumerate() { + eprintln!("tar-rs [{i}]: {e:?}"); + } + for (i, e) in tar_core_entries.iter().enumerate() { + eprintln!("tar-core[{i}]: {e:?}"); + } + panic!( + "tar-core parsed fewer entries than tar-rs: tar-core={} tar-rs={}", + tar_core_entries.len(), + tar_rs_entries.len(), + ); + } + + for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries).enumerate() { + if rs != core { + eprintln!("mismatch at entry {i}:"); + dump_headers(data); + eprintln!(" tar-rs: {rs:?}"); + eprintln!(" tar-core: {core:?}"); + panic!("entry {i} differs between tar-rs and tar-core"); + } + } +} + +/// Preprocess fuzz input to fix up tar header checksums. +/// +/// Walks through 512-byte aligned blocks. For each non-zero block (potential +/// header), computes and sets a valid checksum. Then attempts to parse the +/// size field to skip over content blocks, advancing to the next header. +/// +/// This dramatically improves fuzzing coverage by allowing the parser to get +/// past the checksum verification gate and exercise deeper parsing logic +/// (PAX extensions, GNU long name/link, sparse files, etc.). 
+fn fixup_checksums(data: &mut [u8]) { + let mut offset = 0; + while offset + 512 <= data.len() { + let block = &data[offset..offset + 512]; + + // Skip zero blocks (end-of-archive markers) + if block.iter().all(|&b| b == 0) { + offset += 512; + continue; + } + + // Fill checksum field (bytes 148..156) with spaces + let block = &mut data[offset..offset + 512]; + block[148..156].fill(b' '); + + // Compute checksum: unsigned sum of all 512 bytes + let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum(); + + // Encode as 7 octal digits + NUL terminator + let cksum_str = format!("{:07o}\0", checksum); + let cksum_bytes = cksum_str.as_bytes(); + let copy_len = cksum_bytes.len().min(8); + block[148..148 + copy_len].copy_from_slice(&cksum_bytes[..copy_len]); + + offset += 512; + + // Try to parse the size field (bytes 124..136) to skip content blocks + let size_field = &data[offset - 512 + 124..offset - 512 + 136]; + if let Some(size) = parse_octal_simple(size_field) { + let padded = ((size as usize) + 511) & !511; + if offset + padded <= data.len() { + offset += padded; + } + } + } +} + +/// Simple octal parser for the size field - doesn't need to handle base-256 +/// since we're just trying to skip content. Returns None on any parse failure. 
+fn parse_octal_simple(bytes: &[u8]) -> Option<u64> { + let trimmed: Vec<u8> = bytes + .iter() + .copied() + .skip_while(|&b| b == b' ') + .take_while(|&b| b != b' ' && b != 0) + .collect(); + if trimmed.is_empty() { + return Some(0); + } + let s = core::str::from_utf8(&trimmed).ok()?; + u64::from_str_radix(s, 8).ok() +} fuzz_target!(|data: &[u8]| { if data.len() > 256 * 1024 { return; } - let tar_rs_entries = parse_tar_rs(data); - let tar_core_entries = parse_tar_core(data); - - assert_eq!( - tar_core_entries.len(), - tar_rs_entries.len(), - "entry count mismatch: tar-core={} tar-rs={}", - tar_core_entries.len(), - tar_rs_entries.len(), - ); - - for i in 0..tar_rs_entries.len() { - let rs = &tar_rs_entries[i]; - let core = &tar_core_entries[i]; - - assert_eq!( - rs.path, - core.path, - "path mismatch at entry {i}: tar-rs={:?} tar-core={:?}", - String::from_utf8_lossy(&rs.path), - String::from_utf8_lossy(&core.path), - ); - assert_eq!(rs.size, core.size, "size mismatch at entry {i}"); - assert_eq!( - rs.entry_type, core.entry_type, - "entry_type mismatch at entry {i}" - ); - assert_eq!(rs.mode, core.mode, "mode mismatch at entry {i}"); - assert_eq!(rs.uid, core.uid, "uid mismatch at entry {i}"); - assert_eq!(rs.gid, core.gid, "gid mismatch at entry {i}"); - assert_eq!(rs.mtime, core.mtime, "mtime mismatch at entry {i}"); - assert_eq!( - rs.link_target, core.link_target, - "link_target mismatch at entry {i}" - ); - assert_eq!(rs.uname, core.uname, "uname mismatch at entry {i}"); - assert_eq!(rs.gname, core.gname, "gname mismatch at entry {i}"); - assert_eq!( - rs.dev_major, core.dev_major, - "dev_major mismatch at entry {i}" - ); - assert_eq!( - rs.dev_minor, core.dev_minor, - "dev_minor mismatch at entry {i}" - ); - assert_eq!( - rs.content, core.content, - "content mismatch at entry {i} (size={})", - rs.size, - ); - assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}"); + // 90% of the time, fix up checksums to exercise deeper parser logic. 
+ // 10% of the time, pass raw bytes to test checksum validation itself. + let should_fixup = !data.is_empty() && data[0] % 10 != 0; + + if should_fixup { + let mut data = data.to_vec(); + fixup_checksums(&mut data); + let tar_rs_entries = parse_tar_rs(&data); + let tar_core_entries = parse_tar_core(&data); + compare_entries(&data, &tar_rs_entries, &tar_core_entries); + } else { + let tar_rs_entries = parse_tar_rs(data); + let tar_core_entries = parse_tar_core(data); + compare_entries(data, &tar_rs_entries, &tar_core_entries); } }); diff --git a/fuzz/fuzz_targets/parse.rs b/fuzz/fuzz_targets/parse.rs index 5b40afb..0a47457 100644 --- a/fuzz/fuzz_targets/parse.rs +++ b/fuzz/fuzz_targets/parse.rs @@ -14,8 +14,9 @@ use tar_core::HEADER_SIZE; /// Drive a parser to completion over `data`, checking invariants on each entry. /// Returns normally on errors or NeedData — the point is that it must not panic. -fn run_parser(data: &[u8], limits: Limits) { +fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) { let mut parser = Parser::new(limits); + parser.set_verify_checksums(verify_checksums); let mut offset: usize = 0; loop { @@ -99,8 +100,14 @@ fn run_parser(data: &[u8], limits: Limits) { } fuzz_target!(|data: &[u8]| { - // Run with permissive limits (should accept anything that isn't structurally broken). - run_parser(data, Limits::permissive()); - // Run with default limits (stricter — may error on oversized paths/pax, but must not panic). - run_parser(data, Limits::default()); + // 90% of the time, skip checksum verification to exercise deeper parser + // logic (PAX extensions, GNU long name/link, sparse files, field parsing, + // etc.). Random fuzz input almost never has valid checksums, so without + // this the fuzzer would break immediately on every input. + // + // 10% of the time, verify checksums normally to test that code path too. 
+ let skip_checksums = !data.is_empty() && data[0] % 10 != 0; + + run_parser(data, Limits::permissive(), !skip_checksums); + run_parser(data, Limits::default(), !skip_checksums); }); diff --git a/src/lib.rs b/src/lib.rs index 02d4079..c09f4d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1444,6 +1444,18 @@ impl OctU64 { } } +/// Test whether a byte is whitespace in the context of tar header fields. +/// +/// This includes all bytes that `u8::is_ascii_whitespace()` recognizes +/// (HT, LF, FF, CR, space) **plus** vertical tab (0x0b). Rust's +/// `is_ascii_whitespace` follows the WHATWG definition which omits VT, +/// but real tar implementations (and Rust's `str::trim()`) treat it as +/// whitespace. Without this, fields like `"0000000\x0b"` would fail to +/// parse. +fn is_tar_whitespace(b: u8) -> bool { + b.is_ascii_whitespace() || b == 0x0b +} + /// Parse an octal ASCII field into a u64. /// /// Octal fields in tar headers are ASCII strings with optional leading @@ -1456,17 +1468,27 @@ impl OctU64 { /// Returns [`HeaderError::InvalidOctal`] if the field contains invalid /// characters (anything other than spaces, digits 0-7, or null bytes). pub(crate) fn parse_octal(bytes: &[u8]) -> Result<u64, HeaderError> { - // Tar octal fields are padded with leading spaces and terminated by - // spaces or null bytes. Strip both ends to get the digit run. - let trimmed = bytes + // Tar octal fields are padded with leading spaces/nulls and terminated + // by spaces, tabs, or null bytes. We first truncate at the first null + // (matching how C-string fields work in tar), then trim whitespace from + // both ends to isolate the digit run. + // + // Note: we use `is_tar_whitespace` rather than `u8::is_ascii_whitespace` + // because the latter omits vertical tab (0x0b), which real tar + // implementations treat as whitespace (and Rust's `str::trim()` strips). 
+ let truncated = match bytes.iter().position(|&b| b == 0) { + Some(i) => &bytes[..i], + None => bytes, + }; + let trimmed = truncated .iter() - .position(|&b| b != b' ') + .position(|&b| !is_tar_whitespace(b)) .map(|start| { - let rest = &bytes[start..]; + let rest = &truncated[start..]; let end = rest .iter() - .position(|&b| b == b' ' || b == b'\0') - .unwrap_or(rest.len()); + .rposition(|&b| !is_tar_whitespace(b)) + .map_or(0, |p| p + 1); &rest[..end] }) .unwrap_or(&[]); diff --git a/src/parse.rs b/src/parse.rs index c9dcfde..56d4eee 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -644,6 +644,12 @@ pub struct Parser { /// When true, entries with empty paths are allowed through instead of /// returning [`ParseError::EmptyPath`]. allow_empty_path: bool, + /// When false, header checksum verification is skipped. This is useful + /// for fuzzing, where random input almost never has valid checksums, + /// preventing the fuzzer from exercising deeper parser logic. + /// + /// Default: `true`. + verify_checksums: bool, } impl Parser { @@ -655,6 +661,7 @@ impl Parser { state: State::ReadHeader, pending: PendingMetadata::default(), allow_empty_path: false, + verify_checksums: true, } } @@ -664,6 +671,19 @@ impl Parser { self.allow_empty_path = allow; } + /// Control whether header checksums are verified during parsing. + /// + /// When set to `false`, the parser skips [`Header::verify_checksum`] + /// calls, accepting headers regardless of their checksum field. This + /// is primarily useful for fuzz testing, where random input almost + /// never produces valid checksums, preventing the fuzzer from reaching + /// deeper parser code paths. + /// + /// Default: `true`. + pub fn set_verify_checksums(&mut self, verify: bool) { + self.verify_checksums = verify; + } + /// Create a new parser with default limits. 
#[must_use] pub fn with_defaults() -> Self { @@ -756,7 +776,9 @@ impl Parser { // Parse header let header = Header::from_bytes(header_bytes); - header.verify_checksum()?; + if self.verify_checksums { + header.verify_checksum()?; + } let entry_type = header.entry_type(); let size = header.entry_size()?; @@ -764,17 +786,26 @@ impl Parser { .checked_next_multiple_of(HEADER_SIZE as u64) .ok_or(ParseError::InvalidSize(size))?; - // Handle metadata entry types + // Metadata entry types (GNU long name/link, PAX headers, GNU sparse) + // only make sense in archives that actually use those formats. A V7- + // style header whose type flag byte happens to be 'L' or 'x' should + // be treated as a regular entry, not as a metadata extension. This + // matches tar-rs's `is_recognized_header` guard. + let is_extension_format = header.is_gnu() || header.is_ustar(); match entry_type { - EntryType::GnuLongName => { + EntryType::GnuLongName if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName) } - EntryType::GnuLongLink => { + EntryType::GnuLongLink if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink) } - EntryType::XHeader => { + EntryType::XHeader if is_extension_format => { self.handle_extension(input, size, padded_size, ExtensionKind::Pax) } + // Global PAX headers (type 'g') are defined by POSIX + // independently of the UStar/GNU magic, so we always handle + // them here. Routing through emit_entry would fail because + // global headers have arbitrary metadata fields. 
EntryType::XGlobalHeader => { // Check size limit (same as local PAX headers) if size > self.limits.max_pax_size { @@ -800,7 +831,9 @@ impl Parser { pax_data, }) } - EntryType::GnuSparse => self.handle_gnu_sparse(input, header, size), + EntryType::GnuSparse if is_extension_format => { + self.handle_gnu_sparse(input, header, size) + } _ => { // Check for PAX v1.0 sparse before emitting — it requires // reading the sparse map from the data stream. @@ -1278,8 +1311,14 @@ impl Parser { let mut mtime = header.mtime()?; let mut entry_size = size; let mut xattrs = Vec::new(); - let mut uname: Option<Cow<'_, [u8]>> = header.username().map(Cow::Borrowed); - let mut gname: Option<Cow<'_, [u8]>> = header.groupname().map(Cow::Borrowed); + let mut uname: Option<Cow<'_, [u8]>> = header + .username() + .filter(|b| !b.is_empty()) + .map(Cow::Borrowed); + let mut gname: Option<Cow<'_, [u8]>> = header + .groupname() + .filter(|b| !b.is_empty()) + .map(Cow::Borrowed); // Handle UStar prefix for path if let Some(prefix) = header.prefix() { diff --git a/testutil/src/lib.rs b/testutil/src/lib.rs index 13eb40d..a16eb80 100644 --- a/testutil/src/lib.rs +++ b/testutil/src/lib.rs @@ -48,6 +48,9 @@ pub fn parse_tar_core(data: &[u8]) -> Vec<OwnedEntry> { pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec<OwnedEntry> { let mut results = Vec::new(); let mut parser = Parser::new(limits); + // Allow entries with empty paths so we don't stop parsing early. + // We skip them below to match parse_tar_rs's behaviour. + parser.set_allow_empty_path(true); let mut offset = 0; loop { @@ -65,14 +68,15 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + break; + } + let content = data[offset..offset + read_size].to_vec(); let xattrs: Vec<(Vec<u8>, Vec<u8>)> = entry .xattrs @@ -80,6 +84,36 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { + break; + } + offset += padded; + continue; + } + + // Skip entries with empty paths to match parse_tar_rs. 
+ if entry.path.is_empty() { + let padded = (size as usize).next_multiple_of(HEADER_SIZE); + if offset.saturating_add(padded) > data.len() { + break; + } + offset += padded; + continue; + } + results.push(OwnedEntry { entry_type: entry.entry_type.to_byte(), path: entry.path.to_vec(), @@ -97,11 +131,7 @@ pub fn parse_tar_core_with_limits(data: &[u8], limits: Limits) -> Vec data.len() { break; @@ -137,14 +167,45 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec<OwnedEntry> { Err(_) => break, }; let header = entry.header().clone(); + let entry_type = header.entry_type().as_byte(); let path = entry.path_bytes().into_owned(); let size = entry.size(); - let entry_type = header.entry_type().as_byte(); - let mode = header.mode().unwrap_or(0); - let uid = header.uid().unwrap_or(0); - let gid = header.gid().unwrap_or(0); - let mtime = header.mtime().unwrap_or(0); + + // Require that numeric fields parse successfully. tar-core + // treats invalid numeric fields as hard errors, so if tar-rs + // silently defaulted to 0 we'd get false mismatches. + // + // These checks must come before ALL skip logic (metadata types, + // empty paths) so both parsers stop on the same invalid fields. + let Ok(mode) = header.mode() else { break }; + let Ok(uid) = header.uid() else { break }; + let Ok(gid) = header.gid() else { break }; + let Ok(mtime) = header.mtime() else { break }; + // tar-rs normally uses unwrap_or(None) for device fields, but + // tar-core propagates errors. Break here to match. + let Ok(dev_major) = header.device_major() else { + break; + }; + let Ok(dev_minor) = header.device_minor() else { + break; + }; + + // Skip metadata-only entry types that tar-core handles internally + // (GNU long name/link, PAX headers, global PAX headers). + match header.entry_type() { + tar::EntryType::GNULongName + | tar::EntryType::GNULongLink + | tar::EntryType::XHeader + | tar::EntryType::XGlobalHeader => continue, + _ => {} + } + + // tar-core rejects entries with empty paths; skip them here + // to match. 
+ if path.is_empty() { + continue; + } // entry.link_name_bytes() applies PAX linkpath and GNU long link // overrides, unlike header.link_name_bytes() which is raw. let link_target = entry @@ -191,9 +252,6 @@ pub fn parse_tar_rs(data: &[u8]) -> Vec<OwnedEntry> { break; } - let dev_major = header.device_major().unwrap_or(None); - let dev_minor = header.device_minor().unwrap_or(None); - results.push(OwnedEntry { entry_type, path,