Skip to content
Closed
186 changes: 134 additions & 52 deletions fuzz/fuzz_targets/differential.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,63 +9,145 @@
#![no_main]

use libfuzzer_sys::fuzz_target;
use tar_core_testutil::{parse_tar_core, parse_tar_rs};
use tar_core_testutil::{parse_tar_core, parse_tar_rs, OwnedEntry};

/// Dump the raw 512-byte headers from the (post-fixup) data to stderr.
fn dump_headers(data: &[u8]) {
let mut offset = 0;
let mut i = 0;
while offset + 512 <= data.len() {
let block = &data[offset..offset + 512];
if block.iter().all(|&b| b == 0) {
eprintln!("block[{i}] @{offset}: <all zeros>");
offset += 512;
i += 1;
continue;
}
let header = tar_core::Header::from_bytes(block.try_into().unwrap());
eprintln!("block[{i}] @{offset}: {header:?}");
offset += 512;
i += 1;
}
}

/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
///
/// tar-core is intentionally more lenient than tar-rs in some cases (e.g.
/// all-null numeric fields are accepted as 0), so we only require that
/// tar-core parses *at least* as many entries as tar-rs and that those
/// entries match.
/// Compare entries parsed by tar-rs and tar-core, asserting equivalence.
///
/// tar-core is intentionally more lenient than tar-rs in some cases (e.g.
/// all-null numeric fields are accepted as 0), so we only require that
/// tar-core parses *at least* as many entries as tar-rs and that those
/// entries match.
fn compare_entries(data: &[u8], tar_rs_entries: &[OwnedEntry], tar_core_entries: &[OwnedEntry]) {
    let n_rs = tar_rs_entries.len();
    let n_core = tar_core_entries.len();

    // tar-core must never see *fewer* entries than tar-rs; dump full
    // diagnostics before panicking so the failing input is debuggable.
    if n_core < n_rs {
        eprintln!("entry count mismatch: tar-core={n_core} tar-rs={n_rs}");
        dump_headers(data);
        for (i, entry) in tar_rs_entries.iter().enumerate() {
            eprintln!("tar-rs [{i}]: {entry:?}");
        }
        for (i, entry) in tar_core_entries.iter().enumerate() {
            eprintln!("tar-core[{i}]: {entry:?}");
        }
        panic!("tar-core parsed fewer entries than tar-rs: tar-core={n_core} tar-rs={n_rs}");
    }

    // Pairwise-compare the common prefix; extra tar-core entries (from its
    // lenient parsing) are intentionally not checked.
    for (i, (rs, core)) in tar_rs_entries.iter().zip(tar_core_entries.iter()).enumerate() {
        if rs == core {
            continue;
        }
        eprintln!("mismatch at entry {i}:");
        dump_headers(data);
        eprintln!(" tar-rs: {rs:?}");
        eprintln!(" tar-core: {core:?}");
        panic!("entry {i} differs between tar-rs and tar-core");
    }
}

/// Preprocess fuzz input to fix up tar header checksums.
///
/// Walks through 512-byte aligned blocks. For each non-zero block (potential
/// header), computes and sets a valid checksum. Then attempts to parse the
/// size field to skip over content blocks, advancing to the next header.
///
/// This dramatically improves fuzzing coverage by allowing the parser to get
/// past the checksum verification gate and exercise deeper parsing logic
/// (PAX extensions, GNU long name/link, sparse files, etc.).
/// Preprocess fuzz input to fix up tar header checksums.
///
/// Walks through 512-byte aligned blocks. For each non-zero block (potential
/// header), computes and sets a valid checksum. Then attempts to parse the
/// size field to skip over content blocks, advancing to the next header.
///
/// This dramatically improves fuzzing coverage by allowing the parser to get
/// past the checksum verification gate and exercise deeper parsing logic
/// (PAX extensions, GNU long name/link, sparse files, etc.).
fn fixup_checksums(data: &mut [u8]) {
    let mut offset = 0;
    while offset + 512 <= data.len() {
        // Skip zero blocks (end-of-archive markers).
        if data[offset..offset + 512].iter().all(|&b| b == 0) {
            offset += 512;
            continue;
        }

        let block = &mut data[offset..offset + 512];

        // The tar checksum is computed with the checksum field itself
        // (bytes 148..156) treated as spaces.
        block[148..156].fill(b' ');

        // Unsigned sum of all 512 bytes.
        let checksum: u64 = block.iter().map(|&b| u64::from(b)).sum();

        // Encode as 7 octal digits + NUL terminator. The maximum possible
        // checksum (512 * 255 = 130560 = 0o376400) fits in 7 octal digits,
        // so the encoding is always exactly 8 bytes.
        let cksum_str = format!("{:07o}\0", checksum);
        block[148..156].copy_from_slice(&cksum_str.as_bytes()[..8]);

        // Try to parse the size field (bytes 124..136) so content blocks
        // are skipped rather than checksum-fixed as if they were headers.
        let size_field = &data[offset + 124..offset + 136];
        offset += 512;
        if let Some(size) = parse_octal_simple(size_field) {
            // Round up to the next 512-byte boundary in u64, using
            // checked_add so adversarial size values near u64::MAX cannot
            // overflow (debug panic) or truncate via an `as usize` cast on
            // 32-bit targets.
            if let Some(padded) = size.checked_add(511).map(|s| s & !511) {
                // Only skip when the content fits inside the input; the
                // cast back to usize is then lossless.
                if padded <= (data.len() - offset) as u64 {
                    offset += padded as usize;
                }
            }
        }
    }
}

/// Simple octal parser for the size field - doesn't need to handle base-256
/// since we're just trying to skip content. Returns None on any parse failure.
/// Simple octal parser for the size field - doesn't need to handle base-256
/// since we're just trying to skip content. Returns None on any parse failure.
///
/// Leading spaces are skipped; the digit run ends at the first space or NUL.
/// An empty digit run parses as 0 (matching all-blank tar size fields).
fn parse_octal_simple(bytes: &[u8]) -> Option<u64> {
    // Borrow subslices instead of collecting into a Vec — this function is
    // called once per header block, so avoiding the allocation is free.
    let start = bytes
        .iter()
        .position(|&b| b != b' ')
        .unwrap_or(bytes.len());
    let rest = &bytes[start..];
    let end = rest
        .iter()
        .position(|&b| b == b' ' || b == 0)
        .unwrap_or(rest.len());
    let digits = &rest[..end];

    if digits.is_empty() {
        return Some(0);
    }
    let s = core::str::from_utf8(digits).ok()?;
    u64::from_str_radix(s, 8).ok()
}

fuzz_target!(|data: &[u8]| {
if data.len() > 256 * 1024 {
return;
}

let tar_rs_entries = parse_tar_rs(data);
let tar_core_entries = parse_tar_core(data);

assert_eq!(
tar_core_entries.len(),
tar_rs_entries.len(),
"entry count mismatch: tar-core={} tar-rs={}",
tar_core_entries.len(),
tar_rs_entries.len(),
);

for i in 0..tar_rs_entries.len() {
let rs = &tar_rs_entries[i];
let core = &tar_core_entries[i];

assert_eq!(
rs.path,
core.path,
"path mismatch at entry {i}: tar-rs={:?} tar-core={:?}",
String::from_utf8_lossy(&rs.path),
String::from_utf8_lossy(&core.path),
);
assert_eq!(rs.size, core.size, "size mismatch at entry {i}");
assert_eq!(
rs.entry_type, core.entry_type,
"entry_type mismatch at entry {i}"
);
assert_eq!(rs.mode, core.mode, "mode mismatch at entry {i}");
assert_eq!(rs.uid, core.uid, "uid mismatch at entry {i}");
assert_eq!(rs.gid, core.gid, "gid mismatch at entry {i}");
assert_eq!(rs.mtime, core.mtime, "mtime mismatch at entry {i}");
assert_eq!(
rs.link_target, core.link_target,
"link_target mismatch at entry {i}"
);
assert_eq!(rs.uname, core.uname, "uname mismatch at entry {i}");
assert_eq!(rs.gname, core.gname, "gname mismatch at entry {i}");
assert_eq!(
rs.dev_major, core.dev_major,
"dev_major mismatch at entry {i}"
);
assert_eq!(
rs.dev_minor, core.dev_minor,
"dev_minor mismatch at entry {i}"
);
assert_eq!(
rs.content, core.content,
"content mismatch at entry {i} (size={})",
rs.size,
);
assert_eq!(rs.xattrs, core.xattrs, "xattr mismatch at entry {i}");
// 90% of the time, fix up checksums to exercise deeper parser logic.
// 10% of the time, pass raw bytes to test checksum validation itself.
let should_fixup = !data.is_empty() && data[0] % 10 != 0;

if should_fixup {
let mut data = data.to_vec();
fixup_checksums(&mut data);
let tar_rs_entries = parse_tar_rs(&data);
let tar_core_entries = parse_tar_core(&data);
compare_entries(&data, &tar_rs_entries, &tar_core_entries);
} else {
let tar_rs_entries = parse_tar_rs(data);
let tar_core_entries = parse_tar_core(data);
compare_entries(data, &tar_rs_entries, &tar_core_entries);
}
});
17 changes: 12 additions & 5 deletions fuzz/fuzz_targets/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ use tar_core::HEADER_SIZE;

/// Drive a parser to completion over `data`, checking invariants on each entry.
/// Returns normally on errors or NeedData — the point is that it must not panic.
fn run_parser(data: &[u8], limits: Limits) {
fn run_parser(data: &[u8], limits: Limits, verify_checksums: bool) {
let mut parser = Parser::new(limits);
parser.set_verify_checksums(verify_checksums);
let mut offset: usize = 0;

loop {
Expand Down Expand Up @@ -99,8 +100,14 @@ fn run_parser(data: &[u8], limits: Limits) {
}

fuzz_target!(|data: &[u8]| {
// Run with permissive limits (should accept anything that isn't structurally broken).
run_parser(data, Limits::permissive());
// Run with default limits (stricter — may error on oversized paths/pax, but must not panic).
run_parser(data, Limits::default());
// 90% of the time, skip checksum verification to exercise deeper parser
// logic (PAX extensions, GNU long name/link, sparse files, field parsing,
// etc.). Random fuzz input almost never has valid checksums, so without
// this the fuzzer would break immediately on every input.
//
// 10% of the time, verify checksums normally to test that code path too.
let skip_checksums = !data.is_empty() && data[0] % 10 != 0;

run_parser(data, Limits::permissive(), !skip_checksums);
run_parser(data, Limits::default(), !skip_checksums);
});
36 changes: 29 additions & 7 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1444,6 +1444,18 @@ impl OctU64 {
}
}

/// Test whether a byte is whitespace in the context of tar header fields.
///
/// This includes all bytes that `u8::is_ascii_whitespace()` recognizes
/// (HT, LF, FF, CR, space) **plus** vertical tab (0x0b). Rust's
/// `is_ascii_whitespace` follows the WHATWG definition which omits VT,
/// but real tar implementations (and Rust's `str::trim()`) treat it as
/// whitespace. Without this, fields like `"0000000\x0b"` would fail to
/// parse.
fn is_tar_whitespace(b: u8) -> bool {
    // HT, LF, VT, FF, CR, space — spelled out so VT (0x0b) is visibly
    // part of the set.
    matches!(b, b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ')
}

/// Parse an octal ASCII field into a u64.
///
/// Octal fields in tar headers are ASCII strings with optional leading
Expand All @@ -1456,17 +1468,27 @@ impl OctU64 {
/// Returns [`HeaderError::InvalidOctal`] if the field contains invalid
/// characters (anything other than spaces, digits 0-7, or null bytes).
pub(crate) fn parse_octal(bytes: &[u8]) -> Result<u64> {
// Tar octal fields are padded with leading spaces and terminated by
// spaces or null bytes. Strip both ends to get the digit run.
let trimmed = bytes
// Tar octal fields are padded with leading spaces/nulls and terminated
// by spaces, tabs, or null bytes. We first truncate at the first null
// (matching how C-string fields work in tar), then trim whitespace from
// both ends to isolate the digit run.
//
// Note: we use `is_tar_whitespace` rather than `u8::is_ascii_whitespace`
// because the latter omits vertical tab (0x0b), which real tar
// implementations treat as whitespace (and Rust's `str::trim()` strips).
let truncated = match bytes.iter().position(|&b| b == 0) {
Some(i) => &bytes[..i],
None => bytes,
};
let trimmed = truncated
.iter()
.position(|&b| b != b' ')
.position(|&b| !is_tar_whitespace(b))
.map(|start| {
let rest = &bytes[start..];
let rest = &truncated[start..];
let end = rest
.iter()
.position(|&b| b == b' ' || b == b'\0')
.unwrap_or(rest.len());
.rposition(|&b| !is_tar_whitespace(b))
.map_or(0, |p| p + 1);
&rest[..end]
})
.unwrap_or(&[]);
Expand Down
55 changes: 47 additions & 8 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,12 @@ pub struct Parser {
/// When true, entries with empty paths are allowed through instead of
/// returning [`ParseError::EmptyPath`].
allow_empty_path: bool,
/// When false, header checksum verification is skipped. This is useful
/// for fuzzing, where random input almost never has valid checksums,
/// preventing the fuzzer from exercising deeper parser logic.
///
/// Default: `true`.
verify_checksums: bool,
}

impl Parser {
Expand All @@ -655,6 +661,7 @@ impl Parser {
state: State::ReadHeader,
pending: PendingMetadata::default(),
allow_empty_path: false,
verify_checksums: true,
}
}

Expand All @@ -664,6 +671,19 @@ impl Parser {
self.allow_empty_path = allow;
}

/// Control whether header checksums are verified during parsing.
///
/// When set to `false`, the parser skips [`Header::verify_checksum`]
/// calls, accepting headers regardless of their checksum field. This
/// is primarily useful for fuzz testing, where random input almost
/// never produces valid checksums, preventing the fuzzer from reaching
/// deeper parser code paths.
///
/// Default: `true`.
// NOTE(review): setter only updates the flag; it takes effect on the
// next header parsed — already-parsed entries are unaffected.
pub fn set_verify_checksums(&mut self, verify: bool) {
self.verify_checksums = verify;
}

/// Create a new parser with default limits.
#[must_use]
pub fn with_defaults() -> Self {
Expand Down Expand Up @@ -756,25 +776,36 @@ impl Parser {

// Parse header
let header = Header::from_bytes(header_bytes);
header.verify_checksum()?;
if self.verify_checksums {
header.verify_checksum()?;
}

let entry_type = header.entry_type();
let size = header.entry_size()?;
let padded_size = size
.checked_next_multiple_of(HEADER_SIZE as u64)
.ok_or(ParseError::InvalidSize(size))?;

// Handle metadata entry types
// Metadata entry types (GNU long name/link, PAX headers, GNU sparse)
// only make sense in archives that actually use those formats. A V7-
// style header whose type flag byte happens to be 'L' or 'x' should
// be treated as a regular entry, not as a metadata extension. This
// matches tar-rs's `is_recognized_header` guard.
let is_extension_format = header.is_gnu() || header.is_ustar();
match entry_type {
EntryType::GnuLongName => {
EntryType::GnuLongName if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongName)
}
EntryType::GnuLongLink => {
EntryType::GnuLongLink if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::GnuLongLink)
}
EntryType::XHeader => {
EntryType::XHeader if is_extension_format => {
self.handle_extension(input, size, padded_size, ExtensionKind::Pax)
}
// Global PAX headers (type 'g') are defined by POSIX
// independently of the UStar/GNU magic, so we always handle
// them here. Routing through emit_entry would fail because
// global headers have arbitrary metadata fields.
EntryType::XGlobalHeader => {
// Check size limit (same as local PAX headers)
if size > self.limits.max_pax_size {
Expand All @@ -800,7 +831,9 @@ impl Parser {
pax_data,
})
}
EntryType::GnuSparse => self.handle_gnu_sparse(input, header, size),
EntryType::GnuSparse if is_extension_format => {
self.handle_gnu_sparse(input, header, size)
}
_ => {
// Check for PAX v1.0 sparse before emitting — it requires
// reading the sparse map from the data stream.
Expand Down Expand Up @@ -1278,8 +1311,14 @@ impl Parser {
let mut mtime = header.mtime()?;
let mut entry_size = size;
let mut xattrs = Vec::new();
let mut uname: Option<Cow<'a, [u8]>> = header.username().map(Cow::Borrowed);
let mut gname: Option<Cow<'a, [u8]>> = header.groupname().map(Cow::Borrowed);
let mut uname: Option<Cow<'a, [u8]>> = header
.username()
.filter(|b| !b.is_empty())
.map(Cow::Borrowed);
let mut gname: Option<Cow<'a, [u8]>> = header
.groupname()
.filter(|b| !b.is_empty())
.map(Cow::Borrowed);

// Handle UStar prefix for path
if let Some(prefix) = header.prefix() {
Expand Down
Loading
Loading