Skip to content

Commit fcd667b

Browse files
Velli20 and Copilot committed
feat(pdf-filter): implement LZWDecode filter with predictor support
Add LZW decompression (PDF spec §7.4.4) and shared Predictor post-processing (§7.4.4.4) to the pdf-filter crate.

New modules:
- lzw.rs: Pure-Rust LZW decoder with EarlyChange parameter support, variable-width codes (9-12 bits), MSB-first bit packing.
- predictor.rs: TIFF Predictor 2 (horizontal differencing) and PNG predictors (None/Sub/Up/Average/Paeth). Shared by LZWDecode and FlateDecode.

Changes:
- bitreader.rs: Add read_bits(n) for variable-width code extraction.
- filter.rs: Add LZWDecode variant, Lzw/Flate DecodeParms, wire into decode pipeline. Also wire predictor support into FlateDecode.
- lib.rs: Register new modules, update crate documentation.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent e15d813 commit fcd667b

5 files changed

Lines changed: 954 additions & 2 deletions

File tree

crates/pdf-filter/src/bitreader.rs

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,26 @@ impl<'a> BitReader<'a> {
4141
}
4242
}
4343

44+
/// Read `n` bits MSB-first and return them as a `u16`.
45+
///
46+
/// Returns `None` if there are fewer than `n` bits remaining. `n` must be
47+
/// at most 16.
48+
#[allow(clippy::arithmetic_side_effects)]
49+
pub fn read_bits(&mut self, n: u8) -> Option<u16> {
50+
let total_bits = self.src.len().saturating_mul(8);
51+
if self.bit_pos.saturating_add(usize::from(n)) > total_bits {
52+
return None;
53+
}
54+
let mut value: u16 = 0;
55+
for _ in 0..n {
56+
value <<= 1;
57+
if self.next_bit()? {
58+
value |= 1;
59+
}
60+
}
61+
Some(value)
62+
}
63+
4464
/// Advance to the next byte boundary, but only if all padding bits are 0.
4565
/// Returns `true` if alignment happened, `false` if a non-zero pad bit was
4666
/// found (the caller should disable byte-alignment for remaining rows).
@@ -83,4 +103,41 @@ mod tests {
83103
r.skip_bits(100); // should not panic
84104
assert!(r.exhausted());
85105
}
106+
107+
#[test]
fn read_bits_returns_msb_first_value() {
    // 0xA6 == 0b1010_0110: the high nibble is 0b1010 (10) and the low nibble
    // is 0b0110 (6). Two consecutive 4-bit reads must return them in that order.
    let bytes = [0xA6u8];
    let mut reader = BitReader::new(&bytes);

    let high_nibble = reader.read_bits(4);
    let low_nibble = reader.read_bits(4);

    assert_eq!(high_nibble, Some(0b1010));
    assert_eq!(low_nibble, Some(0b0110));
}
117+
118+
#[test]
fn read_bits_across_byte_boundary() {
    let bytes = [0b1111_0000u8, 0b1010_1010u8];
    let mut reader = BitReader::new(&bytes);

    // Discard the high nibble of byte 0 so the following 8-bit read starts
    // mid-byte and has to continue into byte 1.
    reader.skip_bits(4);

    // Low nibble of byte 0 (0000) followed by high nibble of byte 1 (1010).
    let straddling = reader.read_bits(8);
    assert_eq!(straddling, Some(0x0A));
}
126+
127+
#[test]
fn read_bits_returns_none_when_exhausted() {
    // A single byte provides only 8 bits, so a 9-bit request cannot be met.
    let bytes = [0xFFu8];
    let mut reader = BitReader::new(&bytes);

    assert!(reader.read_bits(9).is_none());
}
133+
134+
#[test]
fn read_bits_9_bit_code() {
    // Byte 0 is 1000_0000 and the top bit of byte 1 is 1, so the first nine
    // bits taken MSB-first are 1_0000_0001 = 0x101 = 257. The remaining
    // seven bits of byte 1 are not part of the code.
    let bytes = [0x80u8, 0x80u8];
    let mut reader = BitReader::new(&bytes);

    let code = reader.read_bits(9);
    assert_eq!(code, Some(257));
}
86143
}

crates/pdf-filter/src/filter.rs

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use std::borrow::Cow;
22
use std::fmt;
33

4-
use crate::{ccitt_fax_params::CCITTFaxParams, error::FilterError};
4+
use crate::{ccitt_fax_params::CCITTFaxParams, error::FilterError, predictor::PredictorParams};
55

66
use pdf_object::{
77
dictionary::Dictionary,
@@ -43,6 +43,11 @@ pub enum Filter {
4343
/// bytes. The special character `z` represents four zero bytes. The
4444
/// end-of-data marker is `~>`.
4545
ASCII85Decode,
46+
/// The LZW (Lempel-Ziv-Welch) filter, a lossless compression algorithm.
47+
///
48+
/// Based on variable-length code substitution. This was used in older PDFs
49+
/// before FlateDecode became the standard. See PDF spec §7.4.4.
50+
LZWDecode,
4651
/// A filter that is not currently supported by this implementation.
4752
///
4853
/// The contained string holds the original filter name from the PDF,
@@ -58,6 +63,7 @@ impl From<Cow<'_, str>> for Filter {
5863
"JPXDecode" => Self::JPXDecode,
5964
"CCITTFaxDecode" => Self::CCITTFaxDecode,
6065
"ASCII85Decode" => Self::ASCII85Decode,
66+
"LZWDecode" => Self::LZWDecode,
6167
_ => Self::Unsupported(name.into_owned()),
6268
}
6369
}
@@ -77,6 +83,7 @@ impl fmt::Display for Filter {
7783
Self::FlateDecode => f.write_str("FlateDecode"),
7884
Self::CCITTFaxDecode => f.write_str("CCITTFaxDecode"),
7985
Self::ASCII85Decode => f.write_str("ASCII85Decode"),
86+
Self::LZWDecode => f.write_str("LZWDecode"),
8087
Self::Unsupported(name) => f.write_str(name),
8188
}
8289
}
@@ -92,6 +99,13 @@ pub(crate) enum DecodeParms {
9299
None,
93100
/// Parameters for the `CCITTFaxDecode` filter.
94101
CcittFax(CCITTFaxParams),
102+
/// Parameters for `LZWDecode`: EarlyChange flag + optional predictor.
103+
Lzw {
104+
early_change: bool,
105+
predictor: PredictorParams,
106+
},
107+
/// Predictor parameters for `FlateDecode`.
108+
Flate { predictor: PredictorParams },
95109
}
96110

97111
/// Methods for parsing the `/Filter` entry from a PDF dictionary.
@@ -229,6 +243,27 @@ pub fn decode(stream: &StreamObject) -> Result<Cow<'_, [u8]>, FilterError> {
229243
match filter {
230244
Filter::FlateDecode => {
231245
let decoded = Filter::decode_flate(&data)?;
246+
let decoded = match params {
247+
DecodeParms::Flate { predictor } if !predictor.is_none() => {
248+
crate::predictor::apply_predictor(&decoded, predictor)?
249+
}
250+
_ => decoded,
251+
};
252+
data = Cow::Owned(decoded);
253+
}
254+
Filter::LZWDecode => {
255+
let (early_change, predictor) = match params {
256+
DecodeParms::Lzw {
257+
early_change,
258+
predictor,
259+
} => (*early_change, Some(predictor)),
260+
_ => (true, None),
261+
};
262+
let decoded = crate::lzw::decode(&data, early_change)?;
263+
let decoded = match predictor {
264+
Some(p) if !p.is_none() => crate::predictor::apply_predictor(&decoded, p)?,
265+
_ => decoded,
266+
};
232267
data = Cow::Owned(decoded);
233268
}
234269
Filter::JPXDecode => {
@@ -246,7 +281,7 @@ pub fn decode(stream: &StreamObject) -> Result<Cow<'_, [u8]>, FilterError> {
246281
Filter::CCITTFaxDecode => {
247282
let ccitt_params = match params {
248283
DecodeParms::CcittFax(p) => p,
249-
DecodeParms::None => &CCITTFaxParams::DEFAULT,
284+
_ => &CCITTFaxParams::DEFAULT,
250285
};
251286
let decoded = crate::ccitt::decode(&data, ccitt_params)?;
252287
data = Cow::Owned(decoded);
@@ -302,6 +337,29 @@ fn parse_decode_params(
302337
DecodeParms::CcittFax(p)
303338
}
304339
(Filter::CCITTFaxDecode, None) => DecodeParms::CcittFax(CCITTFaxParams::default()),
340+
(Filter::LZWDecode, Some(d)) => {
341+
let early_change = d
342+
.get("EarlyChange")
343+
.and_then(|v| v.try_number::<i64>(objects).ok())
344+
.unwrap_or(1)
345+
!= 0;
346+
let predictor = PredictorParams::from_dictionary(d, objects).unwrap_or_default();
347+
DecodeParms::Lzw {
348+
early_change,
349+
predictor,
350+
}
351+
}
352+
(Filter::LZWDecode, None) => DecodeParms::Lzw {
353+
early_change: true,
354+
predictor: PredictorParams::default(),
355+
},
356+
(Filter::FlateDecode, Some(d)) => {
357+
let predictor = PredictorParams::from_dictionary(d, objects).unwrap_or_default();
358+
DecodeParms::Flate { predictor }
359+
}
360+
(Filter::FlateDecode, None) => DecodeParms::Flate {
361+
predictor: PredictorParams::default(),
362+
},
305363
_ => DecodeParms::None,
306364
})
307365
.collect()

crates/pdf-filter/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
//! specification (§7.4):
55
//!
66
//! - **FlateDecode** — zlib/deflate (RFC 1950 / RFC 1951)
7+
//! - **LZWDecode** — LZW compression (§7.4.4)
78
//! - **DCTDecode** — baseline JPEG
89
//! - **JPXDecode** — JPEG 2000
910
//! - **CCITTFaxDecode** — Group 3 / Group 4 fax compression
@@ -20,3 +21,5 @@ pub(crate) mod ccitt_fax_params;
2021
mod ccitt_tables;
2122
pub mod error;
2223
pub mod filter;
24+
pub(crate) mod lzw;
25+
pub(crate) mod predictor;

0 commit comments

Comments
 (0)