@@ -10,18 +10,18 @@ use crate::varint::write_varint;
1010/// Longest match first — iterate entries in length-descending order.
1111///
1212/// Returns `Err(CodecError::CompressionFailed)` if the input contains any raw
13- /// byte in the dict-code range 0x02–0x1F. Such bytes would be misinterpreted
14- /// by `reverse_dict` as dictionary codes on decode, producing a different value.
13+ /// byte equal to an actual dictionary code value. Such bytes would be
14+ /// misinterpreted by `reverse_dict` as dictionary codes on decode, producing a
15+ /// different value. Only the exact `APP_DICT` code values are reserved —
16+ /// non-code control characters such as LF (0x0A) pass through unchanged so
17+ /// multi-line `notes` encode correctly (matches the TS reference).
1518pub ( super ) fn apply_dict ( input : & str ) -> Result < Vec < u8 > , CodecError > {
16- // Reject control bytes that overlap the dict-code range.
17- if input. bytes ( ) . any ( |b| matches ! ( b, 0x02 ..=0x1F ) ) {
19+ // Reject only bytes equal to an actual dict code (derived from APP_DICT).
20+ let is_dict_code = |b : u8 | APP_DICT . values ( ) . any ( |& code| code == b) ;
21+ if let Some ( c) = input. chars ( ) . find ( |& c| ( c as u32 ) < 0x100 && is_dict_code ( c as u8 ) ) {
1822 return Err ( CodecError :: CompressionFailed ( format ! (
19- "field value contains reserved control byte (0x02–0x1F): {}" ,
20- input
21- . chars( )
22- . find( |& c| matches!( c as u8 , 0x02 ..=0x1F ) )
23- . map( |c| format!( "0x{:02x}" , c as u8 ) )
24- . unwrap_or_default( )
23+ "field value contains reserved dictionary code byte: 0x{:02x}" ,
24+ c as u8
2525 ) ) ) ;
2626 }
2727
@@ -153,16 +153,72 @@ mod tests {
153153 ) ;
154154 }
155155
156- /// All bytes in the range 0x02–0x1F must be rejected.
156+ /// Every actual `APP_DICT` code value must be rejected as a raw byte.
157157 #[ test]
158- fn r3_all_control_bytes_in_range_rejected ( ) {
159- for code in 0x02u8 ..= 0x1Fu8 {
158+ fn r3_all_dict_code_bytes_rejected ( ) {
159+ for & code in APP_DICT . values ( ) {
160160 let hostile = format ! ( "{}" , char :: from( code) ) ;
161161 let err = apply_dict ( & hostile) . unwrap_err ( ) ;
162162 assert ! (
163163 matches!( err, crate :: error:: CodecError :: CompressionFailed ( _) ) ,
164- "expected CompressionFailed for control byte 0x{code:02x}, got {err:?}"
164+ "expected CompressionFailed for dict code 0x{code:02x}, got {err:?}"
165165 ) ;
166166 }
167167 }
168+
169+ // --- #4: exact-set rejection (match TS reference) ---
170+
171+ /// LF (0x0A) is NOT a dict code — multi-line `notes` must encode fine.
172+ #[ test]
173+ fn apply_dict_accepts_lf_multiline_notes ( ) {
174+ let multiline = "Line one\n Line two\n Line three" ;
175+ let encoded = apply_dict ( multiline) . expect ( "LF must be accepted" ) ;
176+ assert ! (
177+ encoded. contains( & 0x0A ) ,
178+ "LF byte must survive into the encoded output"
179+ ) ;
180+ }
181+
182+ /// TAB (0x09) IS a dict code (".com") — must be rejected.
183+ #[ test]
184+ fn apply_dict_rejects_tab ( ) {
185+ let err = apply_dict ( "col1\t col2" ) . unwrap_err ( ) ;
186+ assert ! (
187+ matches!( err, crate :: error:: CodecError :: CompressionFailed ( _) ) ,
188+ "expected CompressionFailed for TAB (0x09), got {err:?}"
189+ ) ;
190+ }
191+
192+ /// CR (0x0D) IS a dict code ("development") — must be rejected.
193+ #[ test]
194+ fn apply_dict_rejects_cr ( ) {
195+ let err = apply_dict ( "line\r wrap" ) . unwrap_err ( ) ;
196+ assert ! (
197+ matches!( err, crate :: error:: CodecError :: CompressionFailed ( _) ) ,
198+ "expected CompressionFailed for CR (0x0D), got {err:?}"
199+ ) ;
200+ }
201+
202+ /// FIX #1 (encode half): non-ASCII text must pass `apply_dict` and emit
203+ /// its exact UTF-8 bytes — `reverse_dict` round-trips it (see decode tests).
204+ #[ test]
205+ fn apply_dict_preserves_non_ascii_utf8 ( ) {
206+ let original = "Café 日本語 ñ" ;
207+ let encoded = apply_dict ( original) . expect ( "non-ASCII must be accepted" ) ;
208+ assert_eq ! (
209+ encoded,
210+ original. as_bytes( ) ,
211+ "non-ASCII input must emit its UTF-8 bytes unchanged"
212+ ) ;
213+ }
214+
215+ /// A raw 0x06 byte ("Invoice" dict code) must still be rejected.
216+ #[ test]
217+ fn apply_dict_rejects_raw_0x06 ( ) {
218+ let err = apply_dict ( "\x06 Acme" ) . unwrap_err ( ) ;
219+ assert ! (
220+ matches!( err, crate :: error:: CodecError :: CompressionFailed ( _) ) ,
221+ "expected CompressionFailed for 0x06, got {err:?}"
222+ ) ;
223+ }
168224}
0 commit comments