Skip to content

Commit 7e6cfee

Browse files
committed
fix(codec): dict layer — exact-set reject (#4) + UTF-8 reverse_dict (#1), match TS reference
1 parent 4ff58d7 commit 7e6cfee

2 files changed

Lines changed: 112 additions & 19 deletions

File tree

packages/codec/src/decode/dict.rs

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ use super::hex::bytes_to_address;
99

1010
/// Reverse app-level dictionary substitution (mirrors reverseDict from app-dict.ts).
1111
pub(super) fn reverse_dict(bytes: &[u8]) -> Result<String, CodecError> {
12-
// Decode raw bytes as a string — control chars are the dict codes
13-
let mut text = String::with_capacity(bytes.len());
14-
for &b in bytes {
15-
text.push(b as char);
16-
}
12+
// Decode raw bytes as UTF-8 (matches the TS reference's TextDecoder).
13+
// Dict-code bytes (all within 0x02–0x0F) are valid single-byte UTF-8 and survive as
14+
// single chars, so the expansion loop below works unchanged.
15+
let mut text = String::from_utf8(bytes.to_vec())
16+
.map_err(|_| CodecError::CompressionFailed("invalid UTF-8 in dict text".to_string()))?;
1717

1818
// Reverse entries longest-pattern-first (same order as apply_dict)
1919
let entries: &[(&str, u8)] = &[
@@ -163,3 +163,40 @@ pub(super) fn decode_token_address(value: &[u8]) -> Result<String, CodecError> {
163163
bytes_to_address(&value[1..])
164164
}
165165
}
166+
167+
#[cfg(test)]
168+
mod tests {
169+
use super::*;
170+
171+
/// FIX #1: non-ASCII text must round-trip through the dict layer.
172+
/// "Café 日本語 ñ" contains no `APP_DICT` pattern, so `apply_dict` would
173+
/// emit exactly its UTF-8 bytes — fed here directly to `reverse_dict`.
174+
/// The old `b as char` (Latin-1) path corrupted every multi-byte char.
175+
#[test]
176+
fn reverse_dict_roundtrips_non_ascii() {
177+
let original = "Café 日本語 ñ";
178+
let encoded = original.as_bytes(); // == apply_dict(original) — no dict match
179+
let decoded = reverse_dict(encoded).expect("valid UTF-8 must decode");
180+
assert_eq!(decoded, original, "non-ASCII text must round-trip intact");
181+
}
182+
183+
/// FIX #1: invalid UTF-8 input must surface an error, not silent garbage.
184+
#[test]
185+
fn reverse_dict_invalid_utf8_errors() {
186+
// 0xFF is never a valid UTF-8 byte.
187+
let bad = [b'a', 0xFF, b'b'];
188+
let err = reverse_dict(&bad).unwrap_err();
189+
assert!(
190+
matches!(err, CodecError::CompressionFailed(_)),
191+
"expected CompressionFailed for invalid UTF-8, got {err:?}"
192+
);
193+
}
194+
195+
/// Regression: dict-code expansion still works on a UTF-8-decoded string.
196+
#[test]
197+
fn reverse_dict_expands_dict_code() {
198+
// 0x06 = "Invoice" dict code.
199+
let decoded = reverse_dict(&[0x06, b' ', b'#', b'1']).unwrap();
200+
assert_eq!(decoded, "Invoice #1");
201+
}
202+
}

packages/codec/src/encode/dict.rs

Lines changed: 70 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,18 @@ use crate::varint::write_varint;
1010
/// Longest match first — iterate entries in length-descending order.
1111
///
1212
/// Returns `Err(CodecError::CompressionFailed)` if the input contains any raw
13-
/// byte in the dict-code range 0x02–0x1F. Such bytes would be misinterpreted
14-
/// by `reverse_dict` as dictionary codes on decode, producing a different value.
13+
/// byte equal to an actual dictionary code value. Such bytes would be
14+
/// misinterpreted by `reverse_dict` as dictionary codes on decode, producing a
15+
/// different value. Only the exact `APP_DICT` code values are reserved —
16+
/// non-code control characters such as LF (0x0A) pass through unchanged so
17+
/// multi-line `notes` encode correctly (matches the TS reference).
1518
pub(super) fn apply_dict(input: &str) -> Result<Vec<u8>, CodecError> {
16-
// Reject control bytes that overlap the dict-code range.
17-
if input.bytes().any(|b| matches!(b, 0x02..=0x1F)) {
19+
// Reject only bytes equal to an actual dict code (derived from APP_DICT).
20+
let is_dict_code = |b: u8| APP_DICT.values().any(|&code| code == b);
21+
if let Some(c) = input.chars().find(|&c| (c as u32) < 0x100 && is_dict_code(c as u8)) {
1822
return Err(CodecError::CompressionFailed(format!(
19-
"field value contains reserved control byte (0x02–0x1F): {}",
20-
input
21-
.chars()
22-
.find(|&c| matches!(c as u8, 0x02..=0x1F))
23-
.map(|c| format!("0x{:02x}", c as u8))
24-
.unwrap_or_default()
23+
"field value contains reserved dictionary code byte: 0x{:02x}",
24+
c as u8
2525
)));
2626
}
2727

@@ -153,16 +153,72 @@ mod tests {
153153
);
154154
}
155155

156-
/// All bytes in the range 0x02–0x1F must be rejected.
156+
/// Every actual `APP_DICT` code value must be rejected as a raw byte.
157157
#[test]
158-
fn r3_all_control_bytes_in_range_rejected() {
159-
for code in 0x02u8..=0x1Fu8 {
158+
fn r3_all_dict_code_bytes_rejected() {
159+
for &code in APP_DICT.values() {
160160
let hostile = format!("{}", char::from(code));
161161
let err = apply_dict(&hostile).unwrap_err();
162162
assert!(
163163
matches!(err, crate::error::CodecError::CompressionFailed(_)),
164-
"expected CompressionFailed for control byte 0x{code:02x}, got {err:?}"
164+
"expected CompressionFailed for dict code 0x{code:02x}, got {err:?}"
165165
);
166166
}
167167
}
168+
169+
// --- #4: exact-set rejection (match TS reference) ---
170+
171+
/// LF (0x0A) is NOT a dict code — multi-line `notes` must encode fine.
172+
#[test]
173+
fn apply_dict_accepts_lf_multiline_notes() {
174+
let multiline = "Line one\nLine two\nLine three";
175+
let encoded = apply_dict(multiline).expect("LF must be accepted");
176+
assert!(
177+
encoded.contains(&0x0A),
178+
"LF byte must survive into the encoded output"
179+
);
180+
}
181+
182+
/// TAB (0x09) IS a dict code (".com") — must be rejected.
183+
#[test]
184+
fn apply_dict_rejects_tab() {
185+
let err = apply_dict("col1\tcol2").unwrap_err();
186+
assert!(
187+
matches!(err, crate::error::CodecError::CompressionFailed(_)),
188+
"expected CompressionFailed for TAB (0x09), got {err:?}"
189+
);
190+
}
191+
192+
/// CR (0x0D) IS a dict code ("development") — must be rejected.
193+
#[test]
194+
fn apply_dict_rejects_cr() {
195+
let err = apply_dict("line\rwrap").unwrap_err();
196+
assert!(
197+
matches!(err, crate::error::CodecError::CompressionFailed(_)),
198+
"expected CompressionFailed for CR (0x0D), got {err:?}"
199+
);
200+
}
201+
202+
/// FIX #1 (encode half): non-ASCII text must pass `apply_dict` and emit
203+
/// its exact UTF-8 bytes — `reverse_dict` round-trips it (see decode tests).
204+
#[test]
205+
fn apply_dict_preserves_non_ascii_utf8() {
206+
let original = "Café 日本語 ñ";
207+
let encoded = apply_dict(original).expect("non-ASCII must be accepted");
208+
assert_eq!(
209+
encoded,
210+
original.as_bytes(),
211+
"non-ASCII input must emit its UTF-8 bytes unchanged"
212+
);
213+
}
214+
215+
/// A raw 0x06 byte ("Invoice" dict code) must still be rejected.
216+
#[test]
217+
fn apply_dict_rejects_raw_0x06() {
218+
let err = apply_dict("\x06Acme").unwrap_err();
219+
assert!(
220+
matches!(err, crate::error::CodecError::CompressionFailed(_)),
221+
"expected CompressionFailed for 0x06, got {err:?}"
222+
);
223+
}
168224
}

0 commit comments

Comments
 (0)