Skip to content

Commit 5de26b9

Browse files
committed
fix(engine): reject Unicode noncharacters in text sanitization
Fixes bounty issue #1119. The sanitize() function previously filtered only control characters and let Unicode noncharacters (U+FFFE, U+FFFF, U+FDD0-U+FDEF, and the last two code points of each Unicode plane) pass through. The Unicode Standard permanently reserves these code points for internal use, so they should not appear in text that is stored or interchanged. Added the is_unicode_noncharacter() helper function and updated sanitize() to filter out these characters per the Unicode Standard.
1 parent 42c3300 commit 5de26b9

1 file changed

Lines changed: 105 additions & 1 deletion

File tree

cortex-engine/src/text_encoding.rs

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,10 +349,53 @@ pub fn is_binary(data: &[u8]) -> bool {
349349
non_printable > threshold
350350
}
351351

352+
/// Check if a character is a Unicode noncharacter.
///
/// Noncharacters are the 66 code points that the Unicode Standard permanently
/// reserves for internal use; they are not intended for open interchange:
/// - U+FDD0 through U+FDEF (32 code points in the Arabic Presentation Forms-A block)
/// - The last two code points of every plane, U+nFFFE and U+nFFFF, for planes
///   0 through 16 inclusive (this covers U+FFFE and U+FFFF in the BMP)
///
/// Returns `true` when `c` is one of those code points, `false` otherwise.
fn is_unicode_noncharacter(c: char) -> bool {
    let code = u32::from(c);

    // The low 16 bits give the position inside a plane. Masking with 0xFFFE
    // ignores the lowest bit, so the trailing pair ...FFFE / ...FFFF of every
    // plane is matched by a single comparison.
    matches!(code, 0xFDD0..=0xFDEF) || (code & 0xFFFE) == 0xFFFE
}
376+
352377
/// Sanitize text by removing or replacing invalid characters.
378+
///
379+
/// This function removes:
380+
/// - Control characters (except newline, carriage return, and tab)
381+
/// - Unicode noncharacters (U+FFFE, U+FFFF, U+FDD0-U+FDEF, and plane-end noncharacters)
353382
pub fn sanitize(text: &str) -> String {
354383
text.chars()
355-
.filter(|&c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
384+
.filter(|&c| {
385+
// Allow specific whitespace control characters
386+
if c == '\n' || c == '\r' || c == '\t' {
387+
return true;
388+
}
389+
// Reject other control characters
390+
if c.is_control() {
391+
return false;
392+
}
393+
// Reject Unicode noncharacters
394+
if is_unicode_noncharacter(c) {
395+
return false;
396+
}
397+
true
398+
})
356399
.collect()
357400
}
358401

@@ -426,4 +469,65 @@ mod tests {
426469
let truncated = truncate_bytes(text, 3);
427470
assert_eq!(truncated, "hé");
428471
}
472+
473+
#[test]
fn test_is_unicode_noncharacter() {
    // BMP noncharacters
    assert!(is_unicode_noncharacter('\u{FFFE}'));
    assert!(is_unicode_noncharacter('\u{FFFF}'));

    // Arabic Presentation Forms-A noncharacters (U+FDD0-U+FDEF)
    assert!(is_unicode_noncharacter('\u{FDD0}')); // first in range
    assert!(is_unicode_noncharacter('\u{FDEF}')); // last in range
    assert!(is_unicode_noncharacter('\u{FDD8}')); // middle of range

    // Plane 1 noncharacters
    assert!(is_unicode_noncharacter('\u{1FFFE}'));
    assert!(is_unicode_noncharacter('\u{1FFFF}'));

    // Plane 16 noncharacters (last valid plane)
    assert!(is_unicode_noncharacter('\u{10FFFE}'));
    assert!(is_unicode_noncharacter('\u{10FFFF}'));

    // Valid characters should NOT be noncharacters
    assert!(!is_unicode_noncharacter('a'));
    assert!(!is_unicode_noncharacter('\u{FFFD}')); // replacement character is valid
    assert!(!is_unicode_noncharacter('\u{FDCF}')); // immediately before the U+FDD0 range
    assert!(!is_unicode_noncharacter('\u{FDF0}')); // immediately after the U+FDEF range
    assert!(!is_unicode_noncharacter('\u{1FFFD}')); // immediately before a plane-end pair
}
498+
499+
#[test]
fn test_sanitize_removes_noncharacters() {
    // BMP plane-end noncharacters are removed
    let input = "Hello\u{FFFE}World\u{FFFF}!";
    assert_eq!(sanitize(input), "HelloWorld!");

    // Both ends of the U+FDD0-U+FDEF range are removed
    let input = "Test\u{FDD0}ing\u{FDEF}!";
    assert_eq!(sanitize(input), "Testing!");

    // Plane 1 noncharacters are removed
    let input = "Multi\u{1FFFE}plane\u{1FFFF}!";
    assert_eq!(sanitize(input), "Multiplane!");

    // Plane 16 noncharacters (the highest scalar values) are removed
    let input = "Last\u{10FFFE}plane\u{10FFFF}!";
    assert_eq!(sanitize(input), "Lastplane!");

    // Ordinary text and the permitted whitespace controls are preserved
    let input = "Hello World!\n\t\r";
    assert_eq!(sanitize(input), "Hello World!\n\t\r");

    // The replacement character is a valid character and is preserved
    let input = "Test\u{FFFD}replacement";
    assert_eq!(sanitize(input), "Test\u{FFFD}replacement");
}
526+
527+
#[test]
fn test_sanitize_preserves_whitespace() {
    // Newline, carriage return and tab are the control characters sanitize() keeps.
    let input = "Line1\nLine2\r\nLine3\tTabbed";
    assert_eq!(sanitize(input), "Line1\nLine2\r\nLine3\tTabbed");
}
429533
}

0 commit comments

Comments
 (0)