@@ -349,10 +349,53 @@ pub fn is_binary(data: &[u8]) -> bool {
349349 non_printable > threshold
350350}
351351
/// Report whether `c` is one of the Unicode noncharacters.
///
/// Noncharacters are code points that Unicode reserves permanently for internal use;
/// they are not meant to appear in interchanged text. Two groups are recognised:
/// - the contiguous run U+FDD0 through U+FDEF (32 code points), and
/// - the final two code points of every plane, i.e. U+nFFFE and U+nFFFF for the BMP
///   (U+FFFE, U+FFFF) and for each supplementary plane up to U+10FFFE / U+10FFFF.
///
/// Returns `true` for a noncharacter and `false` for every other `char`.
fn is_unicode_noncharacter(c: char) -> bool {
    let scalar = u32::from(c);

    // Masking off the plane number leaves the offset inside the plane, so a single
    // comparison handles the plane-end pair for all planes at once.
    let offset_in_plane = scalar & 0xFFFF;

    matches!(scalar, 0xFDD0..=0xFDEF) || matches!(offset_in_plane, 0xFFFE | 0xFFFF)
}
376+
352377/// Sanitize text by removing or replacing invalid characters.
378+ ///
379+ /// This function removes:
380+ /// - Control characters (except newline, carriage return, and tab)
381+ /// - Unicode noncharacters (U+FFFE, U+FFFF, U+FDD0-U+FDEF, and plane-end noncharacters)
353382pub fn sanitize ( text : & str ) -> String {
354383 text. chars ( )
355- . filter ( |& c| !c. is_control ( ) || c == '\n' || c == '\r' || c == '\t' )
384+ . filter ( |& c| {
385+ // Allow specific whitespace control characters
386+ if c == '\n' || c == '\r' || c == '\t' {
387+ return true ;
388+ }
389+ // Reject other control characters
390+ if c. is_control ( ) {
391+ return false ;
392+ }
393+ // Reject Unicode noncharacters
394+ if is_unicode_noncharacter ( c) {
395+ return false ;
396+ }
397+ true
398+ } )
356399 . collect ( )
357400}
358401
@@ -426,4 +469,65 @@ mod tests {
426469 let truncated = truncate_bytes ( text, 3 ) ;
427470 assert_eq ! ( truncated, "hé" ) ;
428471 }
472+
/// Checks `is_unicode_noncharacter` against members of both noncharacter groups and
/// against the valid code points that sit directly next to them.
#[test]
fn test_is_unicode_noncharacter() {
    // BMP noncharacters
    assert!(is_unicode_noncharacter('\u{FFFE}'));
    assert!(is_unicode_noncharacter('\u{FFFF}'));

    // Arabic Presentation Forms-A noncharacters (U+FDD0-U+FDEF)
    assert!(is_unicode_noncharacter('\u{FDD0}')); // first of the range
    assert!(is_unicode_noncharacter('\u{FDEF}')); // last of the range
    assert!(is_unicode_noncharacter('\u{FDD8}')); // middle of the range

    // Plane 1 noncharacters
    assert!(is_unicode_noncharacter('\u{1FFFE}'));
    assert!(is_unicode_noncharacter('\u{1FFFF}'));

    // Plane 16 noncharacters (last valid plane)
    assert!(is_unicode_noncharacter('\u{10FFFE}'));
    assert!(is_unicode_noncharacter('\u{10FFFF}'));

    // Valid characters should NOT be noncharacters
    assert!(!is_unicode_noncharacter('a'));
    assert!(!is_unicode_noncharacter('\u{FFFD}')); // replacement character is valid
    // U+FDCF and U+FDF0 are the immediate neighbours of U+FDD0..=U+FDEF, so an
    // off-by-one at either end of the range check is caught here.
    assert!(!is_unicode_noncharacter('\u{FDCF}')); // just before noncharacter range
    assert!(!is_unicode_noncharacter('\u{FDF0}')); // just after noncharacter range
    // One below a supplementary plane-end pair is an ordinary code point.
    assert!(!is_unicode_noncharacter('\u{1FFFD}'));
}
498+
/// Checks that `sanitize` drops noncharacters from each group while leaving ordinary
/// text, the permitted whitespace controls and U+FFFD untouched.
#[test]
fn test_sanitize_removes_noncharacters() {
    // Noncharacters sit directly between ordinary characters, so the expected output
    // is simply the input with those code points deleted. (Spaces are ordinary
    // characters and would be kept, so the inputs must not contain any that the
    // expected strings lack.)
    let input = "Hello\u{FFFE}World\u{FFFF}!";
    let sanitized = sanitize(input);
    assert_eq!(sanitized, "HelloWorld!");

    // Test FDD0-FDEF range
    let input = "Test\u{FDD0}ing\u{FDEF}!";
    let sanitized = sanitize(input);
    assert_eq!(sanitized, "Testing!");

    // Test plane 1 noncharacters
    let input = "Multi\u{1FFFE}plane\u{1FFFF}!";
    let sanitized = sanitize(input);
    assert_eq!(sanitized, "Multiplane!");

    // Test that valid Unicode is preserved
    let input = "Hello World!\n \t \r ";
    let sanitized = sanitize(input);
    assert_eq!(sanitized, "Hello World!\n \t \r ");

    // Test replacement character is preserved (it's valid)
    let input = "Test\u{FFFD} replacement";
    let sanitized = sanitize(input);
    assert_eq!(sanitized, "Test\u{FFFD} replacement");
}
526+
/// Text made only of printable characters, spaces, newlines, carriage returns and
/// tabs must come back from `sanitize` unchanged.
#[test]
fn test_sanitize_preserves_whitespace() {
    let text = "Line1\n Line2\r \n Line3\t Tabbed";
    // Nothing in `text` is removable, so the output equals the input.
    assert_eq!(sanitize(text), text);
}
429533}
0 commit comments