fix(engine): reject Unicode noncharacters in text sanitization

factorydroid · factorydroid · commit 5de26b9fe332 · 2026-01-26T23:14:18.000+04:00
Fixes bounty issue #1119

The sanitize() function previously only filtered control characters but
allowed Unicode noncharacters (U+FFFE, U+FFFF, U+FDD0-U+FDEF, and the
last two code points of each Unicode plane) to pass through. Noncharacters
are permanently reserved for internal use and are not intended for open
interchange, so sanitized text should not contain them.

Added is_unicode_noncharacter() helper function and updated sanitize()
to filter out these characters per the Unicode Standard.
diff --git a/cortex-engine/src/text_encoding.rs b/cortex-engine/src/text_encoding.rs
@@ -349,10 +349,53 @@ pub fn is_binary(data: &[u8]) -> bool {
     non_printable > threshold
 }
 
+/// Check if a character is a Unicode noncharacter.
+///
+/// Unicode noncharacters are code points that are permanently reserved for internal
+/// use and are not intended for open interchange of Unicode text. They include:
+/// - U+FFFE and U+FFFF (BMP noncharacters)
+/// - U+FDD0 through U+FDEF (32 noncharacters in the Arabic Presentation Forms-A block)
+/// - The last two code points of each supplementary plane (U+1FFFE, U+1FFFF, ..., U+10FFFF)
+fn is_unicode_noncharacter(c: char) -> bool {
+    let code = c as u32;
+
+    // Contiguous BMP block U+FDD0 through U+FDEF (32 noncharacters)
+    if (0xFDD0..=0xFDEF).contains(&code) {
+        return true;
+    }
+
+    // Last two code points of each plane (U+nFFFE and U+nFFFF), plane 0 included.
+    // Masking to the low 16 bits drops the plane number, so one test covers every plane.
+    let low_bits = code & 0xFFFF;
+    if low_bits == 0xFFFE || low_bits == 0xFFFF {
+        return true;
+    }
+
+    false
+}
+
 /// Sanitize text by removing or replacing invalid characters.
+///
+/// Matching characters are dropped from the output, not substituted. This removes:
+/// - Control characters (except newline, carriage return, and tab)
+/// - Unicode noncharacters (U+FFFE, U+FFFF, U+FDD0-U+FDEF, and plane-end noncharacters)
 pub fn sanitize(text: &str) -> String {
     text.chars()
-        .filter(|&c| !c.is_control() || c == '\n' || c == '\r' || c == '\t')
+        .filter(|&c| {
+            // Checked first: these three are control characters but must be kept
+            if c == '\n' || c == '\r' || c == '\t' {
+                return true;
+            }
+            // Drop every other control character
+            if c.is_control() {
+                return false;
+            }
+            // Drop Unicode noncharacters (see is_unicode_noncharacter)
+            if is_unicode_noncharacter(c) {
+                return false;
+            }
+            true
+        })
         .collect()
 }
 
@@ -426,4 +469,65 @@ mod tests {
         let truncated = truncate_bytes(text, 3);
         assert_eq!(truncated, "hé");
     }
+
+    #[test]
+    fn test_is_unicode_noncharacter() {
+        // BMP noncharacters (last two code points of plane 0)
+        assert!(is_unicode_noncharacter('\u{FFFE}'));
+        assert!(is_unicode_noncharacter('\u{FFFF}'));
+
+        // Arabic Presentation Forms-A noncharacters (U+FDD0-U+FDEF), both ends inclusive
+        assert!(is_unicode_noncharacter('\u{FDD0}'));
+        assert!(is_unicode_noncharacter('\u{FDEF}'));
+        assert!(is_unicode_noncharacter('\u{FDD8}')); // middle of range
+
+        // Plane 1 noncharacters
+        assert!(is_unicode_noncharacter('\u{1FFFE}'));
+        assert!(is_unicode_noncharacter('\u{1FFFF}'));
+
+        // Plane 16 noncharacters (last valid plane)
+        assert!(is_unicode_noncharacter('\u{10FFFE}'));
+        assert!(is_unicode_noncharacter('\u{10FFFF}'));
+
+        // Valid characters should NOT be noncharacters; range neighbours are exactly adjacent
+        assert!(!is_unicode_noncharacter('a'));
+        assert!(!is_unicode_noncharacter('\u{FFFD}')); // replacement character is valid
+        assert!(!is_unicode_noncharacter('\u{FDCF}')); // just before noncharacter range
+        assert!(!is_unicode_noncharacter('\u{FDF0}')); // just after noncharacter range
+    }
+
+    #[test]
+    fn test_sanitize_removes_noncharacters() {
+        // BMP noncharacters U+FFFE / U+FFFF are dropped; surrounding text is joined
+        let input = "Hello\u{FFFE}World\u{FFFF}!";
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "HelloWorld!");
+
+        // Both ends of the U+FDD0-U+FDEF range are dropped
+        let input = "Test\u{FDD0}ing\u{FDEF}!";
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "Testing!");
+
+        // Plane 1 noncharacters (outside the BMP) are dropped
+        let input = "Multi\u{1FFFE}plane\u{1FFFF}!";
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "Multiplane!");
+
+        // Ordinary text and the allowed whitespace controls (\n, \t, \r) are preserved
+        let input = "Hello World!\n\t\r";
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "Hello World!\n\t\r");
+
+        // U+FFFD (replacement character) sits next to U+FFFE but is valid and must be kept
+        let input = "Test\u{FFFD}replacement";
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "Test\u{FFFD}replacement");
+    }
+
+    #[test]
+    fn test_sanitize_preserves_whitespace() {
+        let input = "Line1\nLine2\r\nLine3\tTabbed"; // LF, CRLF and tab are control chars
+        let sanitized = sanitize(input);
+        assert_eq!(sanitized, "Line1\nLine2\r\nLine3\tTabbed"); // all must pass through unchanged
+    }
 }