108 changes: 87 additions & 21 deletions src/lib.rs
@@ -142,11 +142,12 @@
 //! );
 //! ```
 
+use std::hash::Hash;
 use std::ops::Range;
 use std::slice;
 
 use crate::{
-    sources::words,
+    sources::{chars, words},
     util::{strip_common_postfix, strip_common_prefix},
 };

@@ -416,17 +417,18 @@ impl Hunk {
 
     /// Performs a word-diff on this hunk.
    ///
-    /// This requires passing the original [`input`](InternedInput) in order to look up
-    /// the tokens of the current hunk, which typically are lines.
-    /// Each token is split into words using the built-in [`words`] tokenizer.
-    /// The resulting word tokens are stored in a second [`diff_input`](InternedInput),
-    /// and a [`diff`](Diff) is computed on them, with basic post-processing applied.
+    /// This requires passing the original [`input`](InternedInput) in order to
+    /// look up the tokens of the current hunk, which typically are lines. Each
+    /// token is split into words using the built-in [`words`] tokenizer. The
+    /// resulting word tokens are stored in a second
+    /// [`word_tokens`](InternedInput), and a [`diff`](Diff) is computed on
+    /// them, with basic post-processing applied.
     ///
-    /// For performance reasons, this second [`diff_input`](InternedInput) as well as
-    /// the computed [`diff`](Diff) need to be passed as parameters so that they can be
-    /// re-used when iterating over hunks. Note that word tokens are always
-    /// added but never removed from the interner. Consider clearing it if you expect
-    /// your input to have a large vocabulary.
+    /// For performance reasons, this second [`word_tokens`](InternedInput) as
+    /// well as the computed [`diff`](Diff) need to be passed as parameters so
+    /// that they can be re-used when iterating over hunks. Note that word
+    /// tokens are always added but never removed from the interner. Consider
+    /// clearing it if you expect your input to have a large vocabulary.
     ///
     /// # Examples
     ///
Expand Down Expand Up @@ -457,35 +459,99 @@ impl Hunk {
word_tokens: &mut InternedInput<&'a str>,
diff: &mut Diff,
) {
self.granular_diff(words, input, word_tokens, diff)
}

/// Performs a character-diff of the hunk.
///
/// This requires passing the original [`input`](InternedInput) in order to
/// look up the tokens of the current hunk, which typically are lines or
/// words. Each token is split into characters using the built-in [`chars`]
/// tokenizer which simply calls `str::chars` directly. The resulting
/// character tokens are stored in a second [`char_tokens`](InternedInput),
/// and a [`diff`](Diff) is computed on them, with basic post-processing
/// applied.
///
/// For performance reasons, this second [`char_tokens`](InternedInput) as
/// well as the computed [`diff`](Diff) need to be passed as parameters so
/// that they can be re-used when iterating over hunks. Note that character
/// tokens are always added but never removed from the interner. Consider
/// clearing it if you expect your input to have a large alphabet.
///
/// # Examples
///
/// ```
/// # use imara_diff::{InternedInput, Diff, Algorithm};
/// // Compute diff normally
/// let before = "before text";
/// let after = "after text";
/// let mut lines = InternedInput::new(before, after);
/// let mut diff = Diff::compute(Algorithm::Histogram, &lines);
/// diff.postprocess_lines(&lines);
///
/// // Compute char-diff per hunk, reusing allocations across iterations
/// let mut hunk_diff_input = InternedInput::default();
/// let mut hunk_diff = Diff::default();
/// for hunk in diff.hunks() {
/// hunk.char_diff(&lines, &mut hunk_diff_input, &mut hunk_diff);
/// let added = hunk_diff.count_additions();
/// let removed = hunk_diff.count_removals();
/// println!("char-diff of this hunk has {added} additions and {removed} removals");
/// // optionally, clear the interner:
/// hunk_diff_input.clear();
/// }
/// ```
pub fn char_diff(
&self,
input: &InternedInput<&str>,
char_tokens: &mut InternedInput<char>,
diff: &mut Diff,
) {
self.granular_diff(chars, input, char_tokens, diff)
}

/// Performs a granular diff of the hunk based on a given tokenizer. For
/// instance, this can be used to compute a word-diff.
pub fn granular_diff<'a, F, I, T>(
&self,
tokenizer: F,
input: &InternedInput<&'a str>,
diff_input: &mut InternedInput<T>,
diff: &mut Diff,
) where
F: Fn(&'a str) -> I,
I: Iterator<Item = T>,
T: Eq + Hash,
{
         let Hunk { before, after } = self.clone();
-        word_tokens.update_before(
+        diff_input.update_before(
             before
                 .map(|index| input.before[index as usize])
                 .map(|token| input.interner[token])
-                .flat_map(|line| words(line)),
+                .flat_map(&tokenizer),
         );
-        word_tokens.update_after(
+        diff_input.update_after(
             after
                 .map(|index| input.after[index as usize])
                 .map(|token| input.interner[token])
-                .flat_map(|line| words(line)),
+                .flat_map(&tokenizer),
         );
         diff.removed.clear();
-        diff.removed.resize(word_tokens.before.len(), false);
+        diff.removed.resize(diff_input.before.len(), false);
         diff.added.clear();
-        diff.added.resize(word_tokens.after.len(), false);
+        diff.added.resize(diff_input.after.len(), false);
         if self.is_pure_removal() {
             diff.removed.fill(true);
         } else if self.is_pure_insertion() {
             diff.added.fill(true);
         } else {
             diff.compute_with(
                 Algorithm::Myers,
-                &word_tokens.before,
-                &word_tokens.after,
-                word_tokens.interner.num_tokens(),
+                &diff_input.before,
+                &diff_input.after,
+                diff_input.interner.num_tokens(),
             );
-            diff.postprocess_no_heuristic(word_tokens);
+            diff.postprocess_no_heuristic(diff_input);
         }
     }
 }
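
Because `granular_diff` is generic over the tokenizer, the reuse pattern from the `char_diff` example works at any granularity. Below is a minimal usage sketch, not part of the PR itself: it assumes the public API shown in the doc examples above and passes plain `str::split_whitespace` as a stand-in for a custom tokenizer (`word_diff` and `char_diff` are thin wrappers that pass the built-in `words` and `chars` tokenizers in exactly the same way).

```rust
use imara_diff::{Algorithm, Diff, InternedInput};

fn main() {
    // Line-level diff first, as in the doc examples above.
    let lines = InternedInput::new("before text", "after text");
    let mut diff = Diff::compute(Algorithm::Histogram, &lines);
    diff.postprocess_lines(&lines);

    // Re-diff each hunk with a custom tokenizer. Any `Fn(&str) -> I` with
    // `I: Iterator<Item = T>` and `T: Eq + Hash` satisfies the bounds, so a
    // plain function like `str::split_whitespace` can be passed directly.
    let mut hunk_input = InternedInput::default();
    let mut hunk_diff = Diff::default();
    for hunk in diff.hunks() {
        hunk.granular_diff(str::split_whitespace, &lines, &mut hunk_input, &mut hunk_diff);
        println!(
            "{} additions, {} removals",
            hunk_diff.count_additions(),
            hunk_diff.count_removals()
        );
        // As with word_diff/char_diff, the interner only ever grows; consider
        // hunk_input.clear() here if the vocabulary is large.
    }
}
```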
19 changes: 19 additions & 0 deletions src/sources.rs
@@ -25,6 +25,11 @@ pub fn words(data: &str) -> Words<'_> {
     Words(data)
 }
 
+/// Returns a [`TokenSource`] that uses the characters in `data` as Tokens
+pub fn chars(data: &str) -> impl Iterator<Item = char> + Clone + '_ {
+    data.chars()
+}
+
 /// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
 /// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
 /// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
@@ -139,6 +144,20 @@ impl<'a> TokenSource for Words<'a> {
     }
 }
 
+impl<'a> TokenSource for std::str::Chars<'a> {
+    type Token = char;
+
+    type Tokenizer = Self;
+
+    fn tokenize(&self) -> Self::Tokenizer {
+        self.clone()
+    }
+
+    fn estimate_tokens(&self) -> u32 {
+        self.as_str().len() as u32
> Review comment from the PR author on `estimate_tokens`:
>
> There might be a better heuristic than this. Perhaps we should do
>
> Suggested change:
> -        self.as_str().len() as u32
> +        256.min(self.as_str().len() as u32)
>
> instead?
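
For context: `estimate_tokens` presumably serves only as a pre-allocation hint, so over-estimating costs memory rather than correctness, and the byte length is always an upper bound on the number of `char` tokens, since every `char` occupies at least one byte in UTF-8. A minimal illustration of that bound (not part of the PR):

```rust
fn main() {
    // "é" encodes as two bytes in UTF-8, so byte length >= char count.
    assert_eq!("héllo".len(), 6);
    assert_eq!("héllo".chars().count(), 5);
}
```

The suggested `256.min(...)` would merely cap the up-front reservation for very large inputs.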

+    }
+}
+
 /// A [`TokenSource`] that returns the lines of a byte slice as tokens. See [`byte_lines`]
 /// for details.
 #[derive(Clone, Copy, PartialEq, Eq)]