108 changes: 87 additions & 21 deletions src/lib.rs
@@ -142,11 +142,12 @@
 //! );
 //! ```
 
+use std::hash::Hash;
 use std::ops::Range;
 use std::slice;
 
 use crate::{
-    sources::words,
+    sources::{chars, words},
     util::{strip_common_postfix, strip_common_prefix},
 };

@@ -416,17 +417,18 @@ impl Hunk {
 
     /// Performs a word-diff on this hunk.
    ///
-    /// This requires passing the original [`input`](InternedInput) in order to look up
-    /// the tokens of the current hunk, which typically are lines.
-    /// Each token is split into words using the built-in [`words`] tokenizer.
-    /// The resulting word tokens are stored in a second [`diff_input`](InternedInput),
-    /// and a [`diff`](Diff) is computed on them, with basic post-processing applied.
+    /// This requires passing the original [`input`](InternedInput) in order to
+    /// look up the tokens of the current hunk, which typically are lines. Each
+    /// token is split into words using the built-in [`words`] tokenizer. The
+    /// resulting word tokens are stored in a second
+    /// [`word_tokens`](InternedInput), and a [`diff`](Diff) is computed on
+    /// them, with basic post-processing applied.
     ///
-    /// For performance reasons, this second [`diff_input`](InternedInput) as well as
-    /// the computed [`diff`](Diff) need to be passed as parameters so that they can be
-    /// re-used when iterating over hunks. Note that word tokens are always
-    /// added but never removed from the interner. Consider clearing it if you expect
-    /// your input to have a large vocabulary.
+    /// For performance reasons, this second [`word_tokens`](InternedInput) as
+    /// well as the computed [`diff`](Diff) need to be passed as parameters so
+    /// that they can be re-used when iterating over hunks. Note that word
+    /// tokens are always added but never removed from the interner. Consider
+    /// clearing it if you expect your input to have a large vocabulary.
     ///
     /// # Examples
     ///
Expand Down Expand Up @@ -457,35 +459,99 @@ impl Hunk {
word_tokens: &mut InternedInput<&'a str>,
diff: &mut Diff,
) {
self.granular_diff(words, input, word_tokens, diff)
}

/// Performs a character-diff of the hunk.
///
/// This requires passing the original [`input`](InternedInput) in order to
/// look up the tokens of the current hunk, which typically are lines or
/// words. Each token is split into characters using the built-in [`chars`]
/// tokenizer which simply calls `str::chars` directly. The resulting
/// character tokens are stored in a second [`char_tokens`](InternedInput),
/// and a [`diff`](Diff) is computed on them, with basic post-processing
/// applied.
///
/// For performance reasons, this second [`char_tokens`](InternedInput) as
/// well as the computed [`diff`](Diff) need to be passed as parameters so
/// that they can be re-used when iterating over hunks. Note that character
/// tokens are always added but never removed from the interner. Consider
/// clearing it if you expect your input to have a large alphabet.
///
/// # Examples
///
/// ```
/// # use imara_diff::{InternedInput, Diff, Algorithm};
/// // Compute diff normally
/// let before = "before text";
/// let after = "after text";
/// let mut lines = InternedInput::new(before, after);
/// let mut diff = Diff::compute(Algorithm::Histogram, &lines);
/// diff.postprocess_lines(&lines);
///
/// // Compute char-diff per hunk, reusing allocations across iterations
/// let mut hunk_diff_input = InternedInput::default();
/// let mut hunk_diff = Diff::default();
/// for hunk in diff.hunks() {
/// hunk.char_diff(&lines, &mut hunk_diff_input, &mut hunk_diff);
/// let added = hunk_diff.count_additions();
/// let removed = hunk_diff.count_removals();
/// println!("char-diff of this hunk has {added} additions and {removed} removals");
/// // optionally, clear the interner:
/// hunk_diff_input.clear();
/// }
/// ```
pub fn char_diff(
&self,
input: &InternedInput<&str>,
char_tokens: &mut InternedInput<char>,
diff: &mut Diff,
) {
self.granular_diff(chars, input, char_tokens, diff)
}

/// Performs a granular diff of the hunk based on a given tokenizer. For
/// instance, this can be used to compute a word-diff.
pub fn granular_diff<'a, F, I, T>(
&self,
tokenizer: F,
input: &InternedInput<&'a str>,
diff_input: &mut InternedInput<T>,
diff: &mut Diff,
) where
F: Fn(&'a str) -> I,
I: Iterator<Item = T>,
T: Eq + Hash,
{
         let Hunk { before, after } = self.clone();
-        word_tokens.update_before(
+        diff_input.update_before(
             before
                 .map(|index| input.before[index as usize])
                 .map(|token| input.interner[token])
-                .flat_map(|line| words(line)),
+                .flat_map(&tokenizer),
         );
-        word_tokens.update_after(
+        diff_input.update_after(
             after
                 .map(|index| input.after[index as usize])
                 .map(|token| input.interner[token])
-                .flat_map(|line| words(line)),
+                .flat_map(&tokenizer),
         );
         diff.removed.clear();
-        diff.removed.resize(word_tokens.before.len(), false);
+        diff.removed.resize(diff_input.before.len(), false);
         diff.added.clear();
-        diff.added.resize(word_tokens.after.len(), false);
+        diff.added.resize(diff_input.after.len(), false);
         if self.is_pure_removal() {
             diff.removed.fill(true);
         } else if self.is_pure_insertion() {
             diff.added.fill(true);
         } else {
             diff.compute_with(
                 Algorithm::Myers,
-                &word_tokens.before,
-                &word_tokens.after,
-                word_tokens.interner.num_tokens(),
+                &diff_input.before,
+                &diff_input.after,
+                diff_input.interner.num_tokens(),
             );
-            diff.postprocess_no_heuristic(word_tokens);
+            diff.postprocess_no_heuristic(diff_input);
         }
     }
 }
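
Because `granular_diff` is generic over the tokenizer, the reuse pattern from the `char_diff` example works at any granularity. Below is a minimal usage sketch, not part of the PR itself: it assumes the public API shown in the doc examples above and passes plain `str::split_whitespace` as a stand-in for a custom tokenizer (`word_diff` and `char_diff` are thin wrappers that pass the built-in `words` and `chars` tokenizers in exactly the same way).

```rust
use imara_diff::{Algorithm, Diff, InternedInput};

fn main() {
    // Line-level diff first, as in the doc examples above.
    let lines = InternedInput::new("before text", "after text");
    let mut diff = Diff::compute(Algorithm::Histogram, &lines);
    diff.postprocess_lines(&lines);

    // Re-diff each hunk with a custom tokenizer. Any `Fn(&str) -> I` with
    // `I: Iterator<Item = T>` and `T: Eq + Hash` satisfies the bounds, so a
    // plain function like `str::split_whitespace` can be passed directly.
    let mut hunk_input = InternedInput::default();
    let mut hunk_diff = Diff::default();
    for hunk in diff.hunks() {
        hunk.granular_diff(str::split_whitespace, &lines, &mut hunk_input, &mut hunk_diff);
        println!(
            "{} additions, {} removals",
            hunk_diff.count_additions(),
            hunk_diff.count_removals()
        );
        // As with word_diff/char_diff, the interner only ever grows; consider
        // hunk_input.clear() here if the vocabulary is large.
    }
}
```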
19 changes: 19 additions & 0 deletions src/sources.rs
@@ -25,6 +25,11 @@ pub fn words(data: &str) -> Words<'_> {
     Words(data)
 }
 
+/// Returns a [`TokenSource`] that uses the characters in `data` as Tokens
+pub fn chars(data: &str) -> impl Iterator<Item = char> + Clone + '_ {
+    data.chars()
+}
+
 /// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline
 /// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing
 /// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is
@@ -139,6 +144,20 @@ impl<'a> TokenSource for Words<'a> {
     }
 }
 
+impl<'a> TokenSource for std::str::Chars<'a> {
+    type Token = char;
+
+    type Tokenizer = Self;
+
+    fn tokenize(&self) -> Self::Tokenizer {
+        self.clone()
+    }
+
+    fn estimate_tokens(&self) -> u32 {
+        self.as_str().len() as u32
> Review comment from the PR author on `estimate_tokens`:
>
> There might be a better heuristic than this. Perhaps we should do
>
> Suggested change:
> -        self.as_str().len() as u32
> +        256.min(self.as_str().len() as u32)
>
> instead?
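
For context: `estimate_tokens` presumably serves only as a pre-allocation hint, so over-estimating costs memory rather than correctness, and the byte length is always an upper bound on the number of `char` tokens, since every `char` occupies at least one byte in UTF-8. A minimal illustration of that bound (not part of the PR):

```rust
fn main() {
    // "é" encodes as two bytes in UTF-8, so byte length >= char count.
    assert_eq!("héllo".len(), 6);
    assert_eq!("héllo".chars().count(), 5);
}
```

The suggested `256.min(...)` would merely cap the up-front reservation for very large inputs.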

+    }
+}
+
 /// A [`TokenSource`] that returns the lines of a byte slice as tokens. See [`byte_lines`]
 /// for details.
 #[derive(Clone, Copy, PartialEq, Eq)]