Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "distance"
version = "0.4.0"
version = "0.4.1"
authors = ["Marcus Brummer <mbrlabs7@gmail.com>"]

description = "A collection of approximate string matching algorithms"
Expand All @@ -18,4 +18,4 @@ include = [
"AUTHORS",
"CONTRIBUTORS",
"CHANGES",
]
]
20 changes: 11 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
# distance
[![](https://travis-ci.org/mbrlabs/distance.svg?branch=master)](https://travis-ci.org/mbrlabs/distance)
# distance
[![](https://travis-ci.org/mbrlabs/distance.svg?branch=master)](https://travis-ci.org/mbrlabs/distance)
[![](https://img.shields.io/crates/v/distance.svg)](https://crates.io/crates/distance)
[![](https://img.shields.io/badge/docs-v0.4.0-blue.svg)](https://mbrlabs.github.io/distance)

This is a rust library for approximate string matching algorithms.
This is a rust library for approximate string matching algorithms.
Possible applications for this are fuzzy string searching, spell checkers, spam filters, etc.

It can be used in `no_std` environments, provided that `alloc` is available.

## Algorithms
All algorithms support UTF-8 encoded strings.

- [Levenshtein distance (since v0.1)](https://en.wikipedia.org/wiki/Levenshtein_distance)
- [Levenshtein distance (since v0.1)](https://en.wikipedia.org/wiki/Levenshtein_distance)
- [Hamming distance (since v0.2)](https://en.wikipedia.org/wiki/Hamming_distance)
- [Damerau Levenshtein distance (since v0.3)](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
- [Sift3 distance (since v0.4)](http://siderite.blogspot.com/2007/04/super-fast-and-accurate-string-distance.html)
Expand All @@ -24,21 +26,21 @@ distance = "0.4"

## Usage
```rust
use distance::*;
use distance::*;

// Levenshtein distance
let distance = levenshtein("hannah", "hanna");
let distance = levenshtein("hannah", "hanna");
assert_eq!(1, distance);

// Damerau Levenshtein distance
let distance = damerau_levenshtein("hannah", "hannha");
let distance = damerau_levenshtein("hannah", "hannha");
assert_eq!(1, distance);

// Hamming distance
let distance = hamming("karolin", "kathrin").unwrap();
let distance = hamming("karolin", "kathrin").unwrap();
assert_eq!(3, distance);

// Sift3 distance
let distance = sift3("hannah", "hanna");
assert_eq!(0.5, distance);
```
```
51 changes: 25 additions & 26 deletions src/damerau_levenshtein.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::HashMap;
use std::char;

use crate::prelude::*;
use alloc::collections::BTreeMap;
use core::char;
use utils;

/// Calculates the Damerau-Levenshtein distance between two strings.
///
///
/// # Damerau-Levenshtein distance
/// The [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) is the number of per-character changes
/// The [Damerau-Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) is the number of per-character changes
/// (insertion, deletion, substitution & transposition) that are necessary to convert one string into another.
/// The original Levenshtein distance does not take transposition into account.
/// This implementation does fully support unicode strings.
///
/// ## Complexity
/// m := len(s) + 2
/// n := len(t) + 2
///
/// Time complexity: O(mn)
/// Space complexity: O(mn + m)
/// ## Complexity
/// m := len(s) + 2
/// n := len(t) + 2
///
/// Time complexity: O(mn)
/// Space complexity: O(mn + m)
///
/// ## Examples
/// ```
/// use distance::*;
///
///
/// // Damerau-Levenshtein distance
/// let distance = damerau_levenshtein("hannah", "hannha");
/// let distance = damerau_levenshtein("hannah", "hannha");
/// assert_eq!(1, distance);
/// ```
///
Expand All @@ -51,30 +51,30 @@ pub fn damerau_levenshtein(s: &str, t: &str) -> usize {
let mut mat: Vec<Vec<usize>> = vec![vec![0; len_t + 2]; len_s + 2];
mat[0][0] = max_distance;
for i in 0..(len_s + 1) {
mat[i+1][0] = max_distance;
mat[i+1][1] = i;
mat[i + 1][0] = max_distance;
mat[i + 1][1] = i;
}
for i in 0..(len_t + 1) {
mat[0][i+1] = max_distance;
mat[1][i+1] = i;
mat[0][i + 1] = max_distance;
mat[1][i + 1] = i;
}

let mut char_map: HashMap<char, usize> = HashMap::new();
let mut char_map: BTreeMap<char, usize> = BTreeMap::new();
// apply edit operations
for (i, s_char) in s.chars().enumerate() {
let mut db = 0;
let i = i + 1;

for (j, t_char) in t.chars().enumerate() {
let j = j + 1;
let last = *char_map.get(&t_char).unwrap_or(&0);

let cost = if s_char == t_char { 0 } else { 1 };
mat[i+1][j+1] = utils::min4(
mat[i+1][j] + 1, // deletion
mat[i][j+1] + 1, // insertion
mat[i][j] + cost, // substitution
mat[last][db] + (i - last - 1) + 1 + (j - db - 1) // transposition
mat[i + 1][j + 1] = utils::min4(
mat[i + 1][j] + 1, // deletion
mat[i][j + 1] + 1, // insertion
mat[i][j] + cost, // substitution
mat[last][db] + (i - last - 1) + 1 + (j - db - 1), // transposition
);

// that's like s_char == t_char but more efficient
Expand Down Expand Up @@ -133,7 +133,7 @@ mod tests {
#[test]
fn lorem() {
assert_eq!(80, damerau_levenshtein(
"Lorem ipsum dolor sit amet, autem mucius eirmod ea per. Nec adhuc laudem id, vix dolor interesset ea.",
"Lorem ipsum dolor sit amet, autem mucius eirmod ea per. Nec adhuc laudem id, vix dolor interesset ea.",
"Id mundi ponderum constituam nam. Nam in legendos definiebas. Pri commune senserit omittantur cu.")
);
}
Expand All @@ -145,5 +145,4 @@ mod tests {
assert_eq!(3, damerau_levenshtein("", "öঙ香"));
assert_eq!(1, damerau_levenshtein("よさ", "äさ"));
}

}
36 changes: 18 additions & 18 deletions src/levenshtein.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,29 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::prelude::*;
use utils;

/// Calculates the Levenshtein distance between two strings.
///
///
/// # Levenshtein distance
/// The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) is the number of per-character changes (insertion, deletion & substitution)
/// that are necessary to convert one string into another.
/// This implementation does fully support unicode strings.
///
/// ## Complexity
/// m := len(s) + 1
/// n := len(t) + 1
/// ## Complexity
/// m := len(s) + 1
/// n := len(t) + 1
///
/// Time complexity: O(mn)
/// Time complexity: O(mn)
/// Space complexity: O(mn)
///
/// ## Examples
/// ```
/// use distance::*;
///
///
/// // Levenshtein distance
/// let distance = levenshtein("hannah", "hanna");
/// let distance = levenshtein("hannah", "hanna");
/// assert_eq!(1, distance);
/// ```
///
Expand All @@ -44,21 +45,21 @@ pub fn levenshtein(s: &str, t: &str) -> usize {

// initialize the matrix
let mut mat: Vec<Vec<usize>> = vec![vec![0; len_t + 1]; len_s + 1];
for i in 1..(len_s + 1) {
mat[i][0] = i;
for i in 1..(len_s + 1) {
mat[i][0] = i;
}
for i in 1..(len_t + 1) {
mat[0][i] = i;
for i in 1..(len_t + 1) {
mat[0][i] = i;
}

// apply edit operations
for (i, s_char) in s.chars().enumerate() {
for (j, t_char) in t.chars().enumerate() {
let substitution = if s_char == t_char {0} else {1};
mat[i+1][j+1] = utils::min3(
mat[i][j+1] + 1, // deletion
mat[i+1][j] + 1, // insertion
mat[i][j] + substitution // substitution
let substitution = if s_char == t_char { 0 } else { 1 };
mat[i + 1][j + 1] = utils::min3(
mat[i][j + 1] + 1, // deletion
mat[i + 1][j] + 1, // insertion
mat[i][j] + substitution, // substitution
);
}
}
Expand Down Expand Up @@ -106,5 +107,4 @@ mod tests {
assert_eq!(4, levenshtein("こんにちは", "こんにちは abc"));
assert_eq!(1, levenshtein("༆༃ʘ", "༆˥ʘ"));
}

}
}
26 changes: 17 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,35 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//! # A collection of approximate string matching algorithms
//! # A collection of approximate string matching algorithms
//!
//! This library contains algorithms dealing with [approximate string matching](https://en.wikipedia.org/wiki/Approximate_string_matching).
//! These algorithms can be used to tell the approximate difference between two
//! These algorithms can be used to tell the approximate difference between two
//! strings. This is useful for a variety of things like spell checking, fuzzy search, etc.
//!
//!
//! ## Algorithms
//! - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
//! - [Damerau–Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
//! - [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
//! - [Sift3 distance](http://siderite.blogspot.com/2007/04/super-fast-and-accurate-string-distance.html)
//!
pub use self::levenshtein::*;
#![no_std]
#[macro_use]
extern crate alloc;

pub use self::damerau_levenshtein::*;
pub use self::errors::*;
pub use self::hamming::*;
pub use self::levenshtein::*;
pub use self::sift3::*;
pub use self::errors::*;

mod damerau_levenshtein;
mod errors;
mod hamming;
mod levenshtein;
mod sift3;
mod hamming;
mod errors;
mod damerau_levenshtein;
mod utils;
mod utils;

/// Crate-internal prelude: re-exports `Vec` from the `alloc` crate so that the
/// algorithm modules of this `no_std` library can bring it into scope with a
/// single `use crate::prelude::*;`.
mod prelude {
pub use alloc::vec::Vec;
}
26 changes: 13 additions & 13 deletions src/sift3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::prelude::*;

/// Calculates the sift3 distance between two strings with a default max_distance of 5.
///
///
/// # Sift3
/// [Sift3](http://siderite.blogspot.com/2007/04/super-fast-and-accurate-string-distance.html) - super fast and accurate string distance algorithm.
/// The higher the return value, the more different the two strings are.
/// The higher the return value, the more different the two strings are.
/// A value of 0.0 means both strings are equal.
///
/// This implementation does fully support unicode strings.
///
/// ## Examples
/// ```
/// use distance::*;
///
///
/// // Sift3 distance
/// let distance = sift3("hannah", "hanna");
/// let distance = sift3("hannah", "hanna");
/// assert_eq!(0.5, distance);
/// ```
///
Expand All @@ -47,8 +49,8 @@ fn sift3_offset(s: &str, t: &str, max_offset: usize) -> f32 {
return len_t as f32;
}
}
if len_t == 0 {
return len_s as f32;
if len_t == 0 {
return len_s as f32;
}

let sv: Vec<char> = s.chars().collect();
Expand All @@ -68,7 +70,7 @@ fn sift3_offset(s: &str, t: &str, max_offset: usize) -> f32 {
if (c + i < len_s) && sv[c + i] == tv[c] {
offset1 = i;
break;
}
}

if (c + i < len_t) && (sv[c] == tv[c + i]) {
offset2 = i;
Expand All @@ -93,7 +95,7 @@ mod tests {
assert_eq!(2.0, sift3("book", "back"));
assert_eq!(4.5, sift3("table", "dinner"));
assert_eq!(2.0, sift3("person", "pardon"));
assert_eq!(0.5, sift3("person", "persons"));
assert_eq!(0.5, sift3("person", "persons"));
}

#[test]
Expand All @@ -109,7 +111,6 @@ mod tests {
assert_eq!(1.0, sift3("World", "world"));
}


#[test]
fn empty() {
assert_eq!(4.0, sift3("book", ""));
Expand All @@ -124,14 +125,13 @@ mod tests {
assert_eq!(1.0, sift3("さようなら", "さようなう"));
assert_eq!(2.0, sift3("こんにちは", "こんにちは abc"));
assert_eq!(1.0, sift3("༆༃ʘ", "༆˥ʘ"));
}
}

#[test]
fn lorem() {
assert_eq!(93.0, sift3(
"Lorem ipsum dolor sit amet, autem mucius eirmod ea per. Nec adhuc laudem id, vix dolor interesset ea.",
"Lorem ipsum dolor sit amet, autem mucius eirmod ea per. Nec adhuc laudem id, vix dolor interesset ea.",
"Id mundi ponderum constituam nam. Nam in legendos definiebas. Pri commune senserit omittantur cu.")
);
}

}
}
8 changes: 4 additions & 4 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::cmp::min;
use core::cmp::min;

/// Returns the smallest of the three given `usize` values.
///
/// Used by the edit-distance routines to pick the cheapest of three
/// candidate operations.
#[inline(always)]
pub fn min3(a: usize, b: usize, c: usize) -> usize {
    // Single tail expression; the duplicated (unreachable) `return` is gone.
    min(min(a, b), c)
}

/// Returns the smallest of the four given `usize` values.
///
/// Used by the Damerau-Levenshtein routine to pick the cheapest of four
/// candidate operations.
#[inline(always)]
pub fn min4(a: usize, b: usize, c: usize, d: usize) -> usize {
    // Single tail expression; the stray duplicate `return`/`}` after the
    // function body has been removed so the item is well-formed.
    min(min(min(a, b), c), d)
}