diff --git a/Cargo.toml b/Cargo.toml
index 426210c..9546f4e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.10.0"
 authors = ["Danny Guo "]
 description = """
 Implementations of string similarity metrics. Includes Hamming, Levenshtein,
-OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice.
+OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, Sørensen-Dice, and a normalized LCS metric.
 """
 license = "MIT"
 readme = "README.md"
diff --git a/README.md b/README.md
index d8c9780..0482eed 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@
 - [Damerau-Levenshtein] - distance & normalized
 - [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length
 - [Sørensen-Dice]
+- [LCS based algorithm] - normalized; computes the longest common subsequence length in O(n * m) time and O(min(n, m)) memory
 
 The normalized versions return values between `0.0` and `1.0`, where `1.0` means an exact match.
 
@@ -39,7 +40,7 @@ extern crate strsim;
 
 use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance,
              damerau_levenshtein, normalized_damerau_levenshtein, jaro,
-             jaro_winkler, sorensen_dice};
+             jaro_winkler, sorensen_dice, lcs_normalized};
 
 fn main() {
     match hamming("hamming", "hammers") {
@@ -66,6 +67,8 @@ fn main() {
 
     assert_eq!(sorensen_dice("web applications", "applications of the web"),
                0.7878787878787878);
+
+    assert!(lcs_normalized("foobar", "ofobar") > 0.8);
 }
 ```
@@ -99,4 +102,5 @@ Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`.
 [Hamming]:http://en.wikipedia.org/wiki/Hamming_distance
 [Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance
 [Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
+[LCS based algorithm]:https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
 [Docker]:https://docs.docker.com/engine/installation/
diff --git a/benches/benches.rs b/benches/benches.rs
index b7ba829..acd8c64 100644
--- a/benches/benches.rs
+++ b/benches/benches.rs
@@ -97,4 +97,13 @@ mod benches {
             strsim::sorensen_dice(&a, &b);
         })
     }
+
+    #[bench]
+    fn bench_lcs_normalized(bencher: &mut Bencher) {
+        let a = "Philosopher Friedrich Nietzsche";
+        let b = "Philosopher Jean-Paul Sartre";
+        bencher.iter(|| {
+            strsim::lcs_normalized(&a, &b);
+        })
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 7a85935..2ff3286 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -464,6 +464,61 @@ pub fn sorensen_dice(a: &str, b: &str) -> f64 {
     (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64
 }
 
+/// Calculates a normalized score of how similar two strings are, defined as
+/// the length of their longest common subsequence (LCS) divided by the
+/// length of the longer string. Returns a value between `0.0` and `1.0`,
+/// where `1.0` means an exact match.
+///
+/// ```
+/// use strsim::lcs_normalized;
+///
+/// assert_eq!(1.0, lcs_normalized("", ""));
+/// assert_eq!(0.0, lcs_normalized("", "umbrella"));
+/// assert_eq!(0.8, lcs_normalized("night", "fight"));
+/// assert_eq!(1.0, lcs_normalized("ferris", "ferris"));
+/// ```
+pub fn lcs_normalized(left: impl AsRef<str>, right: impl AsRef<str>) -> f64 {
+    let (left, right) = (left.as_ref(), right.as_ref());
+    // Normalize by char count rather than byte length so that multibyte
+    // strings are scored correctly
+    let longer = max(left.chars().count(), right.chars().count());
+    // Empty strings should match
+    if longer == 0 {
+        return 1.0;
+    }
+    lcs_length(left, right) as f64 / longer as f64
+}
+
+#[inline]
+fn get_shorter_longer_strings<'a>(left: &'a str, right: &'a str) -> (&'a str, &'a str) {
+    if left.chars().count() < right.chars().count() {
+        (left, right)
+    } else {
+        (right, left)
+    }
+}
+
+#[inline]
+fn lcs_length(left: &str, right: &str) -> usize {
+    // Standard LCS dynamic programming, keeping only two rows sized by the
+    // shorter string: O(n * m) time, O(min(n, m)) memory
+    let (shorter, longer) = get_shorter_longer_strings(left, right);
+    let shorter: Vec<char> = shorter.chars().collect();
+    let mut prev = vec![0usize; shorter.len() + 1];
+    let mut curr = vec![0usize; shorter.len() + 1];
+    for rletter in longer.chars() {
+        for (col, &lletter) in shorter.iter().enumerate() {
+            curr[col + 1] = if rletter == lletter {
+                prev[col] + 1
+            } else {
+                max(prev[col + 1], curr[col])
+            };
+        }
+        std::mem::swap(&mut prev, &mut curr);
+    }
+    prev[shorter.len()]
+}
 
 #[cfg(test)]
 mod tests {
@@ -989,4 +1044,72 @@ mod tests {
             sorensen_dice("this has one extra word", "this has one word")
         );
     }
+
+    #[test]
+    fn lcs_normalized_diff_unequal_length() {
+        assert!(lcs_normalized("damerau", "aderuaxyz") < 0.5);
+    }
+
+    #[test]
+    fn lcs_normalized_diff_unequal_length_reversed() {
+        assert!(lcs_normalized("aderuaxyz", "damerau") < 0.5);
+    }
+
+    #[test]
+    fn lcs_normalized_diff_comedians() {
+        assert!(lcs_normalized("Stewart", "Colbert") < 0.5);
+    }
+
+    #[test]
+    fn lcs_normalized_many_transpositions() {
+        assert!(lcs_normalized("abcdefghijkl", "bacedfgihjlk") < 0.7);
+    }
+
+    #[test]
+    fn lcs_normalized_diff_longer() {
+        let a = "The quick brown fox jumped over the angry dog.";
+        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
+        assert!(lcs_normalized(a, b) < 0.4);
+    }
+
+    #[test]
+    fn lcs_normalized_beginning_transposition() {
+        assert!(lcs_normalized("foobar", "ofobar") > 0.8);
+    }
+
+    #[test]
+    fn lcs_normalized_end_transposition() {
+        assert!(lcs_normalized("specter", "spectre") > 0.8);
+    }
+
+    #[test]
+    fn lcs_normalized_unrestricted_edit() {
+        assert!(lcs_normalized("a cat", "an abct") > 0.5);
+    }
+
+    #[test]
+    fn lcs_normalized_diff_short() {
+        // LCS of "levenshtein" and "löwenbräu" is "len": 3 of 11 chars
+        assert!(lcs_normalized("levenshtein", "löwenbräu") < 0.3);
+    }
+
+    #[test]
+    fn lcs_normalized_for_empty_strings() {
+        assert!(lcs_normalized("", "") > 0.99);
+    }
+
+    #[test]
+    fn lcs_normalized_first_empty() {
+        assert!(lcs_normalized("", "flower") < 0.01);
+    }
+
+    #[test]
+    fn lcs_normalized_second_empty() {
+        assert!(lcs_normalized("tree", "") < 0.01);
+    }
+
+    #[test]
+    fn lcs_normalized_identical_strings() {
+        assert!(lcs_normalized("sunglasses", "sunglasses") > 0.99);
+    }
 }
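A note on the complexity claims in this patch: the DP keeps only two rows, each sized by the shorter string, which is where the O(min(n, m)) memory bound comes from. The standalone sketch below is not part of the patch — the names `lcs_length_full_table` and `lcs_length_two_rows` are made up for illustration — and cross-checks the two-row variant against the textbook full-table version, including a multibyte-character input:

```rust
use std::cmp::max;

/// Textbook LCS: a full (n + 1) x (m + 1) table, O(n * m) memory.
fn lcs_length_full_table(a: &str, b: &str) -> usize {
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    let mut table = vec![vec![0usize; b.len() + 1]; a.len() + 1];
    for i in 0..a.len() {
        for j in 0..b.len() {
            table[i + 1][j + 1] = if a[i] == b[j] {
                table[i][j] + 1
            } else {
                max(table[i][j + 1], table[i + 1][j])
            };
        }
    }
    table[a.len()][b.len()]
}

/// Two-row variant, as in the patch: row i + 1 of the table depends only on
/// row i, so two buffers sized by the shorter string suffice: O(min(n, m)).
fn lcs_length_two_rows(a: &str, b: &str) -> usize {
    let (shorter, longer) = if a.chars().count() <= b.chars().count() {
        (a, b)
    } else {
        (b, a)
    };
    let shorter: Vec<char> = shorter.chars().collect();
    let mut prev = vec![0usize; shorter.len() + 1];
    let mut curr = vec![0usize; shorter.len() + 1];
    for c in longer.chars() {
        for (col, &s) in shorter.iter().enumerate() {
            curr[col + 1] = if c == s {
                prev[col] + 1
            } else {
                max(prev[col + 1], curr[col])
            };
        }
        // The finished row becomes the previous row; its old contents are
        // fully overwritten on the next pass, so no reallocation is needed.
        std::mem::swap(&mut prev, &mut curr);
    }
    prev[shorter.len()]
}

fn main() {
    let cases = [
        ("night", "fight"),           // LCS "ight" -> 4
        ("foobar", "ofobar"),         // LCS "fobar" -> 5
        ("levenshtein", "löwenbräu"), // multibyte chars, LCS "len" -> 3
        ("", "umbrella"),             // empty input -> 0
    ];
    for (a, b) in cases {
        assert_eq!(lcs_length_full_table(a, b), lcs_length_two_rows(a, b));
        println!("lcs({:?}, {:?}) = {}", a, b, lcs_length_two_rows(a, b));
    }
}
```

With char-based indexing, the "löwenbräu" case returns 3 from both variants; sizing and reading the rows by byte length instead would leave the tail of the row untouched for multibyte strings, which is the bug the revised `lcs_length` in the patch avoids.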