From 59327eff8b66f47ac5275c1a70704bb2598399e5 Mon Sep 17 00:00:00 2001 From: Ph0enixKM Date: Tue, 9 Aug 2022 16:57:33 +0200 Subject: [PATCH 1/3] feat: add lcs based solution --- Cargo.toml | 5 ++- README.md | 6 +++- benches/benches.rs | 9 +++++ src/lib.rs | 84 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 426210c..9a33d9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ version = "0.10.0" authors = ["Danny Guo "] description = """ Implementations of string similarity metrics. Includes Hamming, Levenshtein, -OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice. +OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, Sørensen-Dice and LCS based algorithm. """ license = "MIT" readme = "README.md" @@ -13,3 +13,6 @@ homepage = "https://github.com/dguo/strsim-rs" repository = "https://github.com/dguo/strsim-rs" documentation = "https://docs.rs/strsim/" exclude = ["/.github", "/dev"] + +[dependencies] +similar-string = "1.4.3" diff --git a/README.md b/README.md index d8c9780..6b72a4f 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ - [Damerau-Levenshtein] - distance & normalized - [Jaro and Jaro-Winkler] - this implementation of Jaro-Winkler does not limit the common prefix length - [Sørensen-Dice] + - [LCS based algorithm] - this algorithm uses LCS length finding variant with O(n * m) time complexity and O(min(n, m)) memory complexity The normalized versions return values between `0.0` and `1.0`, where `1.0` means an exact match. 
@@ -39,7 +40,7 @@ extern crate strsim; use strsim::{hamming, levenshtein, normalized_levenshtein, osa_distance, damerau_levenshtein, normalized_damerau_levenshtein, jaro, - jaro_winkler, sorensen_dice}; + jaro_winkler, sorensen_dice, lcs_normalized}; fn main() { match hamming("hamming", "hammers") { @@ -66,6 +67,8 @@ fn main() { assert_eq!(sorensen_dice("web applications", "applications of the web"), 0.7878787878787878); + + assert!(sorensen_dice("foobar", "ofobar") > 0.8); } ``` @@ -99,4 +102,5 @@ Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`. [Hamming]:http://en.wikipedia.org/wiki/Hamming_distance [Optimal string alignment]:https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance [Sørensen-Dice]:http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient +[LCS based algorithm]:https://en.wikipedia.org/wiki/Longest_common_subsequence_problem [Docker]:https://docs.docker.com/engine/installation/ diff --git a/benches/benches.rs b/benches/benches.rs index b7ba829..acd8c64 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -97,4 +97,13 @@ mod benches { strsim::sorensen_dice(&a, &b); }) } + + #[bench] + fn bench_lcs_normalized(bencher: &mut Bencher) { + let a = "Philosopher Friedrich Nietzsche"; + let b = "Philosopher Jean-Paul Sartre"; + bencher.iter(|| { + strsim::lcs_normalized(&a, &b); + }) + } } diff --git a/src/lib.rs b/src/lib.rs index 7a85935..727e44d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,8 @@ #![forbid(unsafe_code)] +extern crate similar_string; + use std::char; use std::cmp::{max, min}; use std::collections::HashMap; @@ -10,6 +12,8 @@ use std::fmt::{self, Display, Formatter}; use std::hash::Hash; use std::str::Chars; +use similar_string::compare_similarity; + #[derive(Debug, PartialEq)] pub enum StrSimError { DifferentLengthArgs, @@ -464,6 +468,19 @@ pub fn sorensen_dice(a: &str, b: &str) -> f64 { (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 } +/// 
Uses LCS algorithm to find longest common subsequence +/// and then divides it by the length of the longest string +/// ``` +/// use strsim::lcs_normalized; +/// +/// assert_eq!(1.0, lcs_normalized("", "")); +/// assert_eq!(0.0, lcs_normalized("", "umbrella")); +/// assert_eq!(0.8, lcs_normalized("night", "fight")); +/// assert_eq!(1.0, lcs_normalized("ferris", "ferris")); +/// ``` +pub fn lcs_normalized(a: &str, b: &str) -> f64 { + compare_similarity(a, b) +} #[cfg(test)] mod tests { @@ -989,4 +1006,71 @@ mod tests { sorensen_dice("this has one extra word", "this has one word") ); } + + #[test] + fn lcs_normalized_diff_unequal_length() { + assert!(compare_similarity("damerau", "aderuaxyz") < 0.5); + } + + #[test] + fn lcs_normalized_diff_unequal_length_reversed() { + assert!(compare_similarity("aderuaxyz", "damerau") < 0.5); + } + + #[test] + fn lcs_normalized_diff_comedians() { + assert!(compare_similarity("Stewart", "Colbert") < 0.5); + } + + #[test] + fn lcs_normalized_many_transpositions() { + assert!(compare_similarity("abcdefghijkl", "bacedfgihjlk") < 0.7); + } + + #[test] + fn lcs_normalized_diff_longer() { + let a = "The quick brown fox jumped over the angry dog."; + let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; + assert!(compare_similarity(a, b) < 0.4); + } + + #[test] + fn lcs_normalized_beginning_transposition() { + assert!(compare_similarity("foobar", "ofobar") > 0.8); + } + + #[test] + fn lcs_normalized_end_transposition() { + assert!(compare_similarity("specter", "spectre") > 0.8); + } + + #[test] + fn lcs_normalized_unrestricted_edit() { + assert!(compare_similarity("a cat", "an abct") > 0.5); + } + + #[test] + fn lcs_normalized_diff_short() { + assert!(compare_similarity("levenshtein", "löwenbräu") < 0.01); + } + + #[test] + fn lcs_normalized_for_empty_strings() { + assert!(compare_similarity("", "") > 0.99); + } + + #[test] + fn lcs_normalized_first_empty() { + assert!(compare_similarity("", "flower") < 0.01); + } + + #[test] + fn 
lcs_normalized_second_empty() { + assert!(compare_similarity("tree", "") < 0.01); + } + + #[test] + fn lcs_normalized_identical_strings() { + assert!(compare_similarity("sunglasses", "sunglasses") > 0.99); + } } From 753beff1c60ff96ab2f8e02d8d4c04cfad6495e0 Mon Sep 17 00:00:00 2001 From: Ph0enixKM Date: Tue, 9 Aug 2022 19:23:13 +0200 Subject: [PATCH 2/3] fix: embed lcs code --- Cargo.toml | 3 --- src/lib.rs | 65 ++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9a33d9e..9546f4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,3 @@ homepage = "https://github.com/dguo/strsim-rs" repository = "https://github.com/dguo/strsim-rs" documentation = "https://docs.rs/strsim/" exclude = ["/.github", "/dev"] - -[dependencies] -similar-string = "1.4.3" diff --git a/src/lib.rs b/src/lib.rs index 727e44d..2ff3286 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,6 @@ #![forbid(unsafe_code)] -extern crate similar_string; - use std::char; use std::cmp::{max, min}; use std::collections::HashMap; @@ -12,8 +10,6 @@ use std::fmt::{self, Display, Formatter}; use std::hash::Hash; use std::str::Chars; -use similar_string::compare_similarity; - #[derive(Debug, PartialEq)] pub enum StrSimError { DifferentLengthArgs, @@ -478,8 +474,39 @@ pub fn sorensen_dice(a: &str, b: &str) -> f64 { /// assert_eq!(0.8, lcs_normalized("night", "fight")); /// assert_eq!(1.0, lcs_normalized("ferris", "ferris")); /// ``` -pub fn lcs_normalized(a: &str, b: &str) -> f64 { - compare_similarity(a, b) +pub fn lcs_normalized(left: impl AsRef<str>, right: impl AsRef<str>) -> f64 { + let (len1, len2) = (left.as_ref().len(), right.as_ref().len()); + let lcs_len = lcs_length(left.as_ref(), right.as_ref()); + let size = max(len1, len2); + // Empty strings should match + if size == 0 { 1.0 } else { lcs_len as f64 / size as f64 } +} + +#[inline] +fn get_shorter_longer_strings(left: impl AsRef<str>, right: impl AsRef<str>) -> (String, 
String) { + if left.as_ref().len() < right.as_ref().len() { + (left.as_ref().to_string(), right.as_ref().to_string()) + } else { + (right.as_ref().to_string(), left.as_ref().to_string()) + } +} + +#[inline] +fn lcs_length(left: impl AsRef<str>, right: impl AsRef<str>) -> usize { + let (left, right) = get_shorter_longer_strings(left, right); + let mut table = vec![vec![0 as usize; left.len() + 1]; 2]; + for rletter in right.chars() { + for (col, lletter) in left.chars().enumerate() { + if rletter == lletter { + table[1][col + 1] = 1 + table[0][col]; + } else { + table[1][col + 1] = max(table[0][col + 1], table[1][col]); + } + } + table[0] = table.pop().unwrap(); + table.push(vec![0 as usize; left.len() + 1]); + } + *table[0].last().unwrap() } #[cfg(test)] @@ -1009,68 +1036,68 @@ mod tests { #[test] fn lcs_normalized_diff_unequal_length() { - assert!(compare_similarity("damerau", "aderuaxyz") < 0.5); + assert!(lcs_normalized("damerau", "aderuaxyz") < 0.5); } #[test] fn lcs_normalized_diff_unequal_length_reversed() { - assert!(compare_similarity("aderuaxyz", "damerau") < 0.5); + assert!(lcs_normalized("aderuaxyz", "damerau") < 0.5); } #[test] fn lcs_normalized_diff_comedians() { - assert!(compare_similarity("Stewart", "Colbert") < 0.5); + assert!(lcs_normalized("Stewart", "Colbert") < 0.5); } #[test] fn lcs_normalized_many_transpositions() { - assert!(compare_similarity("abcdefghijkl", "bacedfgihjlk") < 0.7); + assert!(lcs_normalized("abcdefghijkl", "bacedfgihjlk") < 0.7); } #[test] fn lcs_normalized_diff_longer() { let a = "The quick brown fox jumped over the angry dog."; let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; - assert!(compare_similarity(a, b) < 0.4); + assert!(lcs_normalized(a, b) < 0.4); } #[test] fn lcs_normalized_beginning_transposition() { - assert!(compare_similarity("foobar", "ofobar") > 0.8); + assert!(lcs_normalized("foobar", "ofobar") > 0.8); } #[test] fn lcs_normalized_end_transposition() { - assert!(compare_similarity("specter", "spectre") > 
0.8); + assert!(lcs_normalized("specter", "spectre") > 0.8); } #[test] fn lcs_normalized_unrestricted_edit() { - assert!(compare_similarity("a cat", "an abct") > 0.5); + assert!(lcs_normalized("a cat", "an abct") > 0.5); } #[test] fn lcs_normalized_diff_short() { - assert!(compare_similarity("levenshtein", "löwenbräu") < 0.01); + assert!(lcs_normalized("levenshtein", "löwenbräu") < 0.01); } #[test] fn lcs_normalized_for_empty_strings() { - assert!(compare_similarity("", "") > 0.99); + assert!(lcs_normalized("", "") > 0.99); } #[test] fn lcs_normalized_first_empty() { - assert!(compare_similarity("", "flower") < 0.01); + assert!(lcs_normalized("", "flower") < 0.01); } #[test] fn lcs_normalized_second_empty() { - assert!(compare_similarity("tree", "") < 0.01); + assert!(lcs_normalized("tree", "") < 0.01); } #[test] fn lcs_normalized_identical_strings() { - assert!(compare_similarity("sunglasses", "sunglasses") > 0.99); + assert!(lcs_normalized("sunglasses", "sunglasses") > 0.99); } } From b26ebeb9a440cf56278b0f3e347d0f92698ae481 Mon Sep 17 00:00:00 2001 From: Ph0enixKM Date: Tue, 9 Aug 2022 19:25:51 +0200 Subject: [PATCH 3/3] fix: readme typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6b72a4f..0482eed 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ fn main() { assert_eq!(sorensen_dice("web applications", "applications of the web"), 0.7878787878787878); - assert!(sorensen_dice("foobar", "ofobar") > 0.8); + assert!(lcs_normalized("foobar", "ofobar") > 0.8); } ```