rapidfuzz · ljnsn · Dec 8, 2024 · Dec 8, 2024 · Dec 8, 2024 · Dec 9, 2024
diff --git a/src/distance.rs b/src/distance.rs
@@ -1,3 +1,4 @@
+pub mod common;
 pub mod damerau_levenshtein;
 pub mod hamming;
 pub mod indel;

diff --git a/src/distance/common.rs b/src/distance/common.rs
@@ -0,0 +1,15 @@
+/// Score of a comparison together with the position of the compared
+/// sub-sequences in `src` and `dest`.
+///
+/// It indicates that `score` has been calculated between
+/// `src[src_start..src_end]` and `dest[dest_start..dest_end]`.
+#[derive(PartialEq, Debug)]
+pub struct ScoreAlignment {
+    /// Score calculated for the two ranges described below.
+    pub score: f64,
+    /// Start of the compared range in `src`.
+    pub src_start: usize,
+    /// End of the compared range in `src`.
+    pub src_end: usize,
+    /// Start of the compared range in `dest`.
+    pub dest_start: usize,
+    /// End of the compared range in `dest`.
+    pub dest_end: usize,
+}
diff --git a/src/fuzz.rs b/src/fuzz.rs
@@ -1,7 +1,9 @@
 use crate::common::{NoScoreCutoff, SimilarityCutoff, WithScoreCutoff};
 use crate::details::distance::MetricUsize;
+use crate::distance::common::ScoreAlignment;
 use crate::distance::indel;
 use crate::HashableChar;
+use std::collections::HashSet;
 
 #[must_use]
 #[derive(Clone, Copy, Debug)]
@@ -149,6 +151,263 @@ where
     }
 }
 
+/// Searches for the optimal alignment of the shorter sequence in the
+/// longer sequence and returns the score of this alignment.
+///
+/// This forwards to [`partial_ratio_with_args`] with `Args::default()`.
+///
+/// # Examples
+/// ```
+/// use rapidfuzz::fuzz;
+/// // score is 1.0
+/// let score = fuzz::partial_ratio("this is a test".chars(), "this is a test!".chars());
+/// assert_eq!(score, 1.0);
+/// ```
+pub fn partial_ratio<Iter1, Iter2>(s1: Iter1, s2: Iter2) -> f64
+where
+    Iter1: IntoIterator,
+    Iter1::IntoIter: DoubleEndedIterator + Clone,
+    Iter2: IntoIterator,
+    Iter2::IntoIter: DoubleEndedIterator + Clone,
+    Iter1::Item: PartialEq<Iter2::Item> + HashableChar + Copy,
+    Iter2::Item: PartialEq<Iter1::Item> + HashableChar + Copy,
+{
+    partial_ratio_with_args(s1, s2, &Args::default())
+}
+
+/// Like [`partial_ratio`], but with the comparison options supplied by the
+/// caller through `args`.
+///
+/// Yields the score of the alignment found by [`partial_ratio_alignment`],
+/// or `0.0` when that search reports no alignment.
+pub fn partial_ratio_with_args<Iter1, Iter2, CutoffType>(
+    s1: Iter1,
+    s2: Iter2,
+    args: &Args<f64, CutoffType>,
+) -> CutoffType::Output
+where
+    Iter1: IntoIterator,
+    Iter1::IntoIter: DoubleEndedIterator + Clone,
+    Iter2: IntoIterator,
+    Iter2::IntoIter: DoubleEndedIterator + Clone,
+    Iter1::Item: PartialEq<Iter2::Item> + HashableChar + Copy,
+    Iter2::Item: PartialEq<Iter1::Item> + HashableChar + Copy,
+    CutoffType: SimilarityCutoff<f64, Output = f64>,
+{
+    let first = s1.into_iter();
+    let second = s2.into_iter();
+
+    // Counting consumes an iterator, so count clones and hand over the originals.
+    let first_len = first.clone().count();
+    let second_len = second.clone().count();
+
+    partial_ratio_alignment(first, first_len, second, second_len, args)
+        .map_or(0.0, |found| found.score)
+}
+
+/// Finds the best alignment of the shorter sequence inside the longer one.
+///
+/// `len1` and `len2` are the element counts of `s1` and `s2`; they are
+/// trusted as given and not re-derived from the iterators.
+///
+/// In the returned [`ScoreAlignment`] the `src_*` fields always refer to `s1`
+/// and the `dest_*` fields to `s2`, whichever of the two is the shorter one.
+/// When both have the same length, each is tried as the needle and the better
+/// of the two results is kept.
+///
+/// Returns `None` when the best score found is below the score cutoff in
+/// `args` (a missing cutoff is treated as `0.0`).
+pub fn partial_ratio_alignment<Iter1, Iter2, CutoffType>(
+    s1: Iter1,
+    len1: usize,
+    s2: Iter2,
+    len2: usize,
+    args: &Args<f64, CutoffType>,
+) -> Option<ScoreAlignment>
+where
+    Iter1: IntoIterator,
+    Iter1::IntoIter: DoubleEndedIterator + Clone,
+    Iter2: IntoIterator,
+    Iter2::IntoIter: DoubleEndedIterator + Clone,
+    Iter1::Item: PartialEq<Iter2::Item> + HashableChar + Copy,
+    Iter2::Item: PartialEq<Iter1::Item> + HashableChar + Copy,
+    CutoffType: SimilarityCutoff<f64, Output = f64>,
+{
+    // Mirrors an alignment computed with the roles of `s1` and `s2` exchanged.
+    fn swapped(found: ScoreAlignment) -> ScoreAlignment {
+        ScoreAlignment {
+            score: found.score,
+            src_start: found.dest_start,
+            src_end: found.dest_end,
+            dest_start: found.src_start,
+            dest_end: found.src_end,
+        }
+    }
+
+    let s1_iter = s1.into_iter();
+    let s2_iter = s2.into_iter();
+    let mut score_cutoff = args.score_cutoff.cutoff().unwrap_or(0.0);
+    let score_hint = args.score_hint;
+
+    if len1 > len2 {
+        // `s2` is the needle here, so the raw result describes `s2` as the
+        // source; mirror it back into the caller's frame of reference.
+        let res = partial_ratio_impl(s2_iter, len2, s1_iter, len1, score_cutoff, score_hint);
+        let res = swapped(res);
+        return (res.score >= score_cutoff).then_some(res);
+    }
+
+    let mut res = partial_ratio_impl(
+        s1_iter.clone(),
+        len1,
+        s2_iter.clone(),
+        len2,
+        score_cutoff,
+        score_hint,
+    );
+    // With equal lengths either input may act as the needle, so unless the
+    // first pass was already perfect, retry with the roles exchanged.
+    if res.score != 1.0 && len1 == len2 {
+        score_cutoff = f64::max(score_cutoff, res.score);
+        let res2 = partial_ratio_impl(s2_iter, len2, s1_iter, len1, score_cutoff, score_hint);
+        if res2.score > res.score {
+            res = swapped(res2);
+        }
+    }
+
+    // Report nothing if even the best alignment misses the cutoff.
+    (res.score >= score_cutoff).then_some(res)
+}
+
+/// Core of `partial_ratio`: scores `s1` against windows of `s2` and keeps the
+/// best one. Assumes `s1` is already the shorter sequence (`len1 <= len2`),
+/// with `len1` and `len2` being the element counts of `s1` and `s2`.
+///
+/// Three families of windows are tried in order: the prefixes of `s2` that
+/// are shorter than `s1`, then the windows of `len1` elements that end before
+/// the end of `s2`, and finally the suffixes of `s2` holding at most `len1`
+/// elements. A window is only scored when its boundary element (the last one
+/// for prefixes and full windows, the first one for suffixes) occurs in `s1`.
+///
+/// The search stops as soon as a window scores `1.0`. An empty `s1` yields a
+/// score of `0.0`.
+///
+/// NOTE(review): the previous comment described this as the implementation
+/// for needles of at most 64 elements; nothing in this body enforces such a
+/// limit, so confirm whether that restriction still applies.
+fn partial_ratio_impl<Iter1, Iter2>(
+    s1: Iter1,
+    len1: usize,
+    s2: Iter2,
+    len2: usize,
+    mut score_cutoff: f64,
+    score_hint: Option<f64>,
+) -> ScoreAlignment
+where
+    Iter1: IntoIterator,
+    Iter1::IntoIter: DoubleEndedIterator + Clone,
+    Iter2: IntoIterator,
+    Iter2::IntoIter: DoubleEndedIterator + Clone,
+    Iter1::Item: PartialEq<Iter2::Item> + HashableChar + Copy,
+    Iter2::Item: PartialEq<Iter1::Item> + HashableChar + Copy,
+{
+    let mut best = ScoreAlignment {
+        score: 0.0,
+        src_start: 0,
+        src_end: len1,
+        dest_start: 0,
+        dest_end: len1,
+    };
+    if len1 == 0 {
+        // With an empty needle every field of `best` is zero.
+        return best;
+    }
+
+    let needle = s1.into_iter();
+    let haystack = s2.into_iter().collect::<Vec<_>>();
+    let needle_chars: HashSet<_> = needle.clone().map(|c| c.hash_char()).collect();
+    let scorer = indel::BatchComparator::new(needle.clone());
+
+    // Scores one window of the haystack and records it when it beats the best
+    // result so far. Returns `true` once a perfect match has been recorded.
+    let mut try_window =
+        |boundary: &Iter2::Item, window: &[Iter2::Item], start: usize, end: usize| -> bool {
+            // Skip windows whose boundary element does not occur in the needle.
+            if !needle_chars.contains(&boundary.hash_char()) {
+                return false;
+            }
+
+            let ratio = scorer
+                .normalized_similarity_with_args(
+                    window.iter().cloned(),
+                    &indel::Args {
+                        score_cutoff: WithScoreCutoff(score_cutoff),
+                        score_hint,
+                    },
+                )
+                .unwrap_or(0.0);
+
+            if ratio > best.score {
+                // Later windows only matter if they beat this score.
+                score_cutoff = ratio;
+                best.score = ratio;
+                best.dest_start = start;
+                best.dest_end = end;
+                best.score == 1.0
+            } else {
+                false
+            }
+        };
+
+    // Prefixes of the haystack that are shorter than the needle.
+    for end in 1..len1 {
+        if try_window(&haystack[end - 1], &haystack[..end], 0, end) {
+            return best;
+        }
+    }
+
+    // Windows of exactly `len1` elements, excluding the final one.
+    let window_end = len2 - len1;
+    for start in 0..window_end {
+        let end = start + len1;
+        if try_window(&haystack[end - 1], &haystack[start..end], start, end) {
+            return best;
+        }
+    }
+
+    // Suffixes of the haystack, starting with the final full-length window.
+    for start in window_end..len2 {
+        if try_window(&haystack[start], &haystack[start..], start, len2) {
+            return best;
+        }
+    }
+
+    best
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -299,4 +558,92 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_partial_ratio2() {
+        // The first input is a prefix of the second one.
+        let needle = "this is a test";
+        let haystack = "this is a test!";
+        let score = partial_ratio(needle.chars(), haystack.chars());
+        assert_eq!(score, 1.0, "Expected 1.0");
+    }
+
+    #[test]
+    fn test_partial_ratio_issue138() {
+        // Two 65-character inputs that differ only in the second character.
+        let all_a = "a".repeat(65);
+        let one_off = format!("a{}{}", '\u{100}', "a".repeat(63));
+        let result = partial_ratio(all_a.chars(), one_off.chars());
+        let deviation = (result - 0.9922481).abs();
+        assert!(
+            deviation < 1e-5,
+            "Expected approximately 0.9922481, got {}",
+            result
+        );
+    }
+
+    #[test]
+    fn test_partial_ratio_alignment() {
+        let str1 = "er merkantilismus förderte handle und verkehr mit teils marktkonformen, teils dirigistischen maßnahmen.";
+        let str2 = "ils marktkonformen, teils dirigistischen maßnahmen. an der schwelle zum 19. jahrhundert entstand ein neu";
+
+        // Unwrap once up front so every assertion below reads the same value.
+        let alignment = partial_ratio_alignment(
+            str1.chars(),
+            str1.chars().count(),
+            str2.chars(),
+            str2.chars().count(),
+            &Args::default(),
+        )
+        .expect("expected an alignment for the default arguments");
+
+        assert!(
+            (alignment.score - 0.662337662).abs() < 1e-5,
+            "Expected 0.662337662, got {}",
+            alignment.score
+        );
+        assert_eq!(alignment.src_start, 0);
+        assert_eq!(alignment.src_end, 103);
+        assert_eq!(alignment.dest_start, 0);
+        assert_eq!(alignment.dest_end, 51);
+    }
+
+    #[test]
+    fn test_partial_ratio_impl_identical() {
+        // Needle and haystack are the same four characters.
+        let text = "abcd";
+        let len = text.chars().count();
+
+        let found = partial_ratio_impl(text.chars(), len, text.chars(), len, 0.0, None);
+
+        let spans = (found.src_start, found.src_end, found.dest_start, found.dest_end);
+        assert_eq!(found.score, 1.0);
+        assert_eq!(spans, (0, 4, 0, 4));
+    }
+
+    #[test]
+    fn test_partial_ratio_impl_substring() {
+        // The needle occurs verbatim in the middle of the haystack.
+        let needle = "bcd";
+        let haystack = "abcde";
+        let needle_len = needle.chars().count();
+        let haystack_len = haystack.chars().count();
+
+        let found = partial_ratio_impl(
+            needle.chars(),
+            needle_len,
+            haystack.chars(),
+            haystack_len,
+            0.0,
+            None,
+        );
+
+        let spans = (found.src_start, found.src_end, found.dest_start, found.dest_end);
+        assert_eq!(found.score, 1.0);
+        assert_eq!(spans, (0, 3, 1, 4));
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -100,7 +100,7 @@ pub mod distance;
 pub mod fuzz;
 
 /// Hash value in the range `i64::MIN` - `u64::MAX`
-#[derive(Debug, Copy, Clone)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
 pub enum Hash {
     UNSIGNED(u64),
     SIGNED(i64),