Skip to content

Commit

Permalink
properly handle score_cutoff > 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Apr 17, 2023
1 parent faa0687 commit 5684705
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Changelog

### [1.11.2] - 2023-04-17
#### Fixed
- fix handling of `score_cutoff > 1.0` in `Jaro` and `JaroWinkler`

### [1.11.1] - 2023-04-16
#### Fixed
- fix division by zero in the SIMD implementation of normalized string metrics when comparing empty strings
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt")
endif()

project(rapidfuzz LANGUAGES CXX VERSION 1.11.1)
project(rapidfuzz LANGUAGES CXX VERSION 1.11.2)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
include(GNUInstallDirs)
Expand Down
6 changes: 5 additions & 1 deletion extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2023-04-17 01:55:45.256062
// Generated: 2023-04-17 13:47:21.759334
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -5203,6 +5203,8 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
int64_t P_len = P.size();
int64_t T_len = T.size();

if (score_cutoff > 1.0) return 0.0;

if (!P_len && !T_len) return 1.0;

/* filter out based on the length difference between the two strings */
Expand Down Expand Up @@ -5250,6 +5252,8 @@ double jaro_similarity(const BlockPatternMatchVector& PM, Range<InputIt1> P, Ran
int64_t P_len = P.size();
int64_t T_len = T.size();

if (score_cutoff > 1.0) return 0.0;

if (!P_len && !T_len) return 1.0;

/* filter out based on the length difference between the two strings */
Expand Down
4 changes: 4 additions & 0 deletions rapidfuzz/distance/Jaro_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,8 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
int64_t P_len = P.size();
int64_t T_len = T.size();

if (score_cutoff > 1.0) return 0.0;

if (!P_len && !T_len) return 1.0;

/* filter out based on the length difference between the two strings */
Expand Down Expand Up @@ -390,6 +392,8 @@ double jaro_similarity(const BlockPatternMatchVector& PM, Range<InputIt1> P, Ran
int64_t P_len = P.size();
int64_t T_len = T.size();

if (score_cutoff > 1.0) return 0.0;

if (!P_len && !T_len) return 1.0;

/* filter out based on the length difference between the two strings */
Expand Down
4 changes: 3 additions & 1 deletion rapidfuzz_reference/Jaro.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ double jaro_similarity(InputIt1 P_first, InputIt1 P_last, InputIt2 T_first, Inpu
size_t P_len = static_cast<size_t>(std::distance(P_first, P_last));
size_t T_len = static_cast<size_t>(std::distance(T_first, T_last));

if (!P_len || !T_len) return 1.0;
if (score_cutoff > 1.0) return 0.0;

if (!P_len || !T_len) return double(!P_len && !T_len);

std::vector<int> P_flag(P_len + 1);
std::vector<int> T_flag(T_len + 1);
Expand Down
12 changes: 7 additions & 5 deletions test/distance/tests-Jaro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cu
rapidfuzz::CachedJaro scorer(s1);
double res5 = scorer.similarity(s2, score_cutoff);
double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.similarity(s2, score_cutoff);
double res8 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_similarity(s2, score_cutoff);
double res8 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
Expand All @@ -39,8 +39,8 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto
rapidfuzz::CachedJaro scorer(s1);
double res5 = scorer.distance(s2, score_cutoff);
double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.distance(s2, score_cutoff);
double res8 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_distance(s2, score_cutoff);
double res8 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
Expand All @@ -63,7 +63,9 @@ TEST_CASE("JaroWinklerTest")

SECTION("testFullResultWithScoreCutoff")
{
for (double score_cutoff = 0.0; score_cutoff < 1.1; score_cutoff += 0.1)
auto score_cutoffs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1};

for (double score_cutoff : score_cutoffs)
for (const auto& name1 : names)
for (const auto& name2 : names) {
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
Expand Down
12 changes: 7 additions & 5 deletions test/distance/tests-JaroWinkler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ double jaro_winkler_similarity(const Sentence1& s1, const Sentence2& s2, double
rapidfuzz::CachedJaroWinkler scorer(s1, prefix_weight);
double res5 = scorer.similarity(s2, score_cutoff);
double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.similarity(s2, score_cutoff);
double res8 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_similarity(s2, score_cutoff);
double res8 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
Expand All @@ -43,8 +43,8 @@ double jaro_winkler_distance(const Sentence1& s1, const Sentence2& s2, double pr
rapidfuzz::CachedJaroWinkler scorer(s1, prefix_weight);
double res5 = scorer.distance(s2, score_cutoff);
double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.distance(s2, score_cutoff);
double res8 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_distance(s2, score_cutoff);
double res8 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
Expand All @@ -67,7 +67,9 @@ TEST_CASE("JaroWinklerTest")

SECTION("testFullResultWithScoreCutoff")
{
for (double score_cutoff = 0.0; score_cutoff < 1.1; score_cutoff += 0.1)
auto score_cutoffs = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1};

for (double score_cutoff : score_cutoffs)
for (const auto& name1 : names)
for (const auto& name2 : names) {
INFO("name1: " << name1 << ", name2: " << name2 << ", score_cutoff: " << score_cutoff);
Expand Down

0 comments on commit 5684705

Please sign in to comment.