Skip to content

Commit

Permalink
Fix out-of-bounds reads
Browse files (browse the repository at this point in the history)
  • Loading branch information
maxbachmann committed Oct 9, 2023
1 parent 7a30d6b commit 1678beb
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 27 deletions.
20 changes: 11 additions & 9 deletions extras/rapidfuzz_amalgamated.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Licensed under the MIT License <http://opensource.org/licenses/MIT>.
// SPDX-License-Identifier: MIT
// RapidFuzz v1.0.2
// Generated: 2023-10-08 21:27:21.591281
// Generated: 2023-10-09 03:12:47.555069
// ----------------------------------------------------------
// This file is an amalgamation of multiple different files.
// You probably shouldn't edit it directly.
Expand Down Expand Up @@ -5229,17 +5229,18 @@ FlaggedCharsWord flag_similar_characters_word(const PM_Vec& PM, [[maybe_unused]]
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);

int64_t j = 0;
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j) {
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
auto T_iter = T.begin();
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j, ++T_iter) {
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);

flagged.P_flag |= blsi(PM_j);
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;

BoundMask = (BoundMask << 1) | 1;
}

for (; j < T.size(); ++j) {
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
for (; j < T.size(); ++j, ++T_iter) {
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);

flagged.P_flag |= blsi(PM_j);
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
Expand Down Expand Up @@ -5348,8 +5349,9 @@ static inline FlaggedCharsMultiword flag_similar_characters_block(const BlockPat
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
BoundMask.first_mask = ~UINT64_C(0);

for (int64_t j = 0; j < T.size(); ++j) {
flag_similar_characters_step(PM, T[j], flagged, static_cast<size_t>(j), BoundMask);
auto T_iter = T.begin();
for (int64_t j = 0; j < T.size(); ++j, ++T_iter) {
flag_similar_characters_step(PM, *T_iter, flagged, static_cast<size_t>(j), BoundMask);

if (j + Bound + 1 < P.size()) {
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
Expand Down Expand Up @@ -5486,7 +5488,7 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
/* filter out based on the length difference between the two strings */
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;

if (P_len == 1 && T_len == 1) return static_cast<double>(P[0] == T[0]);
if (P_len == 1 && T_len == 1) return static_cast<double>(P.front() == T.front());

int64_t Bound = jaro_bounds(P, T);

Expand Down Expand Up @@ -5638,7 +5640,7 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
// this is solved by splitting the loop into two parts where after this boundary is reached
// the first bit inside boundMask is no longer set
int64_t j = 0;
for (; j < maxBound; ++j) {
for (; j < std::min(maxBound, s2_cur.size()); ++j) {
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
native_simd<VecType> X(stored.data());
Expand Down
18 changes: 10 additions & 8 deletions rapidfuzz/distance/Jaro_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,17 +112,18 @@ FlaggedCharsWord flag_similar_characters_word(const PM_Vec& PM, [[maybe_unused]]
uint64_t BoundMask = bit_mask_lsb<uint64_t>(Bound + 1);

int64_t j = 0;
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j) {
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
auto T_iter = T.begin();
for (; j < std::min(static_cast<int64_t>(Bound), static_cast<int64_t>(T.size())); ++j,++T_iter) {
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);

flagged.P_flag |= blsi(PM_j);
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;

BoundMask = (BoundMask << 1) | 1;
}

for (; j < T.size(); ++j) {
uint64_t PM_j = PM.get(0, T[j]) & BoundMask & (~flagged.P_flag);
for (; j < T.size(); ++j,++T_iter) {
uint64_t PM_j = PM.get(0, *T_iter) & BoundMask & (~flagged.P_flag);

flagged.P_flag |= blsi(PM_j);
flagged.T_flag |= static_cast<uint64_t>(PM_j != 0) << j;
Expand Down Expand Up @@ -231,8 +232,9 @@ static inline FlaggedCharsMultiword flag_similar_characters_block(const BlockPat
BoundMask.last_mask = (1ull << (start_range % 64)) - 1;
BoundMask.first_mask = ~UINT64_C(0);

for (int64_t j = 0; j < T.size(); ++j) {
flag_similar_characters_step(PM, T[j], flagged, static_cast<size_t>(j), BoundMask);
auto T_iter = T.begin();
for (int64_t j = 0; j < T.size(); ++j,++T_iter) {
flag_similar_characters_step(PM, *T_iter, flagged, static_cast<size_t>(j), BoundMask);

if (j + Bound + 1 < P.size()) {
BoundMask.last_mask = (BoundMask.last_mask << 1) | 1;
Expand Down Expand Up @@ -370,7 +372,7 @@ double jaro_similarity(Range<InputIt1> P, Range<InputIt2> T, double score_cutoff
/* filter out based on the length difference between the two strings */
if (!jaro_length_filter(P_len, T_len, score_cutoff)) return 0.0;

if (P_len == 1 && T_len == 1) return static_cast<double>(P[0] == T[0]);
if (P_len == 1 && T_len == 1) return static_cast<double>(P.front() == T.front());

int64_t Bound = jaro_bounds(P, T);

Expand Down Expand Up @@ -522,7 +524,7 @@ void jaro_similarity_simd(Range<double*> scores, const detail::BlockPatternMatch
// this is solved by splitting the loop into two parts where after this boundary is reached
// the first bit inside boundMask is no longer set
int64_t j = 0;
for(; j < maxBound; ++j)
for(; j < std::min(maxBound, s2_cur.size()); ++j)
{
alignas(32) std::array<uint64_t, vecs> stored;
unroll<int, vecs>([&](auto i) { stored[i] = block.get(cur_vec + i, s2_cur[j]); });
Expand Down
33 changes: 23 additions & 10 deletions test/distance/tests-Jaro.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cu
double res3 = rapidfuzz::jaro_normalized_similarity(s1, s2, score_cutoff);
double res4 =
rapidfuzz::jaro_normalized_similarity(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff);
#if 0 // todo
double res5 = rapidfuzz::jaro_similarity(
BidirectionalIterWrapper(s1.begin()), BidirectionalIterWrapper(s1.end()),
BidirectionalIterWrapper(s2.begin()), BidirectionalIterWrapper(s2.end()), score_cutoff);
#endif

rapidfuzz::CachedJaro scorer(s1);
double res5 = scorer.similarity(s2, score_cutoff);
double res6 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_similarity(s2, score_cutoff);
double res8 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);
double res6 = scorer.similarity(s2, score_cutoff);
double res7 = scorer.similarity(s2.begin(), s2.end(), score_cutoff);
double res8 = scorer.normalized_similarity(s2, score_cutoff);
double res9 = scorer.normalized_similarity(s2.begin(), s2.end(), score_cutoff);

#ifdef RAPIDFUZZ_SIMD
std::vector<double> results(256 / 8);
Expand Down Expand Up @@ -52,10 +58,11 @@ double jaro_similarity(const Sentence1& s1, const Sentence2& s2, double score_cu
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
REQUIRE(res1 == Approx(res5));
//REQUIRE(res1 == Approx(res5));
REQUIRE(res1 == Approx(res6));
REQUIRE(res1 == Approx(res7));
REQUIRE(res1 == Approx(res8));
REQUIRE(res1 == Approx(res9));
return res1;
}

Expand All @@ -67,18 +74,24 @@ double jaro_distance(const Sentence1& s1, const Sentence2& s2, double score_cuto
double res3 = rapidfuzz::jaro_normalized_distance(s1, s2, score_cutoff);
double res4 =
rapidfuzz::jaro_normalized_distance(s1.begin(), s1.end(), s2.begin(), s2.end(), score_cutoff);
#if 0 // todo
double res5 = rapidfuzz::jaro_distance(
BidirectionalIterWrapper(s1.begin()), BidirectionalIterWrapper(s1.end()),
BidirectionalIterWrapper(s2.begin()), BidirectionalIterWrapper(s2.end()), score_cutoff);
#endif
rapidfuzz::CachedJaro scorer(s1);
double res5 = scorer.distance(s2, score_cutoff);
double res6 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res7 = scorer.normalized_distance(s2, score_cutoff);
double res8 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
double res6 = scorer.distance(s2, score_cutoff);
double res7 = scorer.distance(s2.begin(), s2.end(), score_cutoff);
double res8 = scorer.normalized_distance(s2, score_cutoff);
double res9 = scorer.normalized_distance(s2.begin(), s2.end(), score_cutoff);
REQUIRE(res1 == Approx(res2));
REQUIRE(res1 == Approx(res3));
REQUIRE(res1 == Approx(res4));
REQUIRE(res1 == Approx(res5));
//REQUIRE(res1 == Approx(res5));
REQUIRE(res1 == Approx(res6));
REQUIRE(res1 == Approx(res7));
REQUIRE(res1 == Approx(res8));
REQUIRE(res1 == Approx(res9));
return res1;
}

Expand Down

0 comments on commit 1678beb

Please sign in to comment.