If breaking at a period, it has to be followed by a space (#61)

k2-fsa · Jan 14, 2024 · 589f933
1 parent 9963507
commit 589f933
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 29 deletions.
diff --git a/examples/libriheavy/matching.py b/examples/libriheavy/matching.py
@@ -95,7 +95,7 @@ def get_params() -> AttributeDict:
             "preceding_context_length": 1000,
             "timestamp_position": "current",
             "silence_length_to_break": 0.45,
-            "overlap_ratio": 0.45,
+            "overlap_ratio": 0.5,
             "min_duration": 2,
             "max_duration": 30,
             "expected_duration": (5, 20),

diff --git a/textsearch/python/textsearch/match.py b/textsearch/python/textsearch/match.py
@@ -704,55 +704,71 @@ def _get_segment_candidates(
         # punctuation
         prev_punctuation = 0
         j = align["ref_pos"] - 1
+        num_space_behind_period = 0
         while j >= 0:
             current_token = chr(target_source.binary_text[j])
             if is_punctuation(current_token, eos_only=True):
-                tmp = "".join(
-                    [
-                        chr(x)
-                        for x in target_source.binary_text[
-                            j - period_pattern_length : j + 1
+                if current_token == ".":
+                    tmp = "".join(
+                        [
+                            chr(x)
+                            for x in target_source.binary_text[
+                                j - period_pattern_length : j + 1
+                            ]
                         ]
-                    ]
-                )
-                if current_token != "." or (
-                    current_token == "."
-                    and period_patterns.search(tmp) is not None
-                ):
+                    )
+                    if (
+                        period_patterns.search(tmp) is not None
+                        and num_space_behind_period >= 1
+                    ):
+                        prev_punctuation = punctuation_score
+                        break
+                    else:
+                        break
+                else:
                     prev_punctuation = punctuation_score
                     break
-                else:
-                    j -= 1
             elif current_token == " " or is_punctuation(current_token):
+                if current_token == " ":
+                    num_space_behind_period += 1
                 j -= 1
             else:
                 break
 
         succ_punctuation = 0
         j = align["ref_pos"] + 1
+        followed_by_period = False
+        followed_by_other_eos = False
+        num_space_behind_period = 0
         while j < target_source.binary_text.size:
             current_token = chr(target_source.binary_text[j])
             if is_punctuation(current_token, eos_only=True):
-                tmp = "".join(
-                    [
-                        chr(x)
-                        for x in target_source.binary_text[
-                            j - period_pattern_length : j + 1
+                if current_token == ".":
+                    tmp = "".join(
+                        [
+                            chr(x)
+                            for x in target_source.binary_text[
+                                j - period_pattern_length : j + 1
+                            ]
                         ]
-                    ]
-                )
-                if current_token != "." or (
-                    current_token == "."
-                    and period_patterns.search(tmp) is not None
-                ):
-                    succ_punctuation = punctuation_score
-                    break
+                    )
+                    if period_patterns.search(tmp) is not None:
+                        followed_by_period = True
                 else:
-                    j += 1
+                    followed_by_other_eos = True
+                j += 1
             elif current_token == " " or is_punctuation(current_token):
+                if current_token == " ":
+                    num_space_behind_period += 1
                 j += 1
             else:
-                break
+                if (
+                    followed_by_period and num_space_behind_period >= 1
+                ) or followed_by_other_eos:
+                    succ_punctuation = punctuation_score
+                    break
+                else:
+                    break
 
         begin_score = (
             prev_silence