Skip to content

Commit

Permalink
When breaking at a period, the period must be followed by a space (#61)
Browse files: browse the repository at this point in the history
  • Loading branch information
pkufool authored Jan 14, 2024
1 parent 9963507 commit 589f933
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 29 deletions.
2 changes: 1 addition & 1 deletion examples/libriheavy/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def get_params() -> AttributeDict:
"preceding_context_length": 1000,
"timestamp_position": "current",
"silence_length_to_break": 0.45,
"overlap_ratio": 0.45,
"overlap_ratio": 0.5,
"min_duration": 2,
"max_duration": 30,
"expected_duration": (5, 20),
Expand Down
72 changes: 44 additions & 28 deletions textsearch/python/textsearch/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -704,55 +704,71 @@ def _get_segment_candidates(
# punctuation
prev_punctuation = 0
j = align["ref_pos"] - 1
num_space_behind_period = 0
while j >= 0:
current_token = chr(target_source.binary_text[j])
if is_punctuation(current_token, eos_only=True):
tmp = "".join(
[
chr(x)
for x in target_source.binary_text[
j - period_pattern_length : j + 1
if current_token == ".":
tmp = "".join(
[
chr(x)
for x in target_source.binary_text[
j - period_pattern_length : j + 1
]
]
]
)
if current_token != "." or (
current_token == "."
and period_patterns.search(tmp) is not None
):
)
if (
period_patterns.search(tmp) is not None
and num_space_behind_period >= 1
):
prev_punctuation = punctuation_score
break
else:
break
else:
prev_punctuation = punctuation_score
break
else:
j -= 1
elif current_token == " " or is_punctuation(current_token):
if current_token == " ":
num_space_behind_period += 1
j -= 1
else:
break

succ_punctuation = 0
j = align["ref_pos"] + 1
followed_by_period = False
followed_by_other_eos = False
num_space_behind_period = 0
while j < target_source.binary_text.size:
current_token = chr(target_source.binary_text[j])
if is_punctuation(current_token, eos_only=True):
tmp = "".join(
[
chr(x)
for x in target_source.binary_text[
j - period_pattern_length : j + 1
if current_token == ".":
tmp = "".join(
[
chr(x)
for x in target_source.binary_text[
j - period_pattern_length : j + 1
]
]
]
)
if current_token != "." or (
current_token == "."
and period_patterns.search(tmp) is not None
):
succ_punctuation = punctuation_score
break
)
if period_patterns.search(tmp) is not None:
followed_by_period = True
else:
j += 1
followed_by_other_eos = True
j += 1
elif current_token == " " or is_punctuation(current_token):
if current_token == " ":
num_space_behind_period += 1
j += 1
else:
break
if (
followed_by_period and num_space_behind_period >= 1
) or followed_by_other_eos:
succ_punctuation = punctuation_score
break
else:
break

begin_score = (
prev_silence
Expand Down

0 comments on commit 589f933

Please sign in to comment.