diff --git a/CHANGELOG.md b/CHANGELOG.md index b2ccaef..29b1af1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## sic 1.3 +### [1.3.3] - 2021-09-17 + +#### Changed + +- Fixed bug causing incorrect normalization when replacement tokens can be tokenized themselves + ### [1.3.2] - 2021-01-21 #### Changed diff --git a/dist/sic-1.3.2-cp36-cp36m-win_amd64.whl b/dist/sic-1.3.2-cp36-cp36m-win_amd64.whl deleted file mode 100644 index a4da1eb..0000000 Binary files a/dist/sic-1.3.2-cp36-cp36m-win_amd64.whl and /dev/null differ diff --git a/dist/sic-1.3.2-cp37-cp37m-win_amd64.whl b/dist/sic-1.3.2-cp37-cp37m-win_amd64.whl deleted file mode 100644 index b6e136b..0000000 Binary files a/dist/sic-1.3.2-cp37-cp37m-win_amd64.whl and /dev/null differ diff --git a/dist/sic-1.3.2-cp38-cp38-win_amd64.whl b/dist/sic-1.3.2-cp38-cp38-win_amd64.whl deleted file mode 100644 index 4eab6c8..0000000 Binary files a/dist/sic-1.3.2-cp38-cp38-win_amd64.whl and /dev/null differ diff --git a/dist/sic-1.3.2-cp39-cp39-win_amd64.whl b/dist/sic-1.3.2-cp39-cp39-win_amd64.whl deleted file mode 100644 index 8baeece..0000000 Binary files a/dist/sic-1.3.2-cp39-cp39-win_amd64.whl and /dev/null differ diff --git a/dist/sic-1.3.2.tar.gz b/dist/sic-1.3.2.tar.gz deleted file mode 100644 index 3603f01..0000000 Binary files a/dist/sic-1.3.2.tar.gz and /dev/null differ diff --git a/dist/sic-1.3.3-cp36-cp36m-win_amd64.whl b/dist/sic-1.3.3-cp36-cp36m-win_amd64.whl new file mode 100644 index 0000000..d7bb473 Binary files /dev/null and b/dist/sic-1.3.3-cp36-cp36m-win_amd64.whl differ diff --git a/dist/sic-1.3.3-cp37-cp37m-win_amd64.whl b/dist/sic-1.3.3-cp37-cp37m-win_amd64.whl new file mode 100644 index 0000000..70db466 Binary files /dev/null and b/dist/sic-1.3.3-cp37-cp37m-win_amd64.whl differ diff --git a/dist/sic-1.3.3-cp38-cp38-win_amd64.whl b/dist/sic-1.3.3-cp38-cp38-win_amd64.whl new file mode 100644 index 0000000..1126fdd Binary files /dev/null and b/dist/sic-1.3.3-cp38-cp38-win_amd64.whl differ diff --git a/dist/sic-1.3.3-cp39-cp39-win_amd64.whl b/dist/sic-1.3.3-cp39-cp39-win_amd64.whl new file mode 100644 index 0000000..fa99a00 Binary files /dev/null and b/dist/sic-1.3.3-cp39-cp39-win_amd64.whl differ diff --git a/dist/sic-1.3.3.tar.gz b/dist/sic-1.3.3.tar.gz new file mode 100644 index 0000000..40102b6 Binary files /dev/null and b/dist/sic-1.3.3.tar.gz differ diff --git a/shipping/setup.py b/shipping/setup.py index f906c8b..8545b49 100644 --- a/shipping/setup.py +++ b/shipping/setup.py @@ -9,7 +9,7 @@ setup( name='sic', - version='1.3.2', + version='1.3.3', description='Utility for string normalization', long_description=long_description, long_description_content_type='text/markdown', diff --git a/sic/core.py b/sic/core.py index fad39f5..960a3d9 100644 --- a/sic/core.py +++ b/sic/core.py @@ -382,7 +382,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont on_the_right = False added_separator = False if character not in (word_separator, control_character) and last_character not in (word_separator, control_character): - if this_group == 0 or this_group != last_group: + if (this_group == 0 or this_group != last_group) and (subtrie is self.content or character not in subtrie): if not buffer.endswith(word_separator) and not buffer.endswith(control_character): buffer += control_character if len(b_map) == len(buffer): diff --git a/test/ut_sic.py b/test/ut_sic.py index 0190b27..3f3f102 100644 --- a/test/ut_sic.py +++ b/test/ut_sic.py @@ -946,6 +946,47 @@ def test_spelling_correction_implicit(self): assert expected1 == normalized1, 'Expected "%s", got "%s".' % (expected1, normalized1) assert expected2 == normalized2, 'Expected "%s", got "%s".' % (expected2, normalized2) + def test_split_inside_replace(self): + model = sic.Model() + model.add_rule(sic.ReplaceToken('(p)', '[p]')) + model.add_rule(sic.SplitToken('(a)', 'l')) + model.add_rule(sic.SplitToken('p', 'lmr')) + sic.build_normalizer(model) + test_string1 = 'ab(c)'; expected1 = 'ab ( c )'; normalized1 = sic.normalize(test_string1) + test_string2 = 'ab(p)'; expected2 = 'ab [p]'; normalized2 = sic.normalize(test_string2) + test_string3 = '(a)ab ( p )'; expected3 = '(a) ab ( p )'; normalized3 = sic.normalize(test_string3) + test_string4 = 'ab(p)cd'; expected4 = 'ab [p] cd'; normalized4 = sic.normalize(test_string4) + test_string5 = 'ab(p)(p)'; expected5 = 'ab [p] [p]'; normalized5 = sic.normalize(test_string5) + test_string6 = '(p)ab(p)(p)'; expected6 = '[p] ab [p] [p]'; normalized6 = sic.normalize(test_string6) + test_string7 = '(p)(p)ab(p)'; expected7 = '[p] [p] ab [p]'; normalized7 = sic.normalize(test_string7) + test_string8 = 'ab(p)(p)cd'; expected8 = 'ab [p] [p] cd'; normalized8 = sic.normalize(test_string8) + test_string9 = 'AB(C)'; expected9 = 'ab ( c )'; normalized9 = sic.normalize(test_string9) + test_string10 = 'AB(P)'; expected10 = 'ab [p]'; normalized10 = sic.normalize(test_string10) + test_string11 = '(A)AB ( P )'; expected11 = '(a) ab ( p )'; normalized11 = sic.normalize(test_string11) + test_string12 = 'AB(P)CD'; expected12 = 'ab [p] cd'; normalized12 = sic.normalize(test_string12) + test_string13 = 'AB(P)(P)'; expected13 = 'ab [p] [p]'; normalized13 = sic.normalize(test_string13) + test_string14 = '(P)AB(P)(P)'; expected14 = '[p] ab [p] [p]'; normalized14 = sic.normalize(test_string14) + test_string15 = '(P)(P)AB(P)'; expected15 = '[p] [p] ab [p]'; normalized15 = sic.normalize(test_string15) + test_string16 = 'AB(P)(P)CD'; expected16 = 'ab [p] [p] cd'; normalized16 = sic.normalize(test_string16) + test_string17 = 'AB(P)(P)CDPP'; expected17 = 'ab [p] [p] cd p p'; normalized17 = sic.normalize(test_string17) + assert expected1 == normalized1, 'Expected "%s", got "%s".' % (expected1, normalized1) + assert expected2 == normalized2, 'Expected "%s", got "%s".' % (expected2, normalized2) + assert expected3 == normalized3, 'Expected "%s", got "%s".' % (expected3, normalized3) + assert expected4 == normalized4, 'Expected "%s", got "%s".' % (expected4, normalized4) + assert expected5 == normalized5, 'Expected "%s", got "%s".' % (expected5, normalized5) + assert expected6 == normalized6, 'Expected "%s", got "%s".' % (expected6, normalized6) + assert expected7 == normalized7, 'Expected "%s", got "%s".' % (expected7, normalized7) + assert expected8 == normalized8, 'Expected "%s", got "%s".' % (expected8, normalized8) + assert expected9 == normalized9, 'Expected "%s", got "%s".' % (expected9, normalized9) + assert expected10 == normalized10, 'Expected "%s", got "%s".' % (expected10, normalized10) + assert expected11 == normalized11, 'Expected "%s", got "%s".' % (expected11, normalized11) + assert expected12 == normalized12, 'Expected "%s", got "%s".' % (expected12, normalized12) + assert expected13 == normalized13, 'Expected "%s", got "%s".' % (expected13, normalized13) + assert expected14 == normalized14, 'Expected "%s", got "%s".' % (expected14, normalized14) + assert expected15 == normalized15, 'Expected "%s", got "%s".' % (expected15, normalized15) + assert expected16 == normalized16, 'Expected "%s", got "%s".' % (expected16, normalized16) + assert expected17 == normalized17, 'Expected "%s", got "%s".' % (expected17, normalized17) + if __name__ == '__main__': sys.path.insert(0, '') import sic # pylint: disable=E0611,F0401