Merge pull request #40 from pgolo/dev

Released 1.0.6
pgolo · Sep 10, 2020 · 30aa135
2 parents 53d1c0a + 0f8d215
commit 30aa135
Show file tree

Hide file tree

Showing 13 changed files with 28 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,9 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.6] - 2020-09-10
+
+### Changed
+
+- Fixed a bug where a substring that is not a whole token was being replaced
+
 ## [1.0.5] - 2020-09-08
 
-## Changed
+### Changed
 
 - Normalizer.data is now exposed as a property
 - Updated documentation, added performance benchmarks

diff --git a/dist/sic-1.0.5-cp36-cp36m-win_amd64.whl b/dist/sic-1.0.5-cp36-cp36m-win_amd64.whl
diff --git a/dist/sic-1.0.5-cp37-cp37m-win_amd64.whl b/dist/sic-1.0.5-cp37-cp37m-win_amd64.whl
diff --git a/dist/sic-1.0.5-cp38-cp38-win_amd64.whl b/dist/sic-1.0.5-cp38-cp38-win_amd64.whl
diff --git a/dist/sic-1.0.5.tar.gz b/dist/sic-1.0.5.tar.gz
diff --git a/dist/sic-1.0.6-cp36-cp36m-win_amd64.whl b/dist/sic-1.0.6-cp36-cp36m-win_amd64.whl
diff --git a/dist/sic-1.0.6-cp37-cp37m-win_amd64.whl b/dist/sic-1.0.6-cp37-cp37m-win_amd64.whl
diff --git a/dist/sic-1.0.6-cp38-cp38-win_amd64.whl b/dist/sic-1.0.6-cp38-cp38-win_amd64.whl
diff --git a/dist/sic-1.0.6.tar.gz b/dist/sic-1.0.6.tar.gz
diff --git a/shipping/setup.py b/shipping/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name='sic',
-    version='1.0.5',
+    version='1.0.6',
     description='Utility for string normalization',
     long_description=long_description,
     long_description_content_type='text/markdown',

diff --git a/sic/core.py b/sic/core.py
@@ -258,7 +258,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0):
                     last_buffer = buffer
                     last_replacement = subtrie['~_']
                     l_map = [b_map[0] for i in range(len(last_replacement))]
-                if '~_' in subtrie and ((on_the_left or on_the_right) or '~m' in subtrie):
+                if '~_' in subtrie and ((on_the_left and on_the_right) or '~m' in subtrie or ('~l' in subtrie and on_the_left) or ('~r' in subtrie and on_the_right)):
                     # now buffer has token to be replaced
                     buffer = subtrie['~_'] + word_separator #if not buffer.endswith(word_separator) else ''
                     b_map = [b_map[0] for i in range(len(buffer))]
@@ -323,7 +323,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0):
         # DRY!
         on_the_right = True
         on_the_left = this_fragment == '' or this_fragment[-1:] == word_separator
-        if '~_' in subtrie and ((on_the_left or on_the_right) or '~m' in subtrie):
+        if '~_' in subtrie and ((on_the_left and on_the_right) or '~m' in subtrie or ('~l' in subtrie and on_the_left) or ('~r' in subtrie and on_the_right)):
             # now buffer has token to be replaced
             buffer = subtrie['~_'] + word_separator #if not buffer.endswith(word_separator) else ''
             b_map = [b_map[0] for i in range(len(buffer))]

diff --git a/test/assets/tokenizer_no_plurals_right.xml b/test/assets/tokenizer_no_plurals_right.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<tokenizer name="test_plurals">
+  <setting name="cs" value="0" />
+  <token to="" from="s" />
+</tokenizer>
diff --git a/test/ut_sic.py b/test/ut_sic.py
@@ -658,6 +658,19 @@ def test_tokenizer_plurals_all(self):
         ]
         assert self.assert_normalization('tokenizer_plurals_all.xml', 'test_plurals', testcases) == True, 'Something is wrong.'
 
+    def test_tokenizer_no_plurals_right(self):
+        testcases = [
+            {
+                'original': 'plurals plurals',
+                'expected': {
+                    'normal': 'plurals plurals',
+                    'list': 'plurals plurals',
+                    'set': 'plurals'
+                }
+            }
+        ]
+        assert self.assert_normalization('tokenizer_no_plurals_right.xml', 'test_plurals', testcases) == True, 'Something is wrong.'
+
 
 if __name__ == '__main__':
     unittest.main()