Skip to content

Commit

Permalink
Merge pull request #40 from pgolo/dev
Browse files Browse the repository at this point in the history
Released 1.0.6
  • Loading branch information
pgolo authored Sep 10, 2020
2 parents 53d1c0a + 0f8d215 commit 30aa135
Show file tree
Hide file tree
Showing 13 changed files with 28 additions and 4 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.6] - 2020-09-10

### Changed

- Fixed bug with replacing substring that is not a token

## [1.0.5] - 2020-09-08

-## Changed
+### Changed

- Normalizer.data is now exposed as a property
- Updated documentation, added performance benchmarks
Expand Down
Binary file removed dist/sic-1.0.5-cp36-cp36m-win_amd64.whl
Binary file not shown.
Binary file removed dist/sic-1.0.5-cp37-cp37m-win_amd64.whl
Binary file not shown.
Binary file removed dist/sic-1.0.5-cp38-cp38-win_amd64.whl
Binary file not shown.
Binary file removed dist/sic-1.0.5.tar.gz
Binary file not shown.
Binary file added dist/sic-1.0.6-cp36-cp36m-win_amd64.whl
Binary file not shown.
Binary file added dist/sic-1.0.6-cp37-cp37m-win_amd64.whl
Binary file not shown.
Binary file added dist/sic-1.0.6-cp38-cp38-win_amd64.whl
Binary file not shown.
Binary file added dist/sic-1.0.6.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion shipping/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setup(
name='sic',
-version='1.0.5',
+version='1.0.6',
description='Utility for string normalization',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down
4 changes: 2 additions & 2 deletions sic/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0):
last_buffer = buffer
last_replacement = subtrie['~_']
l_map = [b_map[0] for i in range(len(last_replacement))]
-if '~_' in subtrie and ((on_the_left or on_the_right) or '~m' in subtrie):
+if '~_' in subtrie and ((on_the_left and on_the_right) or '~m' in subtrie or ('~l' in subtrie and on_the_left) or ('~r' in subtrie and on_the_right)):
# now buffer has token to be replaced
buffer = subtrie['~_'] + word_separator #if not buffer.endswith(word_separator) else ''
b_map = [b_map[0] for i in range(len(buffer))]
Expand Down Expand Up @@ -323,7 +323,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0):
# DRY!
on_the_right = True
on_the_left = this_fragment == '' or this_fragment[-1:] == word_separator
-if '~_' in subtrie and ((on_the_left or on_the_right) or '~m' in subtrie):
+if '~_' in subtrie and ((on_the_left and on_the_right) or '~m' in subtrie or ('~l' in subtrie and on_the_left) or ('~r' in subtrie and on_the_right)):
# now buffer has token to be replaced
buffer = subtrie['~_'] + word_separator #if not buffer.endswith(word_separator) else ''
b_map = [b_map[0] for i in range(len(buffer))]
Expand Down
5 changes: 5 additions & 0 deletions test/assets/tokenizer_no_plurals_right.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<tokenizer name="test_plurals">
<setting name="cs" value="0" />
<token to="" from="s" />
</tokenizer>
13 changes: 13 additions & 0 deletions test/ut_sic.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,19 @@ def test_tokenizer_plurals_all(self):
]
assert self.assert_normalization('tokenizer_plurals_all.xml', 'test_plurals', testcases) == True, 'Something is wrong.'

# Test for the tokenizer_no_plurals_right.xml asset, whose only rule maps the
# substring "s" to an empty string. The expected values below require the input
# "plurals plurals" to keep every "s" in all three expected outputs.
# NOTE(review): presumably this guards the 1.0.6 fix for replacing a substring
# that is not a whole token -- confirm against sic/core.py.
def test_tokenizer_no_plurals_right(self):
testcases = [
{
# Input string for this case.
'original': 'plurals plurals',
# Expected output per result form; how the keys are interpreted is
# decided by assert_normalization, which is defined outside this view.
'expected': {
'normal': 'plurals plurals',
'list': 'plurals plurals',
# Single occurrence expected here -- presumably duplicates are
# collapsed in this form; verify against assert_normalization.
'set': 'plurals'
}
}
]
# Arguments: asset file name, the tokenizer name declared in that asset
# ("test_plurals"), and the cases above; the helper must return True.
assert self.assert_normalization('tokenizer_no_plurals_right.xml', 'test_plurals', testcases) == True, 'Something is wrong.'


if __name__ == '__main__':
unittest.main()

0 comments on commit 30aa135

Please sign in to comment.