Merge pull request #27 from pgolo/dev

sic-1.3.1
pgolo · Dec 8, 2020 · f543894 · f543894
2 parents 9b16033 + 026da75
commit f543894
Show file tree

Hide file tree

Showing 17 changed files with 93 additions and 97 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.3.1] - 2020-12-08
+
+### Changed
+- Updated logic with respect to spelling correction
+
 ## [1.3.0] - 2020-11-30
 
 ### Added

diff --git a/README.md b/README.md
@@ -315,12 +315,13 @@ data structure in `sic.Normalizer` instance.
 according to the rules ingested at the time of class initialization, and
 returns normalized string.
 
-|     ARGUMENT      | TYPE | DEFAULT |            DESCRIPTION             |
-|:-----------------:|:----:|:-------:|:----------------------------------:|
-| source_string     | str  |   n/a   | String to normalize.               |
-| word_separator    | str  |   ' '   | Word delimiter (single character). |
-| normalizer_option | int  |    0    | Mode of post-processing.           |
-|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+|     ARGUMENT      | TYPE | DEFAULT |            DESCRIPTION                              |
+|:-----------------:|:----:|:-------:|:---------------------------------------------------:|
+| source_string     | str  |   n/a   | String to normalize.                                |
+| word_separator    | str  |   ' '   | Word delimiter (single character).                  |
+| normalizer_option | int  |    0    | Mode of post-processing.                            |
+| control_character | str  | '\x00'  | Character masking word delimiter (single character) |
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 
 `word_separator`: Specified character will be considered a boundary between
 tokens. The default value is `' '` (space), which seems a reasonable choice for
@@ -338,6 +339,11 @@ controls the way tokenized string is post-processed:
 |   3   | Remove all added word separators.                             |
 |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 
+`control_character`: Implementation detail - the character that is inserted
+into the parsed string at run time to serve as a word delimiter. If the parsed
+string already contains this character, normalization will return an error.
+The default value is `\x00`.
+
 **Property** `sic.Normalizer.result` retains the result of the last call to the
 `sic.Normalizer.normalize` function as a dict object with the following keys:
 
@@ -387,13 +393,14 @@ for `sic.Normalizer.load()` function.
 instantly creates new local `sic.Normalizer` class, and uses it to perform
 requested string normalization.
 
-|     ARGUMENT      | TYPE | DEFAULT |            DESCRIPTION                |
-|:-----------------:|:----:|:-------:|:-------------------------------------:|
-| source_string     | str  |   n/a   | String to normalize.                  |
-| word_separator    | str  |   ' '   | Word delimiter (single character).    |
-| normalizer_option | int  |    0    | Mode of post-processing.              |
-| tokenizer_config  | str  |  None   | Path to tokenizer configuration file. |
-||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+|     ARGUMENT      | TYPE | DEFAULT |            DESCRIPTION                              |
+|:-----------------:|:----:|:-------:|:---------------------------------------------------:|
+| source_string     | str  |   n/a   | String to normalize.                                |
+| word_separator    | str  |   ' '   | Word delimiter (single character).                  |
+| normalizer_option | int  |    0    | Mode of post-processing.                            |
+| control_character | str  | '\x00'  | Character masking word delimiter (single character) |
+| tokenizer_config  | str  |  None   | Path to tokenizer configuration file.               |
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 
 If `tokenizer_config` argument is not provided, the function will use global
 instance of `sic.Normalizer` class (will create it if it is not initialized).

diff --git a/dist/sic-1.3.0-cp36-cp36m-win_amd64.whl b/dist/sic-1.3.0-cp36-cp36m-win_amd64.whl
diff --git a/dist/sic-1.3.0-cp37-cp37m-win_amd64.whl b/dist/sic-1.3.0-cp37-cp37m-win_amd64.whl
diff --git a/dist/sic-1.3.0-cp38-cp38-win_amd64.whl b/dist/sic-1.3.0-cp38-cp38-win_amd64.whl
diff --git a/dist/sic-1.3.0-cp39-cp39-win_amd64.whl b/dist/sic-1.3.0-cp39-cp39-win_amd64.whl
diff --git a/dist/sic-1.3.0.tar.gz b/dist/sic-1.3.0.tar.gz
diff --git a/dist/sic-1.3.1-cp36-cp36m-win_amd64.whl b/dist/sic-1.3.1-cp36-cp36m-win_amd64.whl
diff --git a/dist/sic-1.3.1-cp37-cp37m-win_amd64.whl b/dist/sic-1.3.1-cp37-cp37m-win_amd64.whl
diff --git a/dist/sic-1.3.1-cp38-cp38-win_amd64.whl b/dist/sic-1.3.1-cp38-cp38-win_amd64.whl
diff --git a/dist/sic-1.3.1-cp39-cp39-win_amd64.whl b/dist/sic-1.3.1-cp39-cp39-win_amd64.whl
diff --git a/dist/sic-1.3.1.tar.gz b/dist/sic-1.3.1.tar.gz
diff --git a/shipping/setup.py b/shipping/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name='sic',
-    version='1.3.0',
+    version='1.3.1',
     description='Utility for string normalization',
     long_description=long_description,
     long_description_content_type='text/markdown',

diff --git a/sic/core.pxd b/sic/core.pxd
@@ -128,9 +128,6 @@ cdef class Normalizer():
         on_the_left=cython.bint,
         on_the_right=cython.bint,
         added_separator=cython.bint,
-        separators=cython.list,
-        last_separators=cython.list,
-        separator_index=cython.set,
         normalized=cython.str,
         i=cython.int,
         x=cython.str
@@ -139,7 +136,8 @@ cdef class Normalizer():
         self,
         str source_string,
         str word_separator=*,
-        int normalizer_option=*
+        int normalizer_option=*,
+        str control_character=*
     )
 
     cpdef save(