Merge pull request #23 from pgolo/dev

sic version 1.3.0

pgolo authored Dec 1, 2020
2 parents c83fccd + 3798f99 commit 9b16033
Showing 21 changed files with 266 additions and 31 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.3.0] - 2020-11-30

### Added

- Support for transitivity in tokenization rules
- Option to identify tokens without adding word separators to the resulting string

## [1.2.0] - 2020-11-09

### Added
52 changes: 43 additions & 9 deletions README.md
@@ -58,10 +58,10 @@ over another depending on the usage scenario. The benchmark is below.

| STRING LENGTH | REPEATS | VERSION | MEAN TIME (s) |
|:-------------:|:-------:|:-------:|:-------------:|
| 71 | 10000 | .tar.gz | 1.4 |
| 71 | 10000 | wheel | 0.4 |
| 710000 | 1 | .tar.gz | 2.2 |
| 710000 | 1 | wheel | 14.0 |
| 71 | 10000 | .tar.gz | 1.8 |
| 71 | 10000 | wheel | 0.5 |
| 710000 | 1 | .tar.gz | 2.7 |
| 710000 | 1 | wheel | 15.9 |
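
For orientation only, a minimal sketch of how a row of this table could be re-measured with the module-level `normalize()` described later in this README; it times whichever build (source distribution or wheel) happens to be installed, and the exact test strings and harness used for the table are not shown here:

```python
# rough re-measurement sketch, not the benchmark script behind the table above
import timeit

import sic

short_string = 'alpha-2-macroglobulin-p' * 3  # roughly the ~71-character case
# total seconds for 10000 calls, comparable to the first rows of the table
print(timeit.timeit(lambda: sic.normalize(short_string), number=10000))
```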

## Tokenization configs
@@ -327,14 +327,15 @@ tokens. The default value is `' '` (space), which seems a reasonable choice for
natural language. However, any character can be specified, which might be more
useful in certain contexts.

`normalizer_option`: The value can be either one of `0`, `1`, or `2` and
`normalizer_option`: The value can be either one of `0`, `1`, `2`, or `3` and
controls how the tokenized string is post-processed:

| VALUE | MODE |
|:-----:|:-------------------------------------------------------------:|
| 0 | No post-processing. |
| 1 | Rearrange tokens in alphabetical order. |
| 2 | Rearrange tokens in alphabetical order and remove duplicates. |
| 3 | Remove all added word separators. |

**Property** `sic.Normalizer.result` retains the result of the last call for
@@ -420,6 +421,8 @@ sic.result # will work in Python >= 3.7

## Examples

### Basic usage

```python
import sic

@@ -444,36 +447,67 @@ print(machine.result)
]
}
"""
```

### Custom word separator

```python
x = machine.normalize('alpha-2-macroglobulin-p', word_separator='|')
print(x) # 'alpha|-|2|-|macroglobulin|-|p'
```

### Post-processing options

```python
# using normalizer_option=1
x = machine.normalize('alpha-2-macroglobulin-p', normalizer_option=1)
print(x) # '- - - 2 alpha macroglobulin p'
```

```python
# using normalizer_option=2
x = machine.normalize('alpha-2-macroglobulin-p', normalizer_option=2)
print(x) # '- 2 alpha macroglobulin p'
```

```python
# using normalizer_option=3
# assuming normalization config includes the following:
# <setting name="cs" value="0" />
# <split value="mis" where="l" />
# <token to="spelling" from="speling" />
x = machine.normalize('Misspeling', normalizer_option=3)
print(x) # 'Misspelling'
```

### Using implicitly instantiated classes

```python
# normalize() with default instance
x = sic.normalize('alpha-2-macroglobulin-p', word_separator='|')
print(x) # 'alpha|-|2|-|macroglobulin|-|p'

# custom configuration for implicitly instantiated normalizer
sic.build_normalizer('/path/to/config.xml')
x = sic.normalize('some string')
print(x) # will be normalized according to config at /path/to/config.xml

# custom config and normalization in one line
x = sic.normalize('some string', tokenizer_config='/path/to/another/config.xml')
print(x) # will be normalized according to config at /path/to/another/config.xml
```
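
As noted above, `sic.result` retains the outcome of the last module-level call (Python >= 3.7); a minimal sketch:

```python
import sic

x = sic.normalize('alpha-2-macroglobulin-p')
print(x)           # the normalized string returned by the call
print(sic.result)  # the retained result of the last call (Python >= 3.7)
```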

### Saving and loading compiled normalizer to/from disk

```python
machine.save('/path/to/file') # will write /path/to/file
another_machine = sic.Normalizer()
another_machine.load('/path/to/file') # will read /path/to/file
```
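
As a quick sanity check (assuming `machine` is the compiled instance saved above), the reloaded normalizer is expected to reproduce the original's output:

```python
# compare the original and the reloaded normalizer on the same input
assert machine.normalize('alpha-2-macroglobulin-p') == \
    another_machine.normalize('alpha-2-macroglobulin-p')
```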

### Adding normalization rules to already compiled model

```python
# (assuming `machine` is a sic.Normalizer instance armed with a tokenization ruleset)
new_ruleset = [sic.ReplaceToken('from', 'to'), sic.SplitToken('token', 'r')]
new_ruleset_string = ''.join([rule.decode() for rule in new_ruleset])
Binary file removed dist/sic-1.2.0-cp36-cp36m-win_amd64.whl
Binary file removed dist/sic-1.2.0-cp37-cp37m-win_amd64.whl
Binary file removed dist/sic-1.2.0-cp38-cp38-win_amd64.whl
Binary file removed dist/sic-1.2.0-cp39-cp39-win_amd64.whl
Binary file removed dist/sic-1.2.0.tar.gz
Binary file added dist/sic-1.3.0-cp36-cp36m-win_amd64.whl
Binary file added dist/sic-1.3.0-cp37-cp37m-win_amd64.whl
Binary file added dist/sic-1.3.0-cp38-cp38-win_amd64.whl
Binary file added dist/sic-1.3.0-cp39-cp39-win_amd64.whl
Binary file added dist/sic-1.3.0.tar.gz
2 changes: 1 addition & 1 deletion shipping/setup.py
@@ -9,7 +9,7 @@

setup(
name='sic',
version='1.2.0',
version='1.3.0',
description='Utility for string normalization',
long_description=long_description,
long_description_content_type='text/markdown',
31 changes: 31 additions & 0 deletions sic/core.pxd
@@ -33,6 +33,26 @@ cdef class Normalizer():
cdef public dict content
cdef public dict normalizer_result

cpdef str expand_instruction(
self,
dict g,
str node,
set visited
)

@cython.locals(
ret=cython.str,
replacements=cython.dict,
line=cython.str,
action=cython.str,
parameter=cython.str,
subject=cython.str
)
cpdef str merge_replacements(
self,
str sdata
)

@cython.locals(
updated = cython.str,
x = cython.str
@@ -66,6 +86,13 @@ cdef class Normalizer():
str s
)

cpdef str align_case(
self,
str replacement,
str original,
int normalizer_option
)

@cython.locals(
ret=cython.list,
i=cython.int,
@@ -79,6 +106,7 @@

@cython.locals(
original_string=cython.str,
parsed_string=cython.str,
subtrie=cython.dict,
this_fragment=cython.str,
buffer=cython.str,
@@ -100,6 +128,9 @@
on_the_left=cython.bint,
on_the_right=cython.bint,
added_separator=cython.bint,
separators=cython.list,
last_separators=cython.list,
separator_index=cython.set,
normalized=cython.str,
i=cython.int,
x=cython.str