Merge pull request #38 from pgolo/dev
Release 1.3.2
pgolo authored Jan 22, 2021
2 parents 69392bc + 5dba14d commit 591a57f
Showing 18 changed files with 133 additions and 53 deletions.
59 changes: 37 additions & 22 deletions CHANGELOG.md
@@ -5,74 +5,89 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.3.1] - 2020-12-08
## sic 1.3

### [1.3.2] - 2021-01-21

#### Changed

- Improved logical reasoning to resolve conflicting instructions

### [1.3.1] - 2020-12-08

#### Changed

### Changed
- Updated logic with respect to spelling correction

## [1.3.0] - 2020-11-30
### [1.3.0] - 2020-11-30

### Added
#### Added

- Support for transitivity in tokenization rules
- Option to identify tokens but don't add word separators to resulting string

## [1.2.0] - 2020-11-09
## sic 1.2

### [1.2.0] - 2020-11-09

### Added
#### Added

- Tokenization rules can be added to a compiled model

## [1.1.0] - 2020-10-31
## sic 1.1

### Added
### [1.1.0] - 2020-10-31

#### Added

- Implicit instantiation of core classes
- Classes and function for ad hoc creation of tokenization config
- Methods to save (pickle) and load (unpickle) compiled Normalizer instance
- Wheel for Python 3.9

## [1.0.6] - 2020-09-10
## sic 1.0

### [1.0.6] - 2020-09-10

### Changed
#### Changed

- Fixed bug with replacing substring that is not a token

## [1.0.5] - 2020-09-08
### [1.0.5] - 2020-09-08

### Changed
#### Changed

- Normalizer.data is now exposed as a property
- Updated documentation, added performance benchmarks
- Installable package is either pure Python or wheel with precompiled Cython

## [1.0.4] - 2020-09-03
### [1.0.4] - 2020-09-03

### Added
#### Added

- Normalizer.result['r_map'] attribute
- Scripts to build wheels

## [1.0.3] - 2020-07-30
### [1.0.3] - 2020-07-30

### Added
#### Added

- Normalizer.data attribute is now exposed and can be accessed directly

## [1.0.2] - 2020-06-12
### [1.0.2] - 2020-06-12

### Added
#### Added

- Added README.md in released package

## [1.0.1] - 2020-06-12
### [1.0.1] - 2020-06-12

### Added
#### Added

- Module is cythonized at the time of installation

## [1.0.0] - 2020-06-12
### [1.0.0] - 2020-06-12

### Added
#### Added

- Configurable string normalization module
7 changes: 5 additions & 2 deletions README.md
@@ -203,7 +203,10 @@ Transformation is applied in the following order:
2. Splitting tokens
3. Replacing tokens

When splitting tokens, longer ones shadow shorter ones.
When splitting tokens, longer ones shadow shorter ones. Token replacement
instructions may contradict each other locally, but the entire set must
converge so that each token ends up with a single replacement option
(otherwise a ValueError exception is thrown).
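
As the new `test_conflict_converging_tokenizer` unit test further down illustrates,
a rule set that disagrees locally but converges on a single terminal token compiles
and normalizes cleanly. A minimal usage sketch, assuming the package is installed
and the script is run from the repository root so the new test asset is reachable
at the path below:

```python
import sic

# "abc" -> "def" -> "ghi" -> "jkl" plus the shortcut "abc" -> "jkl" disagree
# locally but converge on "jkl", so the tokenizer compiles without error.
builder = sic.Builder()
worker = builder.build_normalizer('test/assets/tokenizer_conflict_converging.xml')
print(worker.normalize('123abc456mmm'))  # '123 jkl 456 nnn', per the new unit test
```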

## Usage

@@ -258,7 +261,7 @@ model.add_rule(sic.ReplaceCharacter('a', 'z'))
> "bad" --> "good" will not be used; "bad" --> "better" will be used instead
**Method** `sic.Model.remove_rule` removes single tokenization instruction from
Model instance if is there:
Model instance if it is there:
```python
model.remove_rule(sic.ReplaceToken('bad', 'good'))
Binary file removed dist/sic-1.3.1-cp36-cp36m-win_amd64.whl
Binary file removed dist/sic-1.3.1-cp37-cp37m-win_amd64.whl
Binary file removed dist/sic-1.3.1-cp38-cp38-win_amd64.whl
Binary file removed dist/sic-1.3.1-cp39-cp39-win_amd64.whl
Binary file removed dist/sic-1.3.1.tar.gz
Binary file added dist/sic-1.3.2-cp36-cp36m-win_amd64.whl
Binary file added dist/sic-1.3.2-cp37-cp37m-win_amd64.whl
Binary file added dist/sic-1.3.2-cp38-cp38-win_amd64.whl
Binary file added dist/sic-1.3.2-cp39-cp39-win_amd64.whl
Binary file added dist/sic-1.3.2.tar.gz
2 changes: 1 addition & 1 deletion shipping/setup.py
@@ -9,7 +9,7 @@

setup(
name='sic',
version='1.3.1',
version='1.3.2',
description='Utility for string normalization',
long_description=long_description,
long_description_content_type='text/markdown',
13 changes: 9 additions & 4 deletions sic/core.pxd
@@ -33,11 +33,15 @@ cdef class Normalizer():
cdef public dict content
cdef public dict normalizer_result

cpdef str expand_instruction(
@cython.locals(
next_nodes=cython.set
)
cpdef set expand_instruction(
self,
dict g,
str node,
set visited
str seed,
set nodes=*,
int hops=*
)

@cython.locals(
@@ -46,7 +50,8 @@
line=cython.str,
action=cython.str,
parameter=cython.str,
subject=cython.str
subject=cython.str,
node=cython.str
)
cpdef str merge_replacements(
self,
55 changes: 31 additions & 24 deletions sic/core.py
@@ -127,21 +127,27 @@ def data(self):
def data(self, obj):
self.content = obj

def expand_instruction(self, g, node, visited):
"""Helper function that traverses a path and returns terminal node.
def expand_instruction(self, g, seed, nodes=set(), hops=0):
"""Helper function that traverses a path and returns set of terminal nodes.
For a directed graph *g*, it is assumed that each node has at most 1 descendant.
Args:
*g* is a dict(str, str) representing a graph
*node* is starting node
*visited* is a set to keep visited nodes for cycle detection
*g* is a dict(str, set) representing a graph
*seed* is a node (str) to expand
*nodes* is a set of nodes on the way of expansion
*hops* is depth of expansion so far
"""
if node in visited:
raise RecursionError('Circular reference in replacement instruction regarding "%s"' % (node))
visited.add(node)
if node in g:
node = self.expand_instruction(g, g[node], visited)
return node
next_nodes = set()
if hops == 0:
nodes = {seed}
elif seed in nodes:
raise RecursionError('Circular reference in replacement instruction regarding "%s"' % (seed))
for node in nodes:
if node in g:
next_nodes = next_nodes.union(self.expand_instruction(g, seed, g[node], hops + 1))
else:
next_nodes.add(node)
return next_nodes

def merge_replacements(self, sdata):
"""This function takes *sdata* config string, merges classes of "c" and "r" tokenization rules,
@@ -159,15 +165,16 @@ if action not in replacements:
if action not in replacements:
replacements[action] = dict()
if subject not in replacements[action]:
replacements[action][subject] = parameter
elif replacements[action][subject] != parameter:
raise ValueError('Conflicting instruction: (replace "%s" --> "%s") vs (replace "%s" --> "%s")' % (subject, replacements[action][subject], subject, parameter))
replacements[action][subject] = set()
replacements[action][subject].add(parameter)
continue
ret += '%s\n' % (line)
for action in replacements:
for node in replacements[action]:
replacements[action][node] = self.expand_instruction(replacements[action], node, set())
ret += '%s\t%s\t%s\n' % (action, replacements[action][node], node)
replacements[action][node] = self.expand_instruction(replacements[action], node)
if len(replacements[action][node]) > 1:
raise ValueError('Conflicting instruction: (replace "%s" --> "%s") vs (replace "%s" --> "%s")' % (subject, replacements[action][subject], subject, parameter))
ret += '%s\t%s\t%s\n' % (action, next(iter(replacements[action][node])), node)
return ret

def update_str_with_chmap(self, value, chmap):
@@ -257,7 +264,7 @@ def make_tokenizer(self, sdata, update=False):
if character not in subtrie:
subtrie[character] = dict()
subtrie = subtrie[character]
if parameter_key:
if parameter_key != '':
for parameter_keylet in parameter_key:
subtrie[actions[action][parameter_keylet]] = parameter_value
else:
@@ -390,7 +397,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont
temp_index, temp_buffer, t_map = current_index, buffer, list(b_map)
if character in subtrie:
if not began_reading:
if on_the_left and this_fragment and this_fragment[-1:] not in (word_separator, control_character):
if on_the_left and this_fragment != '' and this_fragment[-1:] not in (word_separator, control_character):
this_fragment += control_character
if len(f_map) == len(this_fragment):
f_map[-1] = current_index
@@ -410,7 +417,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont
b_map += [current_index for x in character]
else:
on_the_right = on_the_right or character in (word_separator, control_character)
on_the_left = not this_fragment or this_fragment[-1:] in (word_separator, control_character)
on_the_left = this_fragment == '' or this_fragment[-1:] in (word_separator, control_character)
began_reading = False
# check what's in the buffer, and do the right thing
if '~_' in subtrie:
@@ -444,7 +451,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont
b_map[-1] = current_index
else:
b_map.append(current_index)
if last_buffer:
if last_buffer != '':
f_map = f_map[:-len(last_buffer)] + l_map
this_fragment = this_fragment[:-len(last_buffer)] + last_replacement
temp_index = -1
@@ -458,7 +465,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont
current_index, buffer, b_map = temp_index, temp_buffer, list(t_map) # plain jumping back which causes performance hit, think about better solution
temp_index, temp_buffer, t_map = -1, '', []
continue
if on_the_left and this_fragment and this_fragment[-1:] not in (word_separator, control_character) and character not in (word_separator, control_character) and not added_separator:
if on_the_left and this_fragment != '' and this_fragment[-1:] not in (word_separator, control_character) and character not in (word_separator, control_character) and not added_separator:
this_fragment += control_character
if len(f_map) == len(this_fragment):
f_map[-1] = current_index
@@ -492,7 +499,7 @@ def normalize(self, source_string, word_separator=' ', normalizer_option=0, cont
if not buffer.startswith(word_separator) and not buffer.startswith(control_character):
buffer = word_separator + buffer
b_map.insert(0, total_length - 1)
if last_buffer:
if last_buffer != '':
f_map += l_map
this_fragment = this_fragment[:-len(last_buffer)] + last_replacement
if on_the_left and this_fragment[-1:] not in (word_separator, control_character):
@@ -585,10 +592,10 @@ def convert_xml(self, filename, res, batch_name):
dict *res* is initial dict with tokenization rules
str *batch_name* is name of tokenizer
"""
result = res if res else {'name': filename if not batch_name else batch_name}
result = res if res else {'name': filename if batch_name == '' else batch_name}
tree = et.parse(filename)
root = tree.getroot()
if 'name' in root.attrib and not batch_name:
if 'name' in root.attrib and batch_name == '':
result['name'] = root.attrib['name']
import_elements = root.findall('./import')
if import_elements:
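
For intuition, the reworked `expand_instruction` above follows every chain of
replacement rules from a starting token to its terminal token(s), flagging circular
references along the way. A standalone sketch of the same traversal idea (not the
library's exact code), using a plain dict-of-sets graph:

```python
def expand(graph, seed):
    """Return the terminal token(s) reachable from *seed* in *graph*.

    *graph* maps each token to the set of tokens it may be replaced with.
    A replacement chain that leads back to *seed* raises RecursionError.
    """
    terminals, visited, frontier = set(), set(), {seed}
    while frontier:
        node = frontier.pop()
        if node in visited:
            continue                      # reconverging path, already handled
        visited.add(node)
        successors = graph.get(node, set())
        if not successors:
            terminals.add(node)           # no further replacement: terminal token
        elif seed in successors:
            raise RecursionError('Circular reference involving "%s"' % seed)
        else:
            frontier |= successors
    return terminals


# "abc" -> "def" -> "ghi" -> "jkl" plus the shortcut "abc" -> "jkl":
g = {'abc': {'def', 'jkl'}, 'def': {'ghi'}, 'ghi': {'jkl'}}
print(expand(g, 'abc'))  # {'jkl'}: the chains converge, so the rule set is valid
```

When the returned set holds more than one token, `merge_replacements` treats the
rule set as genuinely conflicting and raises ValueError; the new test asset that
follows exercises the converging case.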
16 changes: 16 additions & 0 deletions test/assets/tokenizer_conflict_converging.xml
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<tokenizer name="expanded">
<setting name="cs" value="0" />
<token from="abc" to="def" />
<token from="def" to="ghi" />
<token from="ghi" to="jkl" />
<token from="mmm" to="nnn" />
<token from="qwe" to="rty" />
<token from="qwe" to="uiop" />
<token from="rty" to="uiop" />
<token from="abc" to="jkl" />
<character from="z" to="y" />
<character from="y" to="x" />
<character from="x" to="w" />
<split value="abc" where="lmr" />
</tokenizer>
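
The asset above maps "abc" to both "def" and "jkl", and "qwe" to both "rty" and
"uiop", yet every chain converges on a single terminal token, so the normalizer
builds. By contrast, a rule set whose chains end in more than one terminal is
rejected. A hedged sketch of that failure mode, assuming a minimal config like the
hypothetical one below is accepted and that the ValueError surfaces when the
normalizer is built from it:

```python
import os
import tempfile

import sic

# Hypothetical ambiguous config: "qwe" may become either "rty" or "asd",
# and no further rule brings the two chains back together.
conflicting = '''<?xml version="1.0" encoding="UTF-8"?>
<tokenizer name="conflicting">
    <setting name="cs" value="0" />
    <token from="qwe" to="rty" />
    <token from="qwe" to="asd" />
</tokenizer>'''

path = os.path.join(tempfile.mkdtemp(), 'tokenizer_conflicting.xml')
with open(path, mode='w', encoding='utf8') as f:
    f.write(conflicting)

try:
    sic.Builder().build_normalizer(path)
except ValueError as error:
    print('Rejected as expected:', error)
```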
2 changes: 2 additions & 0 deletions test/assets/tokenizer_expanded.xml
@@ -5,6 +5,8 @@
<token from="def" to="ghi" />
<token from="ghi" to="jkl" />
<token from="mmm" to="nnn" />
<token from="qwe" to="rty" />
<token from="rty" to="uiop" />
<character from="z" to="y" />
<character from="y" to="x" />
<character from="x" to="w" />
32 changes: 32 additions & 0 deletions test/ut_sic.py
@@ -862,14 +862,46 @@ def test_expanded_tokenizer(self):
expected1 = '123 jkl 456 nnn'
test_string4 = '123xyz456 nnn'
expected2 = '123 www 456 nnn'
test_string5 = '123qwe456'
test_string6 = '123rty456'
expected3 = '123 uiop 456'
normalized1 = worker.normalize(test_string1)
normalized2 = worker.normalize(test_string2)
normalized3 = worker.normalize(test_string3)
normalized4 = worker.normalize(test_string4)
normalized5 = worker.normalize(test_string5)
normalized6 = worker.normalize(test_string6)
assert expected1 == normalized1, 'Expected "%s", got "%s".' % (expected1, normalized1)
assert expected1 == normalized2, 'Expected "%s", got "%s".' % (expected1, normalized2)
assert expected1 == normalized3, 'Expected "%s", got "%s".' % (expected1, normalized3)
assert expected2 == normalized4, 'Expected "%s", got "%s".' % (expected2, normalized4)
assert expected3 == normalized5, 'Expected "%s", got "%s".' % (expected3, normalized5)
assert expected3 == normalized6, 'Expected "%s", got "%s".' % (expected3, normalized6)

def test_conflict_converging_tokenizer(self):
builder = sic.Builder()
worker = builder.build_normalizer('%s/tokenizer_conflict_converging.xml' % (self.assets_dir))
test_string1 = '123abc456mmm'
test_string2 = '123def456mmm'
test_string3 = '123ghi456mmm'
expected1 = '123 jkl 456 nnn'
test_string4 = '123xyz456 nnn'
expected2 = '123 www 456 nnn'
test_string5 = '123qwe456'
test_string6 = '123rty456'
expected3 = '123 uiop 456'
normalized1 = worker.normalize(test_string1)
normalized2 = worker.normalize(test_string2)
normalized3 = worker.normalize(test_string3)
normalized4 = worker.normalize(test_string4)
normalized5 = worker.normalize(test_string5)
normalized6 = worker.normalize(test_string6)
assert expected1 == normalized1, 'Expected "%s", got "%s".' % (expected1, normalized1)
assert expected1 == normalized2, 'Expected "%s", got "%s".' % (expected1, normalized2)
assert expected1 == normalized3, 'Expected "%s", got "%s".' % (expected1, normalized3)
assert expected2 == normalized4, 'Expected "%s", got "%s".' % (expected2, normalized4)
assert expected3 == normalized5, 'Expected "%s", got "%s".' % (expected3, normalized5)
assert expected3 == normalized6, 'Expected "%s", got "%s".' % (expected3, normalized6)

def test_spelling_correction(self):
builder = sic.Builder()
