Merge pull request #28 from pgolo/dev

Releasing 0.1.0
pgolo · Nov 11, 2020 · a19d094 · a19d094
2 parents cc36faf + d7cb9fc
commit a19d094
Show file tree

Hide file tree

Showing 22 changed files with 383 additions and 80 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.0] - 2020-11-11
+
+### Added
+
+- Context manager support for Model and Utility classes
+- pilsner.Utility.ignore_node() method to arbitrarily ignore labels present in the model
+- pilsner.Model instance created with `simple=True` parameter does not store attributes or otherwise interact with a database
+
 ## [0.0.1] - 2020-10-08
 
 ### Added

diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ class provides storage for the dictionary and string normalization rules, as
 well as low-level methods for populating this storage. `Utility` class provides
 high-level methods for storing and retrieving data to/from `Model` instance.
 
-![Diagram](misc/pilsner-diagram.svg)
+![Diagram](https://github.com/pgolo/pilsner/blob/master/misc/pilsner-diagram.svg)
 
 ## 4. Usage
 
@@ -63,6 +63,12 @@ disk:
 m = pilsner.Model(storage_location=':memory:')
 ```
 
+- To create an empty model that does not store any attributes in a database at all:
+
+```python
+m = pilsner.Model(simple=True)
+```
+
 > If database is created in memory, the model cannot be later saved on disk
 (can only be used instantly).
 
@@ -205,7 +211,8 @@ m.save('path/to/model_name')
 
 - The snippet above will write the following files:
   - `path/to/model_name.attributes`: database with attributes (fields from the
-  dictionary that are not synonyms);
+  dictionary that are not synonyms) - will only be written if `Model` instance
+  is not created with `simple=True` parameter;
   - `path/to/model_name.keywords`: keywords used for disambiguation;
   - `path/to/model_name.normalizers`: string normalization units;
   - `path/to/model_name.0.dictionary`: trie with synonyms;
@@ -230,7 +237,10 @@ m.load('path/to/model_name')
 ```
 
 - In both cases, the program will look for the following files:
-  - `path/to/model_name.attributes`: database with attributes (fields from the dictionary that are not synonyms);
+  - `path/to/model_name.attributes`: database with attributes (fields from the
+  dictionary that are not synonyms) - if not found, `Model` instance will work
+  as if it is initialized with `simple=True` parameter, meaning no attributes
+  other than primary IDs could be processed;
   - `path/to/model_name.keywords`: keywords used for disambiguation;
   - `path/to/model_name.normalizers`: string normalization units;
   - `path/to/model_name.<N>.dictionary`: tries with synonyms (`<N>` being
@@ -253,8 +263,21 @@ parsed = r.parse(
 - The output will be dict object where keys are tuples for location of spotted
 entity in a string (begin, end) and values are dicts for attributes that are
 associated with identified entity (`{'attribute_name': {attribute_values}}`).
+- To ignore an entity by its label rather than by some of its attributes, the
+compiled model can be adjusted using the `pilsner.Utility.ignore_node()` method:
+
+```python
+# Assuming m is pilsner.Model instance, r is pilsner.Utility instance
+r.ignore_node(
+  model=m,
+  label='irrelevant substring'
+)
+# substring 'irrelevant substring' will not be found by pilsner.Utility.parse()
+# even if it is present in the model
+```
+
 - For details about optional parameters, see comments in the code -
-`pilsner.Utility.parse` function).
+`pilsner.Utility.parse()` function.
 
 ## 5. Example
 

diff --git a/dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl b/dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl
diff --git a/dist/pilsner-0.0.1-cp37-cp37m-win_amd64.whl b/dist/pilsner-0.0.1-cp37-cp37m-win_amd64.whl
diff --git a/dist/pilsner-0.0.1-cp38-cp38-win_amd64.whl b/dist/pilsner-0.0.1-cp38-cp38-win_amd64.whl
diff --git a/dist/pilsner-0.0.1.tar.gz b/dist/pilsner-0.0.1.tar.gz
diff --git a/dist/pilsner-0.1.0-cp36-cp36m-win_amd64.whl b/dist/pilsner-0.1.0-cp36-cp36m-win_amd64.whl
diff --git a/dist/pilsner-0.1.0-cp37-cp37m-win_amd64.whl b/dist/pilsner-0.1.0-cp37-cp37m-win_amd64.whl
diff --git a/dist/pilsner-0.1.0-cp38-cp38-win_amd64.whl b/dist/pilsner-0.1.0-cp38-cp38-win_amd64.whl
diff --git a/dist/pilsner-0.1.0-cp39-cp39-win_amd64.whl b/dist/pilsner-0.1.0-cp39-cp39-win_amd64.whl
diff --git a/dist/pilsner-0.1.0.tar.gz b/dist/pilsner-0.1.0.tar.gz
diff --git a/misc/example/example2.py b/misc/example/example2.py
@@ -0,0 +1,95 @@
+# Either install pilsner package to the environment first,
+# or run this from project's root
+
+import sys; sys.path.insert(0, '')
+
+# Import pilsner
+import pilsner
+
+# Initialize Model class
+#m = pilsner.Model()
+with pilsner.Model() as m:
+
+    # Add normalization units
+    m.add_normalizer('default', 'misc/example/default_normalizer.xml')
+    m.add_normalizer('custom', 'misc/example/custom_normalizer.xml')
+
+    # Map names of normalization units to some string values
+    m.normalizer_map = {
+        'animal': 'default',
+        'plant': 'custom'
+    }
+
+    # Initialize Utility class
+    r = pilsner.Utility()
+
+    # Provide table definition for misc/example/living_things.txt file
+    fields = [
+        {
+            'name': 'type',             # attribute name is 'type'
+            'include': True,            # include this column
+            'delimiter': None,          # no delimiter (single value per row)
+            'id_flag': False,           # entity IDs are not in this column
+            'normalizer_flag': True,    # tags for normalization units are in this column
+            'value_flag': False         # string labels (synonyms) are not in this column
+        },
+        {
+            'name': 'id',               # attribute name is 'id'
+            'include': True,
+            'delimiter': None,
+            'id_flag': True,            # entity IDs are in this column
+            'normalizer_flag': False,
+            'value_flag': False
+        },
+        {
+            'name': 'label',            # attribute name is 'label'
+            'include': True,
+            'delimiter': None,
+            'id_flag': False,
+            'normalizer_flag': False,
+            'value_flag': True          # string labels (synonyms) are in this column
+        },
+        {
+            'name': 'habitat',          # attribute name is 'habitat'
+            'include': True,
+            'delimiter': ',',           # multiple values delimited with ',' can be stored in a single row
+            'id_flag': False,
+            'normalizer_flag': False,
+            'value_flag': False
+        }
+    ]
+
+    # Populate Model instance with data from misc/example/living_things.txt file
+    r.compile_model(
+        model=m,
+        filename='misc/example/living_things.txt',
+        fields=fields,
+        word_separator=' ',
+        column_separator='\t',
+        column_enclosure='\n',
+        include_keywords=True
+    )
+
+    # Save Model instance to disk
+    m.save('misc/example/living_things')
+
+# Load Model instance from disk
+#m = pilsner.Model('misc/example/living_things')
+with pilsner.Model('misc/example/living_things') as m:
+
+    # Parse string
+    text_to_parse = '''
+    Little mouse is not recognized and is not frightened by big scary eagle.
+    Daniorerio also does not care much about water lilies, though both are recognized.
+    '''
+    parsed = r.parse(
+        model=m,
+        source_string=text_to_parse,
+        attrs_where={
+            '+': {'habitat': {'air', 'ocean'}} # only consider items with these values in 'habitat' column
+        },
+        attrs_out=['type'] # for each spotted entity, only output 'type' attribute
+    )
+
+# Print out the result: recognized are 'big eagle', 'danio rerio', 'water lily'.
+print(parsed)
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
@@ -7,8 +7,10 @@ cdef class Model(dict):
     cdef public str COMPRESSED_KEY
     cdef public str TOKENIZER_OPTION_KEY
     cdef public str WORD_SEPARATOR_KEY
+    cdef public str IGNORE_KEY
     cdef public str ENTITY_KEY
     cdef public str ATTRS_KEY
+    cdef public set RESERVED_CHARACTERS
     cdef public str INTERNAL_ID_KEY
     cdef public str DICTIONARY_KEY
     cdef public str KEYWORDS_KEY

diff --git a/pilsner/model.py b/pilsner/model.py
@@ -10,21 +10,26 @@
 class Model(dict):
     """This class is a dict that stores tries and metadata, and provides functions and methods associated with the storage."""
 
-    def __init__(self, filename='', storage_location='', debug_mode=False, verbose_mode=False):
+    def __init__(self, filename='', storage_location='', simple=False, debug_mode=False, verbose_mode=False):
         """Creates Model instance.
 
         Args:
             str *filename*: if provided, loads model from disk, see load() method
-            str *storage_location*:
+            str *storage_location*: location for SQLite database that stores attributes (when blank, the database will be stored on disk in a file with randomized name)
+            bool *simple*: when True, attributes will not be stored or processed, only labels and primary IDs (default False)
+            bool *debug_mode*: increase verbosity (default False)
+            bool *verbose_mode*: increase verbosity even more (default False)
         """
         self.CONTENT_KEY = '~content'
         self.SPECS_KEY = '~specs'
         self.COMPRESSED_KEY = '~compressed'
         self.TOKENIZER_OPTION_KEY = '~tokenizer_option'
         self.WORD_SEPARATOR_KEY = '~word_separator'
-        self.ENTITY_KEY = '~i'
-        self.ATTRS_KEY = '~p'
-        self.INTERNAL_ID_KEY = '~iid'
+        self.IGNORE_KEY = '\x07' # BEL, formerly '~x'
+        self.ENTITY_KEY = '\x03' # ETX, formerly '~i'
+        self.ATTRS_KEY = '\x05' # ENQ, formerly '~p'
+        self.RESERVED_CHARACTERS = set([self.IGNORE_KEY, self.ENTITY_KEY, self.ATTRS_KEY])
+        self.INTERNAL_ID_KEY = '~internal_id_map'
         self.DICTIONARY_KEY = '~dictionary'
         self.KEYWORDS_KEY = '~keywords'
         self.NORMALIZER_KEY = '~normalization'
@@ -49,19 +54,25 @@ def __init__(self, filename='', storage_location='', debug_mode=False, verbose_m
         self[self.DEFAULT_NORMALIZER_KEY] = ''
         self[self.DICTIONARY_KEY] = []
         self[self.KEYWORDS_KEY] = {}
+        self[self.INTERNAL_ID_KEY] = {}
         self[self.DATASOURCE_KEY] = self.DEFAULT_DATASOURCE
         self[self.WORD_SEPARATOR_KEY] = self.DEFAULT_WORD_SEPARATOR
         self[self.TOKENIZER_OPTION_KEY] = self.DEFAULT_TOKENIZER_OPTION
-        self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
-        self.cursor = self.connection.cursor()
+        if not simple:
+            self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+            self.cursor = self.connection.cursor()
+        else:
+            self.connection = None
+            self.cursor = None
         self.normalizer_map = {}
         self.sic_builder = sic.Builder(debug_mode=debug_mode, verbose_mode=verbose_mode)
         if filename != '':
             self.load(filename)
 
     def destroy(self):
         """Closes connection, removes temporary database."""
-        self.connection.close()
+        if self.connection is not None:
+            self.connection.close()
         if os.path.exists(self.DEFAULT_DATASOURCE):
             os.remove(self.DEFAULT_DATASOURCE)
 
@@ -72,6 +83,14 @@ def __del__(self):
         except:
             pass
 
+    def __enter__(self):
+        """Enter `with`."""
+        return self
+
+    def __exit__(self, ex_type, ex_value, ex_traceback):
+        """Exit `with`."""
+        self.destroy()
+
     def save(self, filename):
         """Saves model to disk.
         Note: this will throw exception if temporary database is stored in memory.
@@ -86,14 +105,15 @@ def save(self, filename):
             filename.attributes
         """
         try:
-            assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
+            assert self.connection is None or os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
             assert len(self[self.DICTIONARY_KEY]) > 0, 'Model is empty, nothing to save'
         except Exception as e:
             self.destroy()
             raise e
         logging.debug('Saving model "%s"' % (filename))
-        self.cursor.close()
-        self.connection.close()
+        if self.connection is not None:
+            self.cursor.close()
+            self.connection.close()
         normalizers = {
             self.DEFAULT_NORMALIZER_KEY: self[self.DEFAULT_NORMALIZER_KEY],
             self.WORD_SEPARATOR_KEY: self[self.WORD_SEPARATOR_KEY],
@@ -110,10 +130,13 @@ def save(self, filename):
         with open('%s.keywords' % (filename), mode='wb') as f:
             pickle.dump(self[self.KEYWORDS_KEY], f)
             logging.debug('Saved "%s"' % ('%s.keywords' % (filename)))
-        shutil.copyfile(self[self.DATASOURCE_KEY], '%s.attributes' % (filename))
-        logging.debug('Saved "%s"' % ('%s.attributes' % (filename)))
-        self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
-        self.cursor = self.connection.cursor()
+        if self.connection is not None:
+            shutil.copyfile(self[self.DATASOURCE_KEY], '%s.attributes' % (filename))
+            logging.debug('Saved "%s"' % ('%s.attributes' % (filename)))
+            self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+            self.cursor = self.connection.cursor()
+        else:
+            logging.warning('Attributes database not found, model has been saved as "simple"')
         logging.debug('Saved "%s"' % (filename))
         return True
 
@@ -131,8 +154,9 @@ def load(self, filename):
         """
         logging.debug('Loading model "%s"' % (filename))
         self[self.DATASOURCE_KEY] = '%s.attributes' % (filename)
-        self.cursor.close()
-        self.connection.close()
+        if self.connection is not None:
+            self.cursor.close()
+            self.connection.close()
         with open('%s.normalizers' % (filename), mode='rb') as f:
             normalizers = pickle.load(f)
         for normalizer_name in normalizers[self.NORMALIZER_KEY]:
@@ -153,8 +177,13 @@ def load(self, filename):
             self[self.KEYWORDS_KEY] = keywords
         logging.debug('Loaded "%s"' % ('%s.keywords' % (filename)))
         self[self.DATASOURCE_KEY] = '%s.attributes' % (filename)
-        self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
-        self.cursor = self.connection.cursor()
+        if os.path.exists(self[self.DATASOURCE_KEY]):
+            self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+            self.cursor = self.connection.cursor()
+        else:
+            self.connection = None
+            self.cursor = None
+            logging.warning('Could not load attributes, model is in "simple" mode')
         return True
 
     def add_normalizer(self, normalizer_name, filename, default=False):
@@ -167,6 +196,14 @@ def add_normalizer(self, normalizer_name, filename, default=False):
         """
         logging.debug('Adding normalizer "%s" from "%s"' % (normalizer_name, filename))
         normalizer = self.sic_builder.build_normalizer(filename)
+        normalizer.make_tokenizer(
+            ''.join([rule.decode() for rule in [
+                sic.ReplaceCharacter(self.IGNORE_KEY, ''),
+                sic.ReplaceCharacter(self.ENTITY_KEY, ''),
+                sic.ReplaceCharacter(self.ATTRS_KEY, '')]
+            ]),
+            update=True
+        )
         self[self.NORMALIZER_KEY][normalizer_name] = normalizer
         self.normalizer_map[normalizer_name] = normalizer_name
         if len(self[self.NORMALIZER_KEY]) == 1 or default:
@@ -180,9 +217,12 @@ def create_recognizer_schema(self, cursor):
         Args:
             sqlite3.connect.cursor *cursor*: cursor to use for throwing queries
         """
-        logging.debug('Creating schema for permanent storage')
-        cursor.execute('create table attrs (n integer, iid integer, attr_name text, attr_value text);')
-        logging.debug('Created schema for permanent storage')
+        if cursor is not None:
+            logging.debug('Creating schema for permanent storage')
+            cursor.execute('create table attrs (n integer, iid integer, attr_name text, attr_value text);')
+            logging.debug('Created schema for permanent storage')
+        else:
+            logging.debug('No cursor is provided, schema is not created')
         return True
 
     def pack_subtrie(self, trie, compressed, prefix):
@@ -255,13 +295,16 @@ def store_attributes(self, line_number, internal_id, subtrie, specs, columns):
         if self.ENTITY_KEY not in subtrie:
             subtrie[self.ENTITY_KEY] = []
         subtrie[self.ENTITY_KEY].append(line_number)
-        for k in specs['fields']:
-            if specs['fields'][k][3]:
-                continue
-            if not specs['fields'][k][1]:
-                self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, columns[specs['fields'][k][0]]))
-            else:
-                _ = [self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split(specs['fields'][k][1]))]
+        if self.cursor is not None:
+            for k in specs['fields']:
+                if specs['fields'][k][3]:
+                    continue
+                if not specs['fields'][k][1]:
+                    self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, columns[specs['fields'][k][0]]))
+                else:
+                    _ = [self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split(specs['fields'][k][1]))]
+        else:
+            self[self.INTERNAL_ID_KEY][line_number] = columns[specs['id'][0]]
 
     def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure):
         """Extracts values of columns in a file and associates them with internal entity ID.