From 30af57340a0ecfe79b41fd423ac4e30fb9eab6b7 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Tue, 21 Jul 2020 16:12:47 -0700 Subject: [PATCH 01/19] Delete .gitattributes --- .gitattributes | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index fcadb2cf97..0000000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -* text eol=lf From 3f7047f2d11c9caa99665e8e14c90de1fd7813aa Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Wed, 22 Jul 2020 09:44:40 -0700 Subject: [PATCH 02/19] test showing FT failure as W2V --- gensim/test/test_fasttext.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index c8c9b0582c..1ed2fa45a8 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -19,6 +19,7 @@ from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack from gensim.models.keyedvectors import KeyedVectors from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences +from gensim.test.test_word2vec import TestWord2VecModel import gensim.models._fasttext_bin from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_bytes @@ -1371,6 +1372,7 @@ def _read_fb(fin): class ZeroBucketTest(unittest.TestCase): + """Tests FastText with no buckets/no-ngrams (essentially FastText-as-Word2Vec""" def test_in_vocab(self): model = train_gensim(bucket=0) self.assertIsNotNone(model.wv['anarchist']) @@ -1379,6 +1381,15 @@ def test_out_of_vocab(self): model = train_gensim(bucket=0) self.assertRaises(KeyError, model.wv.word_vec, 'streamtrain') + def test_cbow_neg(self): + """See gensim.test.test_word2vec.TestWord2VecModel.test_cbow_neg""" + model = FT_gensim( + sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, + min_count=5, epochs=10, workers=2, sample=0, + max_n=0 # force no char-ngram buckets + ) + TestWord2VecModel.model_sanity(self, model) + class UnicodeVocabTest(unittest.TestCase): def test_ascii(self): From ac9126de4d5e848f80fdff03926a705ec340aad4 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Tue, 21 Jul 2020 21:16:50 -0700 Subject: [PATCH 03/19] set .vectors even when ngrams off --- gensim/models/fasttext.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 5c07a0b540..921878e128 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1401,6 +1401,7 @@ def adjust_vectors(self): """ if self.bucket == 0: + self.vectors = self.vectors_vocab # no ngrams influence return self.vectors = self.vectors_vocab[:].copy() From 0316084be31975301f192142076708d0cf754026 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Wed, 22 Jul 2020 13:15:03 -0700 Subject: [PATCH 04/19] use _save_specials/_load_specials per type --- gensim/models/doc2vec.py | 2 +- gensim/models/fasttext.py | 88 +++++++++++++++++++----------- gensim/models/word2vec.py | 110 +++++++++++++++++++++----------------- 3 files changed, 118 insertions(+), 82 deletions(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 1a55ad9b5f..ff93dbfcb5 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -790,7 +790,7 @@ def load(cls, *args, **kwargs): except AttributeError as ae: logger.error( "Model load error. Was model saved using code from an older Gensim Version? 
" - "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "Try loading older model using gensim-3.8.3, then re-saving, to restore " "compatibility with current code.") raise ae diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 921878e128..928142580f 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -279,6 +279,7 @@ import os import numpy as np +import itertools as it from numpy import ones, vstack, float32 as REAL import six from collections.abc import Iterable @@ -822,7 +823,6 @@ def save(self, *args, **kwargs): Load :class:`~gensim.models.fasttext.FastText` model. """ - kwargs['ignore'] = kwargs.get('ignore', []) + ['buckets_word', ] super(FastText, self).save(*args, **kwargs) @classmethod @@ -845,25 +845,15 @@ def load(cls, *args, **kwargs): Save :class:`~gensim.models.fasttext.FastText` model. """ - model = super(FastText, cls).load(*args, rethrow=True, **kwargs) - - if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): - # TODO: try trainables-location - model.wv.vectors_vocab_lockf = ones(1, dtype=REAL) - if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): - # TODO: try trainables-location - model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) - # fixup mistakenly overdimensioned gensim-3.x lockf arrays - if len(model.wv.vectors_vocab_lockf.shape) > 1: - model.wv.vectors_vocab_lockf = ones(1, dtype=REAL) - if len(model.wv.vectors_ngrams_lockf.shape) > 1: - model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL) - if hasattr(model, 'bucket'): - del model.bucket # should only exist in one place: the wv subcomponent - if not hasattr(model.wv, 'buckets_word') or not model.wv.buckets_word: - model.wv.recalc_char_ngram_buckets() + return super(FastText, cls).load(*args, rethrow=True, **kwargs) - return model + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(FastText, self)._load_specials(*args, **kwargs) + if hasattr(self, 'bucket'): + # should only exist in one place: the wv subcomponent + self.wv.bucket = self.bucket + del self.bucket class FastTextVocab(utils.SaveLoad): @@ -1197,12 +1187,47 @@ def __init__(self, vector_size, min_n, max_n, bucket): @classmethod def load(cls, fname_or_handle, **kwargs): - model = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) - if isinstance(model, FastTextKeyedVectors): - if not hasattr(model, 'compatible_hash') or model.compatible_hash is False: - raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible." - "Loading into gensim-3.8.3 & re-saving may create a compatible model.") - return model + """Load a previously saved `FastTextKeyedVectors` model. + + Parameters + ---------- + fname : str + Path to the saved file. + + Returns + ------- + :class:`~gensim.models.fasttext.FastTextKeyedVectors` + Loaded model. + + See Also + -------- + :meth:`~gensim.models.fasttext.FastTextKeyedVectors.save` + Save :class:`~gensim.models.fasttext.FastTextKeyedVectors` model. 
+ + """ + return super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs) + + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(FastTextKeyedVectors, self)._load_specials(*args, **kwargs) + if not isinstance(self, FastTextKeyedVectors): + raise TypeError("Loaded object of type %s, not expected FastTextKeyedVectors" % type(self)) + if not hasattr(self, 'compatible_hash') or self.compatible_hash is False: + raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible." + "Loading into gensim-3.8.3 & re-saving may create a compatible model.") + if not hasattr(self, 'vectors_vocab_lockf') and hasattr(self, 'vectors_vocab'): + self.vectors_vocab_lockf = ones(1, dtype=REAL) + if not hasattr(self, 'vectors_ngrams_lockf') and hasattr(self, 'vectors_ngrams'): + self.vectors_ngrams_lockf = ones(1, dtype=REAL) + # fixup mistakenly overdimensioned gensim-3.x lockf arrays + if len(self.vectors_vocab_lockf.shape) > 1: + self.vectors_vocab_lockf = ones(1, dtype=REAL) + if len(self.vectors_ngrams_lockf.shape) > 1: + self.vectors_ngrams_lockf = ones(1, dtype=REAL) + if not hasattr(self, 'buckets_word') or not self.buckets_word: + self.recalc_char_ngram_buckets() + if not hasattr(self, 'vectors') or self.vectors is None: + self.adjust_vectors() # recompose full-word vectors def __contains__(self, word): """Check if `word` or any character ngrams in `word` are present in the vocabulary. @@ -1250,14 +1275,15 @@ def save(self, *args, **kwargs): Load object. """ - # don't bother storing the cached normalized vectors - ignore_attrs = [ - 'buckets_word', - 'hash2index', - ] - kwargs['ignore'] = kwargs.get('ignore', ignore_attrs) super(FastTextKeyedVectors, self).save(*args, **kwargs) + def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): + """Arrange any special handling for the gensim.utils.SaveLoad protocol""" + # don't save properties that are merely calculated from others + ignore = set(it.chain(ignore, ('buckets_word', 'vectors'))) + return super(FastTextKeyedVectors, self)._save_specials( + fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) + def get_vector(self, word, use_norm=False): """Get `word` representations in vector space, as a 1D numpy array. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a6523babdf..307a97537f 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -128,7 +128,7 @@ from collections import defaultdict, namedtuple from types import GeneratorType import threading -import itertools +import itertools as it import copy from gensim.utils import keep_vocab_item, call_on_class_only, deprecated @@ -1788,20 +1788,14 @@ def save(self, *args, **kwargs): Path to the file. """ - # don't bother storing recalculable table - kwargs['ignore'] = kwargs.get('ignore', []) + ['cum_table', ] super(Word2Vec, self).save(*args, **kwargs) - def get_latest_training_loss(self): - """Get current value of the training loss. - - Returns - ------- - float - Current training loss. 
- - """ - return self.running_training_loss + def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): + """Arrange any special handling for the gensim.utils.SaveLoad protocol""" + # don't save properties that are merely calculated from others + ignore = set(it.chain(ignore, ('cum_table',))) + return super(Word2Vec, self)._save_specials( + fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) @classmethod def load(cls, *args, rethrow=False, **kwargs): @@ -1828,49 +1822,65 @@ def load(cls, *args, rethrow=False, **kwargs): if not isinstance(model, Word2Vec): rethrow = True raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) - # for backward compatibility - if not hasattr(model, 'ns_exponent'): - model.ns_exponent = 0.75 - if model.negative and hasattr(model.wv, 'index2word'): - model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ??? - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'): - model.wv.vectors_lockf = getattr(model, 'vectors_lockf', np.ones(1, dtype=REAL)) - if not hasattr(model, 'random'): - model.random = np.random.RandomState(model.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - if not hasattr(model, 'epochs'): - model.epochs = model.iter - del model.iter - if not hasattr(model, 'max_final_vocab'): - model.max_final_vocab = None - if hasattr(model, 'vocabulary'): # re-integrate state that had been moved - for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): - setattr(model, a, getattr(model.vocabulary, a)) - del model.vocabulary - if hasattr(model, 'trainables'): # re-integrate state that had been moved - for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): - if hasattr(model.trainables, a): - setattr(model, a, getattr(model.trainables, a)) - if hasattr(model, 'syn1'): - model.syn1 = model.syn1 - del model.syn1 - del model.trainables return model except AttributeError as ae: if rethrow: raise ae logger.error( "Model load error. Was model saved using code from an older Gensim Version? 
" - "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "Try loading older model using gensim-3.8.3, then re-saving, to restore " "compatibility with current code.") raise ae + def _load_specials(self, *args, **kwargs): + """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" + super(Word2Vec, self)._load_specials(*args, **kwargs) + # for backward compatibility, add/rearrange properties from prior versions + if not hasattr(self, 'ns_exponent'): + self.ns_exponent = 0.75 + if self.negative and hasattr(self.wv, 'index_to_key'): + self.make_cum_table() # rebuild cum_table from vocabulary + if not hasattr(self, 'corpus_count'): + self.corpus_count = None + if not hasattr(self, 'corpus_total_words'): + self.corpus_total_words = None + if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'): + self.wv.vectors_lockf = getattr(self, 'vectors_lockf', np.ones(1, dtype=REAL)) + if not hasattr(self, 'random'): + # use new instance of numpy's recommended generator/algorithm + self.random = np.random.default_rng(seed=self.seed) + if not hasattr(self, 'train_count'): + self.train_count = 0 + self.total_train_time = 0 + if not hasattr(self, 'epochs'): + self.epochs = self.iter + del self.iter + if not hasattr(self, 'max_final_vocab'): + self.max_final_vocab = None + if hasattr(self, 'vocabulary'): # re-integrate state that had been moved + for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): + setattr(self, a, getattr(self.vocabulary, a)) + del self.vocabulary + if hasattr(self, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(self.trainables, a): + setattr(self, a, getattr(self.trainables, a)) + if hasattr(self, 'syn1'): + self.syn1 = self.syn1 + del self.syn1 + del self.trainables + + def get_latest_training_loss(self): + """Get current value of the training loss. + + Returns + ------- + float + Current training loss. 
+ + """ + return self.running_training_loss + class BrownCorpus(object): def __init__(self, dirname): @@ -1958,7 +1968,7 @@ def __iter__(self): # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) - for line in itertools.islice(self.source, self.limit): + for line in it.islice(self.source, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -1967,7 +1977,7 @@ def __iter__(self): except AttributeError: # If it didn't work like a file, use it as a string filename with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2021,7 +2031,7 @@ def __iter__(self): for file_name in self.input_files: logger.info('reading file %s', file_name) with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): + for line in it.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): From 49b35b718bff53d45ffab6884a38c7103eff6a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 7 Sep 2020 14:10:07 +0200 Subject: [PATCH 05/19] docstirng fixes --- gensim/models/fasttext.py | 6 ++++-- gensim/models/word2vec.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index b2f5bdaad0..fa221a011d 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1218,8 +1218,10 @@ def _load_specials(self, *args, **kwargs): if not isinstance(self, FastTextKeyedVectors): raise TypeError("Loaded object of type %s, not expected FastTextKeyedVectors" % type(self)) if not hasattr(self, 'compatible_hash') or self.compatible_hash is False: - raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible." - "Loading into gensim-3.8.3 & re-saving may create a compatible model.") + raise TypeError( + "Pre-gensim-3.8.x fastText models with nonstandard hashing are no longer compatible. " + "Loading your old model into gensim-3.8.3 & re-saving may create a model compatible with gensim 4.x." 
+ ) if not hasattr(self, 'vectors_vocab_lockf') and hasattr(self, 'vectors_vocab'): self.vectors_vocab_lockf = ones(1, dtype=REAL) if not hasattr(self, 'vectors_ngrams_lockf') and hasattr(self, 'vectors_ngrams'): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ca957e14b7..7db3d1fcc0 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1819,7 +1819,7 @@ def save(self, *args, **kwargs): super(Word2Vec, self).save(*args, **kwargs) def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): - """Arrange any special handling for the gensim.utils.SaveLoad protocol""" + """Arrange any special handling for the `gensim.utils.SaveLoad` protocol.""" # don't save properties that are merely calculated from others ignore = set(it.chain(ignore, ('cum_table',))) return super(Word2Vec, self)._save_specials( From 3f972a6d4c51af725f71a95e3eea861a3fa7a343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Tue, 8 Sep 2020 13:19:58 +0200 Subject: [PATCH 06/19] get rid of python2 constructs --- gensim/models/word2vec.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 7db3d1fcc0..77fc56b032 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -133,21 +133,15 @@ import threading import itertools as it import copy - -from gensim.utils import keep_vocab_item, call_on_class_only, deprecated -from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector - -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty +from queue import Queue, Empty from numpy import float32 as REAL import numpy as np -from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from six import iteritems, itervalues, string_types -from six.moves import range +from gensim.utils import keep_vocab_item, call_on_class_only, deprecated +from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector +from gensim import utils, matutils + logger = logging.getLogger(__name__) @@ -371,7 +365,7 @@ def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.02 def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rule=None, callbacks=None): if not (corpus_iterable is None) ^ (corpus_file is None): raise ValueError("You must provide only one of corpus_iterable or corpus_file arguments.") - if corpus_file is not None and not isinstance(corpus_file, string_types): + if corpus_file is not None and not isinstance(corpus_file, str): raise TypeError("You must pass string as the corpus_file argument.") elif isinstance(corpus_iterable, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.") @@ -464,7 +458,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No raw_vocab = word_freq logger.info( "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) + len(raw_vocab), sum(raw_vocab.values()), ) # Since no sentences are provided, this is to control the corpus_count. 
@@ -484,11 +478,11 @@ def _scan_vocab(self, sentences, progress_per, trim_rule): checked_string_types = 0 for sentence_no, sentence in enumerate(sentences): if not checked_string_types: - if isinstance(sentence, string_types): + if isinstance(sentence, str): logger.warning( "Each 'sentences' item should be a list of words (usually unicode strings). " "First item here is instead plain %s.", - type(sentence) + type(sentence), ) checked_string_types += 1 if sentence_no % progress_per == 0: @@ -570,7 +564,7 @@ def prepare_vocab( self.sample = sample self.wv.key_to_index = {} - for word, v in iteritems(self.raw_vocab): + for word, v in self.raw_vocab.items(): if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): retain_words.append(word) retain_total += v @@ -600,7 +594,7 @@ def prepare_vocab( logger.info("Updating model with new vocabulary") new_total = pre_exist_total = 0 new_words = pre_exist_words = [] - for word, v in iteritems(self.raw_vocab): + for word, v in self.raw_vocab.items(): if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): if self.wv.has_index_for(word): pre_exist_words.append(word) From 4331ccf0f0c8288003763806142d2f3711da7ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 13 Sep 2020 15:55:45 +0200 Subject: [PATCH 07/19] code style fixes while debugging pickle model sizes --- gensim/downloader.py | 1 + gensim/models/coherencemodel.py | 9 ++-- gensim/models/keyedvectors.py | 3 +- gensim/utils.py | 81 ++++++++++++--------------------- 4 files changed, 38 insertions(+), 56 deletions(-) diff --git a/gensim/downloader.py b/gensim/downloader.py index 6fb362ccad..8a4395440e 100644 --- a/gensim/downloader.py +++ b/gensim/downloader.py @@ -50,6 +50,7 @@ By default, this subdirectory is ~/gensim-data. """ + from __future__ import absolute_import import argparse import os diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 9633a2e62f..70fea79804 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -25,6 +25,7 @@ Internal functions for pipelines. """ + import logging import multiprocessing as mp from collections import namedtuple @@ -33,9 +34,11 @@ from gensim import interfaces, matutils from gensim import utils -from gensim.topic_coherence import (segmentation, probability_estimation, - direct_confirmation_measure, indirect_confirmation_measure, - aggregation) +from gensim.topic_coherence import ( + segmentation, probability_estimation, + direct_confirmation_measure, indirect_confirmation_measure, + aggregation, +) from gensim.topic_coherence.probability_estimation import unique_ids_from_segments logger = logging.getLogger(__name__) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index e42c46cc7c..5d5433c61b 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -395,9 +395,8 @@ def add_one(self, key, vector): Warning: using this repeatedly is inefficient, requiring a full reallocation & copy, if this instance hasn't been preallocated to be ready fro such incremental additions. 
- returns: actual index used TODO: other param docs + returns: actual index used FIXME: other param docs """ - target_index = self.next_index if target_index >= len(self) or self.index_to_key[target_index] is not None: # must append at end by expanding existing structures diff --git a/gensim/utils.py b/gensim/utils.py index bb9ee2fa02..49cab6c595 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -11,16 +11,9 @@ import collections import logging import warnings - -try: - from html.entities import name2codepoint as n2cp -except ImportError: - from htmlentitydefs import name2codepoint as n2cp -try: - import cPickle as _pickle -except ImportError: - import pickle as _pickle - +import numbers +from html.entities import name2codepoint as n2cp +import pickle as _pickle import re import unicodedata import os @@ -36,18 +29,9 @@ import heapq import numpy as np -import numbers import scipy.sparse - -from six import iterkeys, iteritems, itervalues, u, string_types, unichr -from six.moves import range - from smart_open import open -from multiprocessing import cpu_count - -if sys.version_info[0] >= 3: - unicode = str logger = logging.getLogger(__name__) @@ -138,7 +122,7 @@ def file_or_filename(input): An open file, positioned at the beginning. """ - if isinstance(input, string_types): + if isinstance(input, str): # input was a filename: open as file return open(input, 'rb') else: @@ -169,11 +153,11 @@ def open_file(input): except Exception: # Handling any unhandled exceptions from the code nested in 'with' statement. exc = True - if not isinstance(input, string_types) or not mgr.__exit__(*sys.exc_info()): + if not isinstance(input, str) or not mgr.__exit__(*sys.exc_info()): raise # Try to introspect and silence errors. finally: - if not exc and isinstance(input, string_types): + if not exc and isinstance(input, str): mgr.__exit__(None, None, None) @@ -199,11 +183,11 @@ def deaccent(text): u'Sef chomutovskych komunistu dostal postou bily prasek' """ - if not isinstance(text, unicode): + if not isinstance(text, str): # assume utf8 for byte strings, use default (strict) error handling text = text.decode('utf8') norm = unicodedata.normalize("NFD", text) - result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') + result = ''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn') return unicodedata.normalize("NFC", result) @@ -339,10 +323,10 @@ def any2utf8(text, errors='strict', encoding='utf8'): """ - if isinstance(text, unicode): + if isinstance(text, str): return text.encode('utf8') # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 - return unicode(text, encoding, errors=errors).encode('utf8') + return str(text, encoding, errors=errors).encode('utf8') to_utf8 = any2utf8 @@ -366,9 +350,9 @@ def any2unicode(text, encoding='utf8', errors='strict'): Unicode version of `text`. """ - if isinstance(text, unicode): + if isinstance(text, str): return text - return unicode(text, encoding, errors=errors) + return str(text, encoding, errors=errors) to_unicode = any2unicode @@ -393,7 +377,7 @@ def call_on_class_only(*args, **kwargs): raise AttributeError('This method should be called on a class object.') -class SaveLoad(object): +class SaveLoad: """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods. 
Warnings @@ -562,7 +546,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro finally: # restore attribs handled specially for obj, asides in restores: - for attrib, val in iteritems(asides): + for attrib, val in asides.items(): with ignore_deprecation_warning(): setattr(obj, attrib, val) logger.info("saved %s", fname) @@ -599,7 +583,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix) if separately is None: separately = [] - for attrib, val in iteritems(self.__dict__): + for attrib, val in self.__dict__.items(): if isinstance(val, np.ndarray) and val.size >= sep_limit: separately.append(attrib) elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: @@ -614,7 +598,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, recursive_saveloads = [] restores = [] - for attrib, val in iteritems(self.__dict__): + for attrib, val in self.__dict__.items(): if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading recursive_saveloads.append(attrib) cfname = '.'.join((fname, attrib)) @@ -622,7 +606,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, try: numpys, scipys, ignoreds = [], [], [] - for attrib, val in iteritems(asides): + for attrib, val in asides.items(): if isinstance(val, np.ndarray) and attrib not in ignore: numpys.append(attrib) logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib)) @@ -666,7 +650,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, self.__dict__['__recursive_saveloads'] = recursive_saveloads except Exception: # restore the attributes if exception-interrupted - for attrib, val in iteritems(asides): + for attrib, val in asides.items(): setattr(self, attrib, val) raise return restores + [(self, asides)] @@ -749,7 +733,7 @@ def get_max_id(corpus): return maxid -class FakeDict(object): +class FakeDict: """Objects of this class act as dictionaries that map integer->str(integer), for a specified range of integers <0, num_terms). @@ -778,7 +762,6 @@ def __getitem__(self, val): def iteritems(self): """Iterate over all keys and values. - Yields ------ (int, str) @@ -1087,9 +1070,9 @@ def safe_unichr(intval): """ try: - return unichr(intval) + return chr(intval) except ValueError: - # ValueError: unichr() arg not in range(0x10000) (narrow Python build) + # ValueError: chr() arg not in range(0x10000) (narrow Python build) s = "\\U%08x" % intval # return UTF16 surrogate pair return s.decode('unicode-escape') @@ -1396,11 +1379,7 @@ def unpickle(fname): """ with open(fname, 'rb') as f: - # Because of loading from S3 load can't be used (missing readline in smart_open) - if sys.version_info > (3, 0): - return _pickle.load(f, encoding='latin1') - else: - return _pickle.loads(f.read()) + return _pickle.load(f, encoding='latin1') # needed because loading from S3 doesn't support readline() def revdict(d): @@ -1430,7 +1409,7 @@ def revdict(d): {2: 1, 4: 3} """ - return {v: k for (k, v) in iteritems(dict(d))} + return {v: k for (k, v) in dict(d).items()} def deprecated(reason): @@ -1450,7 +1429,7 @@ def deprecated(reason): Decorated function """ - if isinstance(reason, string_types): + if isinstance(reason, str): def decorator(func): fmt = "Call to deprecated `{name}` ({reason})." 
@@ -1704,7 +1683,7 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False, # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little # FIXME this throws away all fancy parsing cues, including sentence structure, # abbreviations etc. - content = u(' ').join(tokenize(content, lower=True, errors='ignore')) + content = ' '.join(tokenize(content, lower=True, errors='ignore')) parsed = parse(content, lemmata=True, collapse=False) result = [] @@ -1814,7 +1793,7 @@ def trim_vocab_by_freq(vocab, topk, trim_rule=None): if topk >= len(vocab): return - min_count = heapq.nlargest(topk, itervalues(vocab))[-1] + min_count = heapq.nlargest(topk, vocab.values())[-1] prune_vocab(vocab, min_count, trim_rule=trim_rule) @@ -1831,7 +1810,7 @@ def merge_counts(dict1, dict2): result : dict Merged dictionary with sum of frequencies as values. """ - for word, freq in iteritems(dict2): + for word, freq in dict2.items(): if word in dict1: dict1[word] += freq else: @@ -1957,7 +1936,7 @@ def sample_dict(d, n=10, use_random=True): Selected items from dictionary, as a list. """ - selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n) + selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(d.keys(), n) return [(key, d[key]) for key in selected_keys] @@ -2080,7 +2059,7 @@ def lazy_flatten(nested_list): """ for el in nested_list: - if isinstance(el, collections.Iterable) and not isinstance(el, string_types): + if isinstance(el, collections.Iterable) and not isinstance(el, str): for sub in flatten(el): yield sub else: @@ -2124,5 +2103,5 @@ def effective_n_jobs(n_jobs): elif n_jobs is None: return 1 elif n_jobs < 0: - n_jobs = max(cpu_count() + 1 + n_jobs, 1) + n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1) return n_jobs From 012d59813341997e71930dfd1a2e314a02a45775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 13 Sep 2020 16:18:20 +0200 Subject: [PATCH 08/19] py2 to 3: get rid of forgotten range --- gensim/test/test_sharded_corpus.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_sharded_corpus.py b/gensim/test/test_sharded_corpus.py index 3a56f240e2..14eea34f88 100644 --- a/gensim/test/test_sharded_corpus.py +++ b/gensim/test/test_sharded_corpus.py @@ -1,19 +1,17 @@ """ -Testing the test sharded corpus. +Tests for ShardedCorpus. 
""" -import os +import os import unittest - import random -import numpy as np import shutil +import numpy as np from scipy import sparse -from gensim.utils import is_corpus +from gensim.utils import is_corpus, mock_data from gensim.corpora.sharded_corpus import ShardedCorpus -from gensim.utils import mock_data, range ############################################################################# From eefe9ab87ae1b0206f0c7ff7adc296961dada6bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 13 Sep 2020 23:36:05 +0200 Subject: [PATCH 09/19] fix docs --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 4384b7604a..ecaeeb8c81 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -44,7 +44,7 @@ >>> >>> path = get_tmpfile("word2vec.model") >>> - >>> model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4) + >>> model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1, workers=4) >>> model.save("word2vec.model") The training is streamed, meaning `sentences` can be a generator, reading input data From 1a9b6466d69d8cd532209e3be3af0fad3f9ec172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Mon, 14 Sep 2020 10:37:17 +0200 Subject: [PATCH 10/19] get rid of numpy.str_ --- gensim/models/keyedvectors.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 5d5433c61b..05ec846386 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -423,7 +423,7 @@ def add(self, keys, weights, extras=None, replace=False): Parameters ---------- keys : list of (str or int) - keys specified by string or int ids. + Keys specified by string or int ids. weights: list of numpy.ndarray or numpy.ndarray List of 1D np.array vectors or a 2D np.array of vectors. replace: bool, optional @@ -582,7 +582,7 @@ def sort_by_descending_frequency(self): if not len(self): return # noop if empty count_sorted_indexes = np.argsort(self.expandos['count'])[::-1] - self.index_to_key = list(np.array(self.index_to_key)[count_sorted_indexes]) + self.index_to_key = [self.index_to_key[idx] for idx in count_sorted_indexes] self.allocate_vecattrs() for k in self.expandos: # Use numpy's "fancy indexing" to permutate the entire array in one step. 
@@ -697,8 +697,10 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip return dists best = matutils.argsort(dists, topn=topn + len(all_keys), reverse=True) # ignore (don't return) keys from the input - result = [(self.index_to_key[sim + clip_start], float(dists[sim])) - for sim in best if (sim + clip_start) not in all_keys] + result = [ + (self.index_to_key[sim + clip_start], float(dists[sim])) + for sim in best if (sim + clip_start) not in all_keys + ] return result[:topn] def similar_by_word(self, word, topn=10, restrict_vocab=None): From 06aef7510f8208b32f183c1c95597ef55865826d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 12:42:41 +0200 Subject: [PATCH 11/19] fix index2entity, fix docs, hard-fail deprecated properties --- gensim/models/keyedvectors.py | 175 +++++++++++++++++++++++++--------- 1 file changed, 128 insertions(+), 47 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 34929dfce3..bbc9079d40 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -101,12 +101,16 @@ >>> >>> word_vectors = api.load("glove-wiki-gigaword-100") # load pre-trained word-vectors from gensim-data >>> + >>> # Check the "most similar words", using the default "cosine similarity" measure. >>> result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man']) - >>> print("{}: {:.4f}".format(*result[0])) + >>> most_similar_key, similarity = result[0] # look at the first value + >>> print(f"{most_similar_key}: {similarity:.4f}" queen: 0.7699 >>> + >>> # Use a different similarity measure: "cosmul". >>> result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - >>> print("{}: {:.4f}".format(*result[0])) + >>> most_similar_key, similarity = result[0] # look at the first value + >>> print(f"{most_similar_key}: {similarity:.4f}" queen: 0.8965 >>> >>> print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split())) @@ -117,22 +121,23 @@ True >>> >>> result = word_vectors.similar_by_word("cat") - >>> print("{}: {:.4f}".format(*result[0])) + >>> most_similar_key, similarity = result[0] # look at the first value + >>> print(f"{most_similar_key}: {similarity:.4f}" dog: 0.8798 >>> >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() >>> sentence_president = 'The president greets the press in Chicago'.lower().split() >>> >>> similarity = word_vectors.wmdistance(sentence_obama, sentence_president) - >>> print("{:.4f}".format(similarity)) + >>> print(f"{similarity:.4f}") 3.4893 >>> >>> distance = word_vectors.distance("media", "media") - >>> print("{:.1f}".format(distance)) + >>> print(f"{distance:.1f}") 0.0 >>> - >>> sim = word_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) - >>> print("{:.4f}".format(sim)) + >>> similarity = word_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) + >>> print(f"{similarity:.4f}") 0.7067 >>> >>> vector = word_vectors['computer'] # numpy vector of a word @@ -219,7 +224,7 @@ def _load_specials(self, *args, **kwargs): self._upconvert_old_d2vkv() # fixup rename/consolidation into index_to_key of older index2word, index2entity if not hasattr(self, 'index_to_key'): - self.index_to_key = self.__dict__.pop('index2word', self.__dict__.pop('index2word', None)) + self.index_to_key = self.__dict__.pop('index2word', self.__dict__.pop('index2entity', None)) # fixup rename into vectors of older syn0 if not hasattr(self, 'vectors'): 
self.vectors = self.__dict__.pop('syn0', None) @@ -267,22 +272,54 @@ def allocate_vecattrs(self, attrs=None, types=None): continue prev_expando = self.expandos[attr] if not np.issubdtype(t, prev_expando.dtype): - raise TypeError("can't allocate {0} for existing {1}".format(t, prev_expando.dtype)) + raise TypeError(f"can't allocate {t} for existing {prev_expando.dtype}") if len(prev_expando) == target_size: continue # no resizing necessary prev_count = len(prev_expando) self.expandos[attr] = np.zeros(target_size, dtype=prev_expando.dtype) - self.expandos[attr][0:min(prev_count, target_size), ] = \ - prev_expando[0:min(prev_count, target_size), ] + self.expandos[attr][: min(prev_count, target_size), ] = prev_expando[: min(prev_count, target_size), ] def set_vecattr(self, key, attr, val): - """Set attribute associated with given key to value. TODO: param docs""" + """Set attribute associated with the given key to value. + + Parameters + ---------- + + key : str + Store the attribute for this vector key. + attr : str + Name of the additional attribute to store for the given key. + val : object + Value of the additional attribute to store for the given key. + + Returns + ------- + + None + + """ self.allocate_vecattrs(attrs=[attr], types=[type(val)]) index = self.get_index(key) self.expandos[attr][index] = val def get_vecattr(self, key, attr): - """Get attribute value associate with given key. TODO: param docs""" + """Get attribute value associated with given key. + + Parameters + ---------- + + key : str + Vector key for which to fetch the attribute value. + attr : str + Name of the additional attribute to fetch for the given key. + + Returns + ------- + + object + Value of the additional attribute fetched for the given key. + + """ index = self.get_index(key) return self.expandos[attr][index] @@ -351,13 +388,14 @@ def get_index(self, key, default=None): elif default is not None: return default else: - raise KeyError("Key '%s' not present" % key) + raise KeyError(f"Key '{key}' not present") def get_vector(self, key, norm=False): """Get the key's vector, as a 1D numpy array. Parameters ---------- + key : str Key for vector to return. norm : bool, optional @@ -365,11 +403,13 @@ def get_vector(self, key, norm=False): Returns ------- + numpy.ndarray Vector for the specified key. Raises ------ + KeyError If the given key doesn't exist. @@ -386,16 +426,29 @@ def get_vector(self, key, norm=False): @deprecated("Use get_vector instead") def word_vec(self, *args, **kwargs): - """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()""" + """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector().""" return self.get_vector(*args, **kwargs) - def add_one(self, key, vector): + def add_new_vector(self, key, vector): """Add one new vector at the given key, into existing slot if available. Warning: using this repeatedly is inefficient, requiring a full reallocation & copy, - if this instance hasn't been preallocated to be ready fro such incremental additions. + if this instance hasn't been preallocated to be ready for such incremental additions. + + Parameters + ---------- + + key: str + Key identifier of the added vector. + vector: numpy.ndarray + 1D numpy array with the vector values. + + Returns + ------- + int + Index of the newly added vector, so that ``self.vectors[result] == vector`` and + ``self.index_to_key[result] == key``. 
- returns: actual index used FIXME: other param docs """ target_index = self.next_index if target_index >= len(self) or self.index_to_key[target_index] is not None: @@ -521,19 +574,32 @@ def rank(self, key1, key2): @property def vectors_norm(self): - raise ValueError( - "The vectors_norm attribute became a get_normed_vectors() method in Gensim 4.0.0. " + raise AttributeError( + "The `.vectors_norm` attribute is computed dynamically since Gensim 4.0.0. " + "Use `.get_normed_vectors()` instead.\n" "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" ) @vectors_norm.setter def vectors_norm(self, _): - pass # no-op; shouldn't be set + raise AttributeError( + "Vector norms are computed dynamically since Gensim 4.0.0 to save memory, you cannot set them. " + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" + ) def get_normed_vectors(self): - # TODO: what's the way for users to get from a matrix index (integer) to the - # corresponding key (string)? - # Shouldn't we return this as a mapping (dict), or even a new KeyedVectors instance? + """Get all embedding vectors normalized to unit L2 length (euclidean), as a 2D numpy array. + + To see which key corresponds to which vector = which array row, refer + to the :attr:`~gensim.models.keyedvectors.KeyedVectors.index_to_key` attribute. + + Returns + ------- + numpy.ndarray: + 2D numpy array of shape ``(number_of_keys, embedding dimensionality)``, L2-normalized + along the rows (key vectors). + + """ self.fill_norms() return self.vectors / self.norms[..., np.newaxis] @@ -542,7 +608,7 @@ def fill_norms(self, force=False): Ensure per-vector norms are available. Any code which modifies vectors should ensure the accompanying norms are - either recalculated or 'None', to trigger a full recalculation later. + either recalculated or 'None', to trigger a full recalculation later on-request. """ if self.norms is None or force: @@ -550,27 +616,39 @@ def fill_norms(self, force=False): @property def index2entity(self): - return self.index_to_key + raise AttributeError( + "The index2entity attribute has been replaced by index_to_key since Gensim 4.0.0.\n" + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" + ) @index2entity.setter def index2entity(self, value): - self.index_to_key = value + raise AttributeError( + "The index2entity attribute has been replaced by index_to_key since Gensim 4.0.0.\n" + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" + ) @property def index2word(self): - return self.index_to_key + raise AttributeError( + "The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.\n" + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" + ) @index2word.setter def index2word(self, value): - self.index_to_key = value + raise AttributeError( + "The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.\n" + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" + ) @property def vocab(self): - raise NotImplementedError( - "The .vocab dict of 'Vocab' propery objects, one per key, has been removed.\n" - "See the KeyedVectors .key_to_index dict, .index_to_key list, and methods\n" - ".get_vecattr(key, attr)/.set_vecattr(key, attr, new_val) for replacement\n" - "functionality." 
+ raise AttributeError( + "The vocab attribute was removed from KeyedVector in Gensim 4.0.0.\n" + "Use KeyedVector's .key_to_index dict, .index_to_key list, and methods " + ".get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.\n" + "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" ) @vocab.setter @@ -590,11 +668,10 @@ def sort_by_descending_frequency(self): if len(self.vectors): logger.warning("sorting after vectors have been allocated is expensive & error-prone") self.vectors = self.vectors[count_sorted_indexes] - for i, word in enumerate(self.index_to_key): - self.key_to_index[word] = i + self.key_to_index = {word : i for i, word in enumerate(self.index_to_key)} def save(self, *args, **kwargs): - """Save KeyedVectors. + """Save KeyedVectors to a file. Parameters ---------- @@ -604,13 +681,15 @@ def save(self, *args, **kwargs): See Also -------- :meth:`~gensim.models.keyedvectors.KeyedVectors.load` - Load saved model. + Load a previously saved model. """ super(KeyedVectors, self).save(*args, **kwargs) - def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, - restrict_vocab=None, indexer=None): + def most_similar( + self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, + restrict_vocab=None, indexer=None, + ): """Find the top-N most similar keys. Positive keys contribute positively towards the similarity, negative keys negatively. @@ -1648,11 +1727,11 @@ def _add_word_to_kv(kv, counts, word, weights, vocab_size): if kv.has_index_for(word): logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word) return - word_id = kv.add_one(word, weights) + word_id = kv.add_new_vector(word, weights) if counts is None: - # most common scenario: no vocab file given. just make up some bogus counts, in descending order - # FIXME(someday): make this faking optional, include more realistic (Zipf-based) fake numbers + # Most common scenario: no vocab file given. Just make up some bogus counts, in descending order. + # TODO (someday): make this faking optional, include more realistic (Zipf-based) fake numbers. 
word_count = vocab_size - word_id elif word in counts: # use count from the vocab file @@ -1798,14 +1877,16 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' kv = cls(vector_size, vocab_size, dtype=datatype) if binary: - _word2vec_read_binary(fin, kv, counts, - vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size) + _word2vec_read_binary( + fin, kv, counts, + vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, + ) else: _word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding) if kv.vectors.shape[0] != len(kv): logger.info( "duplicate words detected, shrinking matrix size from %i to %i", - kv.vectors.shape[0], len(kv) + kv.vectors.shape[0], len(kv), ) kv.vectors = ascontiguousarray(kv.vectors[: len(kv)]) assert (len(kv), vector_size) == kv.vectors.shape @@ -1815,7 +1896,7 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' def load_word2vec_format(*args, **kwargs): - """Alias for `KeyedVectors.load_word2vec_format(...)`""" + """Alias for `KeyedVectors.load_word2vec_format(...)`.""" return KeyedVectors.load_word2vec_format(*args, **kwargs) From 5e215607636e868e27c867a140be3ba4f71a3890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 13:17:55 +0200 Subject: [PATCH 12/19] fix typos + more doc fixes + fix failing tests --- gensim/models/keyedvectors.py | 43 +++++++++++++------------------- gensim/test/test_keyedvectors.py | 26 ++++++++----------- 2 files changed, 28 insertions(+), 41 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index bbc9079d40..5b8e010c26 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -103,14 +103,14 @@ >>> >>> # Check the "most similar words", using the default "cosine similarity" measure. >>> result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man']) - >>> most_similar_key, similarity = result[0] # look at the first value - >>> print(f"{most_similar_key}: {similarity:.4f}" + >>> most_similar_key, similarity = result[0] # look at the first match + >>> print(f"{most_similar_key}: {similarity:.4f}") queen: 0.7699 >>> >>> # Use a different similarity measure: "cosmul". >>> result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - >>> most_similar_key, similarity = result[0] # look at the first value - >>> print(f"{most_similar_key}: {similarity:.4f}" + >>> most_similar_key, similarity = result[0] # look at the first match + >>> print(f"{most_similar_key}: {similarity:.4f}") queen: 0.8965 >>> >>> print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split())) @@ -121,8 +121,8 @@ True >>> >>> result = word_vectors.similar_by_word("cat") - >>> most_similar_key, similarity = result[0] # look at the first value - >>> print(f"{most_similar_key}: {similarity:.4f}" + >>> most_similar_key, similarity = result[0] # look at the first match + >>> print(f"{most_similar_key}: {similarity:.4f}") dog: 0.8798 >>> >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() @@ -458,7 +458,7 @@ def add_new_vector(self, key, vector): "Adding single vectors to a KeyedVectors which grows by one each time can be costly. 
" "Consider adding in batches or preallocating to the required size.", UserWarning) - self.add([key], [vector]) + self.add_vectors([key], [vector]) self.allocate_vecattrs() # grow any adjunct arrays self.next_index = target_index + 1 else: @@ -469,7 +469,7 @@ def add_new_vector(self, key, vector): self.next_index += 1 return target_index - def add(self, keys, weights, extras=None, replace=False): + def add_vectors(self, keys, weights, extras=None, replace=False): """Append keys and their vectors in a manual way. If some key is already in the vocabulary, the old vector is kept unless `replace` flag is True. @@ -536,7 +536,7 @@ def __setitem__(self, keys, weights): keys = [keys] weights = weights.reshape(1, -1) - self.add(keys, weights, replace=True) + self.add_vectors(keys, weights, replace=True) def has_index_for(self, key): """Can this model return a single index for this key? @@ -582,10 +582,7 @@ def vectors_norm(self): @vectors_norm.setter def vectors_norm(self, _): - raise AttributeError( - "Vector norms are computed dynamically since Gensim 4.0.0 to save memory, you cannot set them. " - "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" - ) + pass # ignored but must remain for backward serialization compatibility def get_normed_vectors(self): """Get all embedding vectors normalized to unit L2 length (euclidean), as a 2D numpy array. @@ -623,10 +620,7 @@ def index2entity(self): @index2entity.setter def index2entity(self, value): - raise AttributeError( - "The index2entity attribute has been replaced by index_to_key since Gensim 4.0.0.\n" - "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" - ) + self.index_to_key = value # must remain for backward serialization compatibility @property def index2word(self): @@ -637,10 +631,7 @@ def index2word(self): @index2word.setter def index2word(self, value): - raise AttributeError( - "The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.\n" - "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims" - ) + self.index_to_key = value # must remain for backward serialization compatibility @property def vocab(self): @@ -668,7 +659,7 @@ def sort_by_descending_frequency(self): if len(self.vectors): logger.warning("sorting after vectors have been allocated is expensive & error-prone") self.vectors = self.vectors[count_sorted_indexes] - self.key_to_index = {word : i for i, word in enumerate(self.index_to_key)} + self.key_to_index = {word: i for i, word in enumerate(self.index_to_key)} def save(self, *args, **kwargs): """Save KeyedVectors to a file. @@ -1896,15 +1887,15 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' def load_word2vec_format(*args, **kwargs): - """Alias for `KeyedVectors.load_word2vec_format(...)`.""" + """Alias for :meth:`~gensim.models.keyedvectors.KeyedVectors.load_word2vec_format`.""" return KeyedVectors.load_word2vec_format(*args, **kwargs) def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash): - """Get a 'random' vector (but deterministically derived from seed_string if supplied). + """Get a random vector, derived deterministically from `seed_string` if supplied. + + Useful for initializing KeyedVectors that will be the starting projection/input layers of _2Vec models. - Useful for initializing KeyedVectors that will be the starting - projection/input layers of _2Vec models. 
""" if seed_string: once = np.random.Generator(np.random.SFC64(hashfxn(seed_string) & 0xffffffff)) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index b998ffe308..3ab8f33afa 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -23,8 +23,7 @@ class TestKeyedVectors(unittest.TestCase): def setUp(self): - self.vectors = KeyedVectors.load_word2vec_format( - datapath('euclidean_vectors.bin'), binary=True) + self.vectors = KeyedVectors.load_word2vec_format(datapath('euclidean_vectors.bin'), binary=True) self.model_path = datapath("w2v_keyedvectors_load_test.modeldata") self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab") @@ -61,12 +60,9 @@ def test_relative_cosine_similarity(self): 'respectable', 'beneficial', 'just', 'upright', 'adept', 'expert', 'practiced', 'proficient', 'skillful', 'skilful', 'dear', 'near', 'dependable', 'safe', 'secure', 'right', 'ripe', 'well', 'effective', 'in_effect', 'in_force', 'serious', 'sound', 'salutary', 'honest', 'undecomposed', - 'unspoiled', 'unspoilt', 'thoroughly', 'soundly' + 'unspoiled', 'unspoilt', 'thoroughly', 'soundly', ] # synonyms for "good" as per wordnet - cos_sim = [] - for i in range(len(wordnet_syn)): - if wordnet_syn[i] in self.vectors: - cos_sim.append(self.vectors.similarity("good", wordnet_syn[i])) + cos_sim = [self.vectors.similarity("good", syn) for syn in wordnet_syn if syn in self.vectors] cos_sim = sorted(cos_sim, reverse=True) # cosine_similarity of "good" with wordnet_syn in decreasing order # computing relative_cosine_similarity of two similar words rcs_wordnet = self.vectors.similarity("good", "nice") / sum(cos_sim[i] for i in range(10)) @@ -113,7 +109,7 @@ def test_similar_by_word(self): 'administration', 'terrorism', 'call', - 'israel' + 'israel', ] predicted = [result[0] for result in self.vectors.similar_by_word('war', topn=5)] self.assertEqual(expected, predicted) @@ -154,12 +150,12 @@ def test_rank(self): def test_add_single(self): """Test that adding entity in a manual way works correctly.""" - entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)] + entities = [f'___some_entity{i}_not_present_in_keyed_vectors___' for i in range(5)] vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)] # Test `add` on already filled kv. for ent, vector in zip(entities, vectors): - self.vectors.add(ent, vector) + self.vectors.add_vectors(ent, vector) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(self.vectors[ent], vector)) @@ -167,7 +163,7 @@ def test_add_single(self): # Test `add` on empty kv. kv = KeyedVectors(self.vectors.vector_size) for ent, vector in zip(entities, vectors): - kv.add(ent, vector) + kv.add_vectors(ent, vector) for ent, vector in zip(entities, vectors): self.assertTrue(np.allclose(kv[ent], vector)) @@ -179,7 +175,7 @@ def test_add_multiple(self): # Test `add` on already filled kv. 
vocab_size = len(self.vectors) - self.vectors.add(entities, vectors, replace=False) + self.vectors.add_vectors(entities, vectors, replace=False) self.assertEqual(vocab_size + len(entities), len(self.vectors)) for ent, vector in zip(entities, vectors): @@ -198,7 +194,7 @@ def test_add_type(self): assert kv.vectors.dtype == REAL words, vectors = ["a"], np.array([1., 1.], dtype=np.float64).reshape(1, -1) - kv.add(words, vectors) + kv.add_vectors(words, vectors) assert kv.vectors.dtype == REAL @@ -270,7 +266,7 @@ def test_save_reload(self): count = 20 keys = [str(i) for i in range(count)] weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)] - randkv.add(keys, weights) + randkv.add_vectors(keys, weights) tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt") randkv.save_word2vec_format(tmpfiletxt, binary=False) reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False) @@ -287,7 +283,7 @@ def test_no_header(self): count = 20 keys = [str(i) for i in range(count)] weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)] - randkv.add(keys, weights) + randkv.add_vectors(keys, weights) tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt") randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False) reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True) From 51cae68d188fabf3ba19d3dbcde32569104d8a96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 14:11:47 +0200 Subject: [PATCH 13/19] more index2word => index_to_key fixes --- gensim/models/_fasttext_bin.py | 2 +- gensim/models/translation_matrix.py | 23 +++++++++++------------ gensim/scripts/word2vec2tensor.py | 2 +- gensim/similarities/annoy.py | 4 ++-- gensim/similarities/nmslib.py | 4 ++-- gensim/test/test_doc2vec.py | 4 ++-- gensim/test/test_keyedvectors.py | 2 +- gensim/test/test_poincare.py | 2 +- gensim/test/test_similarities.py | 4 ++-- gensim/test/test_translation_matrix.py | 11 ++++++----- gensim/test/test_word2vec.py | 18 +++++++++--------- gensim/viz/poincare.py | 2 +- 12 files changed, 39 insertions(+), 39 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 5eeb4ca71a..77549b1351 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -549,7 +549,7 @@ def _dict_save(fout, model, encoding): # prunedidx_size_=-1, -1 value denotes no prunning index (prunning is only supported in supervised mode) fout.write(np.int64(-1)) - for word in model.wv.index2word: + for word in model.wv.index_to_key: word_count = model.wv.get_vecattr(word, 'count') fout.write(word.encode(encoding)) fout.write(_END_OF_WORD_MARKER) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 54b21416e3..528e3d6fa2 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -1,8 +1,8 @@ #!/usr/bin/env python # encoding: utf-8 -"""Produce translation matrix to translate the word from one language to another language, using either -standard nearest neighbour method or globally corrected neighbour retrieval method [1]_. +"""Produce a translation matrix to translate words from one language to another, using either +a standard nearest neighbour method or a globally corrected neighbour retrieval method [1]_. This method can be used to augment the existing phrase tables with more candidate translations, or filter out errors from the translation tables and known dictionaries [2]_. 
What's more, It also work @@ -10,6 +10,7 @@ Examples -------- + How to make translation between two set of word-vectors ======================================================= @@ -97,19 +98,18 @@ """ import warnings +from collections import OrderedDict + import numpy as np -from collections import OrderedDict from gensim import utils -from six import string_types -class Space(object): +class Space: """An auxiliary class for storing the the words space.""" def __init__(self, matrix, index2word): """ - Parameters ---------- matrix : iterable of numpy.ndarray @@ -256,7 +256,7 @@ def train(self, word_pairs): self.translation_matrix = np.linalg.lstsq(m1, m2, -1)[0] def save(self, *args, **kwargs): - """Save the model to file but ignoring the `source_space` and `target_space`""" + """Save the model to a file. Ignores (doesn't store) the `source_space` and `target_space` attributes.""" kwargs['ignore'] = kwargs.get('ignore', ['source_space', 'target_space']) super(TranslationMatrix, self).save(*args, **kwargs) @@ -266,12 +266,12 @@ def apply_transmat(self, words_space): Parameters ---------- words_space : :class:`~gensim.models.translation_matrix.Space` - Object that constructed for those words to be translate. + `Space` object constructed for the words to be translated. Returns ------- :class:`~gensim.models.translation_matrix.Space` - Object that constructed for those mapped words. + `Space` object constructed for the mapped words. """ return Space(np.dot(words_space.mat, self.translation_matrix), words_space.index2word) @@ -301,8 +301,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec Ordered dict where each item is `word`: [`translated_word_1`, `translated_word_2`, ...] """ - - if isinstance(source_words, string_types): + if isinstance(source_words, str): # pass only one word to translate source_words = [source_words] @@ -329,7 +328,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec "When using the globally corrected neighbour retrieval method, " "the `sample_num` parameter(i.e. the number of words sampled from source space) must be provided." 
) - lexicon = set(source_lang_vec.index2word) + lexicon = set(source_lang_vec.index_to_key) addition = min(sample_num, len(lexicon) - len(source_words)) lexicon = self.random_state.choice(list(lexicon.difference(source_words)), addition) source_space = Space.build(source_lang_vec, set(source_words).union(set(lexicon))) diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 8495cb9862..3e79688490 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -70,7 +70,7 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): outfiletsvmeta = tensor_filename + '_metadata.tsv' with utils.open(outfiletsv, 'wb') as file_vector, utils.open(outfiletsvmeta, 'wb') as file_metadata: - for word in model.index2word: + for word in model.index_to_key: file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n')) vector_row = '\t'.join(str(x) for x in model[word]) file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n')) diff --git a/gensim/similarities/annoy.py b/gensim/similarities/annoy.py index 9f8b8fdbc0..57808f1b3b 100644 --- a/gensim/similarities/annoy.py +++ b/gensim/similarities/annoy.py @@ -151,7 +151,7 @@ def load(self, fname): def build_from_word2vec(self): """Build an Annoy index using word vectors from a Word2Vec model.""" return self._build_from_model( - self.model.wv.get_normed_vectors(), self.model.wv.index2word, self.model.vector_size, + self.model.wv.get_normed_vectors(), self.model.wv.index_to_key, self.model.vector_size, ) def build_from_doc2vec(self): @@ -163,7 +163,7 @@ def build_from_doc2vec(self): def build_from_keyedvectors(self): """Build an Annoy index using word vectors from a KeyedVectors model.""" return self._build_from_model( - self.model.get_normed_vectors(), self.model.index2word, self.model.vector_size, + self.model.get_normed_vectors(), self.model.index_to_key, self.model.vector_size, ) def _build_from_model(self, vectors, labels, num_features): diff --git a/gensim/similarities/nmslib.py b/gensim/similarities/nmslib.py index b70a9f4e43..7ff78539c1 100644 --- a/gensim/similarities/nmslib.py +++ b/gensim/similarities/nmslib.py @@ -187,7 +187,7 @@ def load(cls, fname): def _build_from_word2vec(self): """Build an NMSLIB index using word vectors from a Word2Vec model.""" - self._build_from_model(self.model.wv.get_normed_vectors(), self.model.wv.index2word) + self._build_from_model(self.model.wv.get_normed_vectors(), self.model.wv.index_to_key) def _build_from_doc2vec(self): """Build an NMSLIB index using document vectors from a Doc2Vec model.""" @@ -197,7 +197,7 @@ def _build_from_doc2vec(self): def _build_from_keyedvectors(self): """Build an NMSLIB index using word vectors from a KeyedVectors model.""" - self._build_from_model(self.model.get_normed_vectors(), self.model.index2word) + self._build_from_model(self.model.get_normed_vectors(), self.model.index_to_key) def _build_from_model(self, vectors, labels): index = nmslib.init(method='hnsw', space='cosinesimil') diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index aa958b744d..e402b1355a 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -109,7 +109,7 @@ def obsolete_testLoadOldModel(self): model = doc2vec.Doc2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (3955, 100)) self.assertTrue(len(model.wv) == 3955) - self.assertTrue(len(model.wv.index2word) == 3955) + self.assertTrue(len(model.wv.index_to_key) == 3955) 
self.assertIsNone(model.corpus_total_words) self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) @@ -129,7 +129,7 @@ def obsolete_testLoadOldModelSeparates(self): model = doc2vec.Doc2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (3955, 100)) self.assertTrue(len(model.wv) == 3955) - self.assertTrue(len(model.wv.index2word) == 3955) + self.assertTrue(len(model.wv.index_to_key) == 3955) self.assertIsNone(model.corpus_total_words) self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size)) self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 3ab8f33afa..fd96f9f26f 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -80,7 +80,7 @@ def test_most_similar_raises_keyerror(self): def test_most_similar_restrict_vocab(self): """Test most_similar returns handles restrict_vocab correctly.""" - expected = set(self.vectors.index2word[:5]) + expected = set(self.vectors.index_to_key[:5]) predicted = set(result[0] for result in self.vectors.most_similar('war', topn=5, restrict_vocab=5)) self.assertEqual(expected, predicted) diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py index 67b2668e02..98970525a2 100644 --- a/gensim/test/test_poincare.py +++ b/gensim/test/test_poincare.py @@ -278,7 +278,7 @@ def test_most_similar_raises_keyerror(self): def test_most_similar_restrict_vocab(self): """Test most_similar returns handles restrict_vocab correctly.""" - expected = set(self.vectors.index2word[:5]) + expected = set(self.vectors.index_to_key[:5]) predicted = set(result[0] for result in self.vectors.most_similar('dog.n.01', topn=5, restrict_vocab=5)) self.assertEqual(expected, predicted) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 9c91e8926d..6a0321fdbe 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -591,7 +591,7 @@ def testLoadMissingRaisesError(self): def assertVectorIsSimilarToItself(self, wv, index): vector = wv.get_normed_vectors()[0] - label = wv.index2word[0] + label = wv.index_to_key[0] approx_neighbors = index.most_similar(vector, 1) word, similarity = approx_neighbors[0] @@ -748,7 +748,7 @@ def test_load_missing_raises_error(self): def assertVectorIsSimilarToItself(self, wv, index): vector = wv.get_normed_vectors()[0] - label = wv.index2word[0] + label = wv.index_to_key[0] approx_neighbors = index.most_similar(vector, 1) word, similarity = approx_neighbors[0] diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 8846dc617d..578be26941 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -19,10 +19,11 @@ def setUp(self): self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt") - self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"), + self.word_pairs = [ + ("one", "uno"), ("two", "due"), ("three", "tre"), ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"), ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"), - ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana") + ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", 
"banana"), ] self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")] @@ -53,7 +54,7 @@ def test_translate_nn(self): test_source_word, test_target_word = zip(*self.test_word_pairs) translated_words = model.translate( - test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec + test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec, ) for idx, item in enumerate(self.test_word_pairs): @@ -96,7 +97,7 @@ def setUp(self): def test_translation_matrix(self): model = translation_matrix.BackMappingTranslationMatrix( - self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] + self.source_doc_vec, self.target_doc_vec, self.train_docs[:5], ) transmat = model.train(self.train_docs[:5]) self.assertEqual(transmat.shape, (8, 8)) @@ -108,7 +109,7 @@ def test_infer_vector(self): replaces a nonsensical test. """ model = translation_matrix.BackMappingTranslationMatrix( - self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] + self.source_doc_vec, self.target_doc_vec, self.train_docs[:5], ) model.train(self.train_docs[:5]) backmapped_vec = model.infer_vector(self.target_doc_vec.dv[self.train_docs[5].tags]) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 001ad4c365..a1d766bdb8 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -571,9 +571,9 @@ def testEvaluateWordPairs(self): pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson) - self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman) - self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov) + self.assertTrue(0.1 < pearson < 1.0, "pearson {pearson} not between 0.1 & 1.0") + self.assertTrue(0.1 < spearman < 1.0, "spearman {spearman} not between 0.1 and 1.0") + self.assertTrue(0.0 <= oov < 90.0, "OOV {oov} not between 0.0 and 90.0") @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testEvaluateWordPairsFromFile(self): @@ -586,9 +586,9 @@ def testEvaluateWordPairsFromFile(self): pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson) - self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman) - self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov) + self.assertTrue(0.1 < pearson < 1.0, f"pearson {pearson} not between 0.1 & 1.0") + self.assertTrue(0.1 < spearman < 1.0, f"spearman {spearman} not between 0.1 and 1.0") + self.assertTrue(0.0 <= oov < 90.0, f"OOV {oov} not between 0.0 and 90.0") def model_sanity(self, model, train=True, with_corpus_file=False, ranks=None): """Even tiny models trained on LeeCorpus should pass these sanity checks""" @@ -606,7 +606,7 @@ def model_sanity(self, model, train=True, with_corpus_file=False, ranks=None): self.assertFalse((orig0 == model.wv.vectors[1]).all()) # vector should vary after training query_word = 'attacks' expected_word = 'bombings' - sims = model.wv.most_similar(query_word, topn=len(model.wv.index2word)) + sims = model.wv.most_similar(query_word, topn=len(model.wv.index_to_key)) t_rank = [word for word, score in sims].index(expected_word) # in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war' if ranks is not None: @@ -855,7 +855,7 @@ def 
testLoadOldModel(self): model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv) == 12) - self.assertTrue(len(model.wv.index2word) == 12) + self.assertTrue(len(model.wv.index_to_key) == 12) self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size)) self.assertTrue(len(model.wv.vectors_lockf.shape) > 0) self.assertTrue(model.cum_table.shape == (12,)) @@ -870,7 +870,7 @@ def testLoadOldModelSeparates(self): model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv) == 12) - self.assertTrue(len(model.wv.index2word) == 12) + self.assertTrue(len(model.wv.index_to_key) == 12) self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size)) self.assertTrue(len(model.wv.vectors_lockf.shape) > 0) self.assertTrue(model.cum_table.shape == (12,)) diff --git a/gensim/viz/poincare.py b/gensim/viz/poincare.py index f20fd8ab2d..ba91f103dd 100644 --- a/gensim/viz/poincare.py +++ b/gensim/viz/poincare.py @@ -51,7 +51,7 @@ def poincare_2d_visualization(model, tree, figure_title, num_nodes=50, show_node if vectors.shape[1] != 2: raise ValueError('Can only plot 2-D vectors') - node_labels = model.kv.index2word + node_labels = model.kv.index_to_key nodes_x = list(vectors[:, 0]) nodes_y = list(vectors[:, 1]) nodes = go.Scatter( From 17da21e4e3a5d73275ac32e501bf152500a49ee5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 15:55:06 +0200 Subject: [PATCH 14/19] finish method renaming - add() => add_vectors() - add_one() => add_vector() --- gensim/models/keyedvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 5b8e010c26..5d5f01e84c 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -429,7 +429,7 @@ def word_vec(self, *args, **kwargs): """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector().""" return self.get_vector(*args, **kwargs) - def add_new_vector(self, key, vector): + def add_vector(self, key, vector): """Add one new vector at the given key, into existing slot if available. Warning: using this repeatedly is inefficient, requiring a full reallocation & copy, @@ -1718,7 +1718,7 @@ def _add_word_to_kv(kv, counts, word, weights, vocab_size): if kv.has_index_for(word): logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word) return - word_id = kv.add_new_vector(word, weights) + word_id = kv.add_vector(word, weights) if counts is None: # Most common scenario: no vocab file given. Just make up some bogus counts, in descending order. 
From f0cade1af9dc9177cd2b6d47cac9b19cf295965d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 20:05:10 +0200 Subject: [PATCH 15/19] Update gensim/models/word2vec.py Co-authored-by: Michael Penkov --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b904fa038e..f13ca69e0c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1852,7 +1852,7 @@ def _load_specials(self, *args, **kwargs): if not hasattr(self, 'corpus_total_words'): self.corpus_total_words = None if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'): - self.wv.vectors_lockf = getattr(self, 'vectors_lockf', np.ones(1, dtype=REAL)) + self.wv.vectors_lockf = np.ones(1, dtype=REAL) if not hasattr(self, 'random'): # use new instance of numpy's recommended generator/algorithm self.random = np.random.default_rng(seed=self.seed) From 6fa5a1b464b22bbebea8389fffa3581a04a28ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 19 Sep 2020 20:15:33 +0200 Subject: [PATCH 16/19] a few more style fixes --- gensim/models/word2vec.py | 92 +++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index f13ca69e0c..8fbea05dfe 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -167,21 +167,27 @@ # file-based word2vec is not supported CORPUSFILE_VERSION = -1 - def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss): + def train_epoch_sg( + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): raise RuntimeError("Training with corpus_file argument is not supported") - def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, - _work, _neu1, compute_loss): + def train_epoch_cbow( + model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words, + _work, _neu1, compute_loss, + ): raise RuntimeError("Training with corpus_file argument is not supported") class Word2Vec(utils.SaveLoad): - def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - comment=None, max_final_vocab=None): + def __init__( + self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, + max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + comment=None, max_final_vocab=None, + ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. 
Once you're finished training a model (=no more updates, only querying) @@ -380,8 +386,10 @@ def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rul total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks) - def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs): + def build_vocab( + self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs, + ): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Parameters @@ -427,7 +435,9 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.prepare_weights(update=update) - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): + def build_vocab_from_freq( + self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False, + ): """Build vocabulary from a dictionary of word frequencies. Parameters @@ -522,7 +532,8 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, def prepare_vocab( self, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False): + min_count=None, sample=None, dry_run=False, + ): """Apply vocabulary settings for `min_count` (discarding less-frequent words) and `sample` (controlling the downsampling of more-frequent words). @@ -830,8 +841,10 @@ def init_sims(self, replace=False): """ self.wv.init_sims(replace=replace) - def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs): + def _do_train_epoch( + self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=None, total_words=None, **kwargs, + ): work, neu1 = thread_private_mem if self.sg: @@ -873,10 +886,12 @@ def _clear_post_train(self): """Clear any cached vector lengths from the model.""" self.wv.norms = None - def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), - **kwargs): + def train( + self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, word_count=0, + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), + **kwargs, + ): """Update the model's neural weights from a sequence of sentences. Notes @@ -891,7 +906,7 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot -------- To avoid common mistakes around the model's ability to do multiple training passes itself, an explicit `epochs` argument **MUST** be provided. In the common and recommended case - where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.iter`. + where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.epochs`. 
Parameters ---------- @@ -944,7 +959,7 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot >>> >>> model = Word2Vec(min_count=1) >>> model.build_vocab(sentences) # prepare the model vocabulary - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) # train word vectors + >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) # train word vectors (1, 30) """ @@ -1000,8 +1015,10 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot callback.on_train_end(self) return trained_word_count, raw_word_count - def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, - total_examples=None, total_words=None, **kwargs): + def _worker_loop_corpusfile( + self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, + total_examples=None, total_words=None, **kwargs, + ): """Train the model on a `corpus_file` in LineSentence format. This function will be called in parallel by multiple workers (threads or processes) to make @@ -1147,8 +1164,10 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No job_queue.put(None) logger.debug("job loop exiting, total %i jobs", job_no) - def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, - total_words=None, report_delay=1.0, is_corpus_file_mode=None): + def _log_epoch_progress( + self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, + total_words=None, report_delay=1.0, is_corpus_file_mode=None, + ): """Get the progress report for a single training epoch. Parameters @@ -1220,7 +1239,8 @@ def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, return trained_word_count, raw_word_count, job_tally def _train_epoch_corpusfile( - self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs): + self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs, + ): """Train the model for a single epoch. Parameters @@ -1283,8 +1303,10 @@ def _train_epoch_corpusfile( return trained_word_count, raw_word_count, job_tally - def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, - queue_factor=2, report_delay=1.0, callbacks=()): + def _train_epoch( + self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, + queue_factor=2, report_delay=1.0, callbacks=(), + ): """Train the model for a single epoch. Parameters @@ -1449,8 +1471,10 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N self.hs, self.sample, self.negative, self.window ) - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed): + def _log_progress( + self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed + ): """Callback used to log progress for long running jobs. 
Parameters @@ -1500,8 +1524,10 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) ) - def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode): + def _log_epoch_end( + self, cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode + ): """Callback used to log the end of a training epoch. Parameters @@ -1800,7 +1826,7 @@ def save(self, *args, **kwargs): def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): """Arrange any special handling for the `gensim.utils.SaveLoad` protocol.""" # don't save properties that are merely calculated from others - ignore = set(it.chain(ignore, ('cum_table',))) + ignore = set(ignore).union(['cum_table', ]) return super(Word2Vec, self)._save_specials( fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) From e95ac0ac15dc27c256b8cba5cc1ddb0c1b655f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 20 Sep 2020 11:29:49 +0200 Subject: [PATCH 17/19] fix nonsensical word2vec path examples --- gensim/models/word2vec.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 8fbea05dfe..741a26e1ad 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -66,22 +66,25 @@ .. sourcecode:: pycon - >>> vector = model.wv['computer'] # numpy vector of a word + >>> vector = model.wv['computer'] # get numpy vector of a word The reason for separating the trained vectors into `KeyedVectors` is that if you don't -need the full model state any more (don't need to continue training), the state can discarded, -resulting in a much smaller and faster object that can be mmapped for lightning +need the full model state any more (don't need to continue training), the state can discarded. +This results in a much smaller and faster object that can be mmapped for lightning fast loading and sharing the vectors in RAM between processes: .. sourcecode:: pycon >>> from gensim.models import KeyedVectors >>> - >>> path = get_tmpfile("wordvectors.kv") + >>> # Store just the words + their trained embeddings. + >>> word_vectors = model.wv + >>> word_vectors.save("word2vec.wordvectors") + >>> + >>> # Load back with memory-mapping = read-only, shared across processes. + >>> wv = KeyedVectors.load("word2vec.wordvectors", mmap='r') >>> - >>> model.wv.save(path) - >>> wv = KeyedVectors.load("model.wv", mmap='r') - >>> vector = wv['computer'] # numpy vector of a word + >>> vector = wv['computer'] # Get numpy vector of a word Gensim can also load word vectors in the "word2vec C format", as a :class:`~gensim.models.keyedvectors.KeyedVectors` instance: @@ -90,8 +93,10 @@ >>> from gensim.test.utils import datapath >>> - >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format - >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format + >>> # Load a word2vec model stored in the C *text* format. + >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) + >>> # Load a word2vec model stored in the C *binary* format. 
+ >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) It is impossible to continue training the vectors loaded from the C format because the hidden weights, vocabulary frequencies and the binary tree are missing. To continue training, you'll need the @@ -150,7 +155,7 @@ logger = logging.getLogger(__name__) try: - from gensim.models.word2vec_inner import ( # noqa: F401 + from gensim.models.word2vec_inner import ( train_batch_sg, train_batch_cbow, score_sentence_sg, From dc9c3fc79797f8186f933ecce0dea6b9a9345cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 20 Sep 2020 12:02:32 +0200 Subject: [PATCH 18/19] more doc fixes --- gensim/models/word2vec.py | 92 +++++++++++++++++++++++++++++---------- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 741a26e1ad..6c934d7746 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -8,6 +8,7 @@ """ Introduction ============ + This module implements the word2vec family of algorithms, using highly optimized C routines, data streaming and Pythonic interfaces. @@ -21,17 +22,15 @@ There are more ways to train word vectors in Gensim than just Word2Vec. See also :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and -wrappers for :class:`~gensim.models.wrappers.VarEmbed` and :class:`~gensim.models.wrappers.WordRank`. +wrappers for :class:`~gensim.models.wrappers.varembed.VarEmbed` and :class:`~gensim.models.wrappers.wordrank.WordRank`. The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ -and extended with additional functionality and optimizations over the years. +and extended with additional functionality and +`optimizations `_ over the years. For a tutorial on Gensim word2vec, with an interactive web app trained on GoogleNews, visit https://rare-technologies.com/word2vec-tutorial/. -**Make sure you have a C compiler before installing Gensim, to use the optimized word2vec routines** -(70x speedup compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/). - Usage examples ============== @@ -42,17 +41,17 @@ >>> from gensim.test.utils import common_texts >>> from gensim.models import Word2Vec >>> - >>> model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1, workers=4) + >>> model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4) >>> model.save("word2vec.model") -The training is streamed, so ``sentences`` can be an iterable, reading input data -from disk on-the-fly. This lets you avoid loading the entire corpus into RAM. -However, note that because the iterable must be re-startable, `sentences` must -not be a generator. For an example of an appropriate iterator see -:class:`~gensim.models.word2vec.BrownCorpus`, -:class:`~gensim.models.word2vec.Text8Corpus` or -:class:`~gensim.models.word2vec.LineSentence`. +**The training is streamed, so ``sentences`` can be an iterable**, reading input data +from the disk or network on-the-fly, without loading your entire corpus into RAM. + +Note the ``sentences`` iterable must be *restartable* (not just a generator), to allow the algorithm +to stream over your dataset multiple times. For some examples of streamed iterables, +see :class:`~gensim.models.word2vec.BrownCorpus`, +:class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`. 
If you save the model you can continue training it later: @@ -62,14 +61,16 @@ >>> model.train([["hello", "world"]], total_examples=1, epochs=1) (0, 2) -The trained word vectors are stored in a :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `model.wv`: +The trained word vectors are stored in a :class:`~gensim.models.keyedvectors.KeyedVectors` instance, as `model.wv`: .. sourcecode:: pycon >>> vector = model.wv['computer'] # get numpy vector of a word The reason for separating the trained vectors into `KeyedVectors` is that if you don't -need the full model state any more (don't need to continue training), the state can discarded. +need the full model state any more (don't need to continue training), its state can discarded, +keeping just the vectors and their keys proper. + This results in a much smaller and faster object that can be mmapped for lightning fast loading and sharing the vectors in RAM between processes: @@ -103,8 +104,8 @@ full :class:`~gensim.models.word2vec.Word2Vec` object state, as stored by :meth:`~gensim.models.word2vec.Word2Vec.save`, not just the :class:`~gensim.models.keyedvectors.KeyedVectors`. -You can perform various NLP word tasks with a trained model. Some of them -are already built-in - you can see it in :mod:`gensim.models.keyedvectors`. +You can perform various NLP tasks with a trained model. Some of the operations +are already built-in - see :mod:`gensim.models.keyedvectors`. If you're finished training a model (i.e. no more updates, only querying), you can switch to the :class:`~gensim.models.keyedvectors.KeyedVectors` instance: @@ -116,18 +117,65 @@ to trim unneeded model state = use much less RAM and allow fast loading and memory sharing (mmap). -Note that there is a :mod:`gensim.models.phrases` module which lets you automatically -detect phrases longer than one word. Using phrases, you can learn a word2vec model -where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: +Embeddings with multiword ngrams +================================ + +There is a :mod:`gensim.models.phrases` module which lets you automatically +detect phrases longer than one word, using collocation statistics. +Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, +such as `new_york_times` or `financial_crisis`: .. sourcecode:: pycon - >>> from gensim.test.utils import common_texts >>> from gensim.models import Phrases >>> + >>> # Train a bigram detector. >>> bigram_transformer = Phrases(common_texts) + >>> + >>> # Apply the trained MWE detector to a corpus, using the result to train a Word2vec model. >>> model = Word2Vec(bigram_transformer[common_texts], min_count=1) +Pretrained models +================= + +Gensim comes with several already pre-trained models, in the +`Gensim-data repository `_: + +.. 
sourcecode:: pycon + + >>> import gensim.downloader + >>> # Show all available models in gensim-data + >>> print(list(gensim.downloader.info()['models'].keys())) + ['fasttext-wiki-news-subwords-300', + 'conceptnet-numberbatch-17-06-300', + 'word2vec-ruscorpora-300', + 'word2vec-google-news-300', + 'glove-wiki-gigaword-50', + 'glove-wiki-gigaword-100', + 'glove-wiki-gigaword-200', + 'glove-wiki-gigaword-300', + 'glove-twitter-25', + 'glove-twitter-50', + 'glove-twitter-100', + 'glove-twitter-200', + '__testing_word2vec-matrix-synopsis'] + >>> + >>> # Download the "glove-twitter-25" embeddings + >>> glove_vectors = gensim.downloader.load('glove-twitter-25') + >>> + >>> # Use the downloaded vectors as usual: + >>> glove_vectors.most_similar('twitter') + [('facebook', 0.948005199432373), + ('tweet', 0.9403423070907593), + ('fb', 0.9342358708381653), + ('instagram', 0.9104824066162109), + ('chat', 0.8964964747428894), + ('hashtag', 0.8885937333106995), + ('tweets', 0.8878158330917358), + ('tl', 0.8778461217880249), + ('link', 0.8778210878372192), + ('internet', 0.8753897547721863)] + """ from __future__ import division # py3 "true division" @@ -155,7 +203,7 @@ logger = logging.getLogger(__name__) try: - from gensim.models.word2vec_inner import ( + from gensim.models.word2vec_inner import ( # noqa: F401 train_batch_sg, train_batch_cbow, score_sentence_sg, From da8847a04f9ee56702cb81a0218cd5a57e1f24e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Thu, 24 Sep 2020 11:49:08 +0200 Subject: [PATCH 19/19] `it` => `itertools`, + code style fixes --- gensim/models/fasttext.py | 3 +- gensim/models/phrases.py | 58 +++++++++++++++++---------------------- gensim/models/word2vec.py | 12 ++++---- 3 files changed, 32 insertions(+), 41 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index fa221a011d..3476f7c5dc 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -280,7 +280,6 @@ from collections.abc import Iterable import numpy as np -import itertools as it from numpy import ones, vstack, float32 as REAL import gensim.models._fasttext_bin @@ -1287,7 +1286,7 @@ def save(self, *args, **kwargs): def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): """Arrange any special handling for the gensim.utils.SaveLoad protocol""" # don't save properties that are merely calculated from others - ignore = set(it.chain(ignore, ('buckets_word', 'vectors'))) + ignore = set(ignore).union(['buckets_word', 'vectors', ]) return super(FastTextKeyedVectors, self)._save_specials( fname, separately, sep_limit, ignore, pickle_protocol, compress, subname) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index d7001830ee..9460619db8 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -62,20 +62,14 @@ import os import logging from collections import defaultdict -import functools as ft -import itertools as it +import functools +import itertools from math import log import pickle -import six - -from six import iteritems, string_types, PY2, next +from inspect import getfullargspec as getargspec from gensim import utils, interfaces -if PY2: - from inspect import getargspec -else: - from inspect import getfullargspec as getargspec logger = logging.getLogger(__name__) @@ -101,11 +95,11 @@ def _is_single(obj): temp_iter = obj_iter try: peek = next(obj_iter) - obj_iter = it.chain([peek], obj_iter) + obj_iter = itertools.chain([peek], obj_iter) except StopIteration: # An empty 
object is a single document return True, obj - if isinstance(peek, string_types): + if isinstance(peek, str): # It's a document, return the iterator return True, obj_iter if temp_iter is obj: @@ -116,7 +110,7 @@ def _is_single(obj): return False, obj -class SentenceAnalyzer(object): +class SentenceAnalyzer: """Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`.""" def score_item(self, worda, wordb, components, scorer): """Get bi-gram score statistics. @@ -194,7 +188,7 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer): in_between = [] else: # release words individually - for w in it.chain([last_uncommon], in_between): + for w in itertools.chain([last_uncommon], in_between): yield (w, None) in_between = [] last_uncommon = word @@ -242,7 +236,7 @@ def load(cls, *args, **kwargs): model.scoring = original_scorer # if there is a scoring parameter, and it's a text value, load the proper scoring function if hasattr(model, 'scoring'): - if isinstance(model.scoring, six.string_types): + if isinstance(model.scoring, str): if model.scoring == 'default': logger.info('older version of %s loaded with "default" scoring parameter', cls.__name__) logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility') @@ -290,7 +284,7 @@ def _sentence2token(phrase_class, sentence): delimiter = phrase_class.delimiter if hasattr(phrase_class, 'vocab'): - scorer = ft.partial( + scorer = functools.partial( phrase_class.scoring, len_vocab=float(len(phrase_class.vocab)), min_count=float(phrase_class.min_count), @@ -311,9 +305,11 @@ def _sentence2token(phrase_class, sentence): class Phrases(SentenceAnalyzer, PhrasesTransformation): """Detect phrases based on collocation counts.""" - def __init__(self, sentences=None, min_count=5, threshold=10.0, - max_vocab_size=40000000, delimiter=b'_', progress_per=10000, - scoring='default', common_terms=frozenset()): + def __init__( + self, sentences=None, min_count=5, threshold=10.0, + max_vocab_size=40000000, delimiter=b'_', progress_per=10000, + scoring='default', common_terms=frozenset(), + ): """ Parameters @@ -378,16 +374,16 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # intentially override the value of the scoring parameter rather than set self.scoring here, # to still run the check of scoring function parameters in the next code block - if isinstance(scoring, six.string_types): + if isinstance(scoring, str): if scoring == 'default': scoring = original_scorer elif scoring == 'npmi': scoring = npmi_scorer else: - raise ValueError('unknown scoring method string %s specified' % (scoring)) + raise ValueError(f'unknown scoring method string {scoring} specified') scoring_parameters = [ - 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count' + 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count', ] if callable(scoring): if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters): @@ -407,13 +403,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # ensure picklability of custom scorer try: - test_pickle = pickle.dumps(self.scoring) - load_pickle = pickle.loads(test_pickle) + pickle.loads(pickle.dumps(self.scoring)) except pickle.PickleError: - raise pickle.PickleError('unable to pickle custom Phrases scoring function') - finally: - del(test_pickle) - del(load_pickle) + raise pickle.PickleError('Custom Phrases scoring function must be 
pickle-able') if sentences is not None: self.add_vocab(sentences) @@ -442,7 +434,7 @@ def __str__(self): """Get short string representation of this phrase detector.""" return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % ( self.__class__.__name__, len(self.vocab), self.min_count, - self.threshold, self.max_vocab_size + self.threshold, self.max_vocab_size, ) @staticmethod @@ -510,7 +502,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, if word not in common_terms: vocab[word] += 1 if last_uncommon is not None: - components = it.chain([last_uncommon], in_between, [word]) + components = itertools.chain([last_uncommon], in_between, [word]) vocab[delimiter.join(components)] += 1 last_uncommon = word in_between = [] @@ -569,7 +561,7 @@ def add_vocab(self, sentences): if len(self.vocab) > 0: logger.info("merging %i counts into %s", len(vocab), self) self.min_reduce = max(self.min_reduce, min_reduce) - for word, count in iteritems(vocab): + for word, count in vocab.items(): self.vocab[word] += count if len(self.vocab) > self.max_vocab_size: utils.prune_vocab(self.vocab, self.min_reduce) @@ -612,11 +604,11 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): ... pass """ - analyze_sentence = ft.partial( + analyze_sentence = functools.partial( self.analyze_sentence, threshold=self.threshold, common_terms=self.common_terms, - scorer=ft.partial( + scorer=functools.partial( self.scoring, len_vocab=float(len(self.vocab)), min_count=float(self.min_count), @@ -780,7 +772,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()): for i in range(1, len(unigrams)): if unigrams[i - 1] not in common_terms: # do not join common terms - cterms = list(it.takewhile(lambda w: w in common_terms, unigrams[i:])) + cterms = list(itertools.takewhile(lambda w: w in common_terms, unigrams[i:])) tail = unigrams[i + len(cterms):] components = [sep.join(unigrams[:i])] + cterms if tail: diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 6c934d7746..806e087c56 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -188,7 +188,7 @@ from collections import defaultdict, namedtuple from types import GeneratorType import threading -import itertools as it +import itertools import copy from queue import Queue, Empty @@ -2054,7 +2054,7 @@ def __iter__(self): # Assume it is a file-like object and try treating it as such # Things that don't have seek will trigger an exception self.source.seek(0) - for line in it.islice(self.source, self.limit): + for line in itertools.islice(self.source, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2063,7 +2063,7 @@ def __iter__(self): except AttributeError: # If it didn't work like a file, use it as a string filename with utils.open(self.source, 'rb') as fin: - for line in it.islice(fin, self.limit): + for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2117,7 +2117,7 @@ def __iter__(self): for file_name in self.input_files: logger.info('reading file %s', file_name) with utils.open(file_name, 'rb') as fin: - for line in it.islice(fin, self.limit): + for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 while i < len(line): @@ -2126,12 +2126,12 @@ def __iter__(self): class Word2VecVocab(utils.SaveLoad): - """Obsolete class retained for now as load-compatibility state capture""" + """Obsolete class retained for now as load-compatibility state 
capture.""" pass class Word2VecTrainables(utils.SaveLoad): - """Obsolete class retained for now as load-compatibility state capture""" + """Obsolete class retained for now as load-compatibility state capture.""" pass