Update changelog for 4.0.0 release #2981

Merged · 23 commits · Oct 28, 2020
785 changes: 436 additions & 349 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion gensim/models/callbacks.py
@@ -61,7 +61,7 @@
...
>>>
>>> epoch_logger = EpochLogger()
>>> w2v_model = Word2Vec(common_texts, iter=5, size=10, min_count=0, seed=42, callbacks=[epoch_logger])
>>> w2v_model = Word2Vec(common_texts, epochs=5, vector_size=10, min_count=0, seed=42, callbacks=[epoch_logger])
Epoch #0 start
Epoch #0 end
Epoch #1 start
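For context (editorial note, not part of this diff): the `EpochLogger` used in the doctest above is a callback defined earlier in the same docstring. A minimal sketch of such a callback, assuming gensim's `CallbackAny2Vec` base class from `gensim.models.callbacks`, looks roughly like this:

.. sourcecode:: pycon

    >>> from gensim.models.callbacks import CallbackAny2Vec
    >>>
    >>> class EpochLogger(CallbackAny2Vec):
    ...     '''Callback to log information about training (sketch only).'''
    ...     def __init__(self):
    ...         self.epoch = 0
    ...     def on_epoch_begin(self, model):
    ...         print("Epoch #{} start".format(self.epoch))
    ...     def on_epoch_end(self, model):
    ...         print("Epoch #{} end".format(self.epoch))
    ...         self.epoch += 1
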
2 changes: 1 addition & 1 deletion gensim/models/doc2vec.py
@@ -751,7 +751,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
@deprecated(
"Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. "
"init_sims() is now obsoleted and will be completely removed in future versions. "
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
)
def init_sims(self, replace=False):
"""
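Background on the deprecation above (editorial note, not part of this diff): in Gensim 4.x a unit-normalized vector can be requested on demand, so precomputing norms with `init_sims()` is unnecessary. A minimal sketch using the KeyedVectors API (`get_vector` with `norm=True`):

.. sourcecode:: pycon

    >>> # instead of calling model.init_sims(), ask for a normalized vector directly
    >>> normed = model.wv.get_vector('computer', norm=True)
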
51 changes: 21 additions & 30 deletions gensim/models/fasttext.py
@@ -30,7 +30,7 @@

.. sourcecode:: pycon

>>> # from gensim.models import FastText # FIXME: why does Sphinx dislike this import?
>>> from gensim.models import FastText
>>> from gensim.test.utils import common_texts # some example sentences
>>>
>>> print(common_texts[0])
@@ -50,16 +50,7 @@

.. sourcecode:: pycon

>>> model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, iter=10)

.. Important::
This style of initialize-and-train in a single line is **deprecated**. We include it here
for backward compatibility only.

Please use the initialize-`build_vocab`-`train` pattern above instead, including using `epochs`
instead of `iter`.
The motivation is to simplify the API and resolve naming inconsistencies,
e.g. the iter parameter to the constructor is called epochs in the train function.
>>> model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

The two models above are instantiated differently, but behave identically.
For example, we can compare the embeddings they've calculated for the word "computer":
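The comparison itself lies outside the visible hunk; a minimal sketch of such a check, assuming both models above have finished training, might be:

.. sourcecode:: pycon

    >>> import numpy as np
    >>>
    >>> np.allclose(model.wv['computer'], model2.wv['computer'], atol=1e-4)  # sketch, not part of this diff
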
@@ -139,7 +130,7 @@

>>> import numpy as np
>>>
>>> 'computation' in model.wv.vocab # New word, currently out of vocab
>>> 'computation' in model.wv.key_to_index # New word, currently out of vocab
False
>>> old_vector = np.copy(model.wv['computation']) # Grab the existing vector
>>> new_sentences = [
@@ -157,7 +148,7 @@
>>> new_vector = model.wv['computation']
>>> np.allclose(old_vector, new_vector, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computation' in model.wv.vocab # Word is still out of vocab
>>> 'computation' in model.wv.key_to_index # Word is still out of vocab
False

.. Important::
@@ -178,15 +169,15 @@

.. sourcecode:: pycon

>>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab
>>> 'computer' in fb_model.wv.key_to_index # New word, currently out of vocab
False
>>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors
>>> fb_model.build_vocab(new_sentences, update=True)
>>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_model.wv['computer']
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary
>>> 'computer' in fb_model.wv.key_to_index # New word is now in the vocabulary
True

If you do not intend to continue training the model, consider using the
@@ -200,25 +191,25 @@
>>> cap_path = datapath("crime-and-punishment.bin")
>>> wv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in wv.vocab # Word is out of vocabulary
>>> 'landlord' in wv.key_to_index # Word is out of vocabulary
False
>>> oov_vector = wv['landlord']
>>> oov_vector = wv['landlord'] # Even OOV words have vectors in FastText
>>>
>>> 'landlady' in wv.vocab # Word is in the vocabulary
>>> 'landlady' in wv.key_to_index # Word is in the vocabulary
True
>>> iv_vector = wv['landlady']

Retrieve word-vector for vocab and out-of-vocab word:
Retrieve the word-vector for vocab and out-of-vocab word:

.. sourcecode:: pycon

>>> existent_word = "computer"
>>> existent_word in model.wv.vocab
>>> existent_word in model.wv.key_to_index
True
>>> computer_vec = model.wv[existent_word] # numpy vector of a word
>>>
>>> oov_word = "graph-out-of-vocab"
>>> oov_word in model.wv.vocab
>>> oov_word in model.wv.key_to_index
False
>>> oov_vec = model.wv[oov_word] # numpy vector for OOV word

@@ -488,9 +479,9 @@ def estimate_memory(self, vocab_size=None, report=None):
hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket)
num_ngrams += len(hashes)
# A list (64 bytes) with one np.array (100 bytes) per key, with a total of
# num_ngrams uint32s (4 bytes) amongst them
# Only used during training, not stored with the model
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # FIXME: caching & calc sensible?
# num_ngrams uint32s (4 bytes) amongst them.
# Only used during training, not stored with the model.
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # TODO: caching & calc sensible?
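# Editorial worked example (hypothetical figures, not from this diff): with a 10,000-word
# vocabulary averaging 15 char-ngrams per word, this estimate comes to
# 64 + 100 * 10,000 + 4 * 150,000 = 1,600,064 bytes, i.e. roughly 1.5 MiB.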
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
@@ -541,7 +532,7 @@ def _do_train_job(self, sentences, alpha, inits):
@deprecated(
"Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. "
"init_sims() is now obsoleted and will be completely removed in future versions. "
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
)
def init_sims(self, replace=False):
"""
@@ -699,11 +690,11 @@ def load_facebook_model(path, encoding='utf-8'):
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_model = load_facebook_model(cap_path)
>>>
>>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary
>>> 'landlord' in fb_model.wv.key_to_index # Word is out of vocabulary
False
>>> oov_term = fb_model.wv['landlord']
>>>
>>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary
>>> 'landlady' in fb_model.wv.key_to_index # Word is in the vocabulary
True
>>> iv_term = fb_model.wv['landlady']
>>>
@@ -764,11 +755,11 @@ def load_facebook_vectors(path, encoding='utf-8'):
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fbkv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in fbkv.vocab # Word is out of vocabulary
>>> 'landlord' in fbkv.key_to_index # Word is out of vocabulary
False
>>> oov_vector = fbkv['landlord']
>>>
>>> 'landlady' in fbkv.vocab # Word is in the vocabulary
>>> 'landlady' in fbkv.key_to_index # Word is in the vocabulary
True
>>> iv_vector = fbkv['landlady']
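
As a usage note (editorial, not part of this diff): `load_facebook_vectors` returns a `FastTextKeyedVectors` instance, so the usual KeyedVectors queries should apply to it, for example:

.. sourcecode:: pycon

    >>> similar = fbkv.most_similar('landlady', topn=3)  # sketch; nearest neighbours of an in-vocab word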

@@ -1193,7 +1184,7 @@ def recalc_char_ngram_buckets(self):
Scan the vocabulary, calculate ngrams and their hashes, and cache the list of ngrams for each known word.

"""
# FIXME: evaluate if precaching even necessary, compared to recalculating as needed
# TODO: evaluate if precaching even necessary, compared to recalculating as needed.
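# Editorial note (not in the diff): buckets_word caches, for each vocabulary word, the uint32
# array of hashed char-ngram bucket indices, so training does not have to re-hash every
# word's ngrams on the fly.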
if self.bucket == 0:
self.buckets_word = [np.array([], dtype=np.uint32)] * len(self.index_to_key)
return