From a48ec39346059d5f95db8d2a9103a9c7bc3b4cab Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 17:25:23 +0530 Subject: [PATCH 1/9] Update wikicorpus.py Let the users have metadata (e.g. title) if they need it. Added an argument in WikiCorpus __init__() to specify if metadata is needed. Previously, it was set to False and could not be toggled. --- gensim/corpora/wikicorpus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 8c3c94b5ff..dea4ad20d0 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -576,7 +576,7 @@ class WikiCorpus(TextCorpus): """ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, - token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None): + token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, metadata=False): """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, @@ -621,7 +621,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.fname = fname self.filter_namespaces = filter_namespaces self.filter_articles = filter_articles - self.metadata = False + self.metadata = metadata if processes is None: processes = max(1, multiprocessing.cpu_count() - 1) self.processes = processes From a133ea765c6b84248aeb29bd46f04bf0e572cb1e Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 17:43:23 +0530 Subject: [PATCH 2/9] Update wikicorpus.py Make Wikipedia corpus metadata accessible. 
--- gensim/corpora/wikicorpus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index dea4ad20d0..86a030d1d9 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -576,7 +576,8 @@ class WikiCorpus(TextCorpus): """ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, - token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, metadata=False): + token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, + metadata=False): """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, @@ -612,6 +613,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction If set, each XML article element will be passed to this callable before being processed. Only articles where the callable returns an XML element are processed, returning None allows filtering out some articles based on customised rules. + metadata: bool, optional + if True - write article titles to corpus Warnings -------- From 52b8ffca5d0264aef28c39a80fa0ac9ccd89360a Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:11:11 +0530 Subject: [PATCH 3/9] Update wikicorpus.py Allow users to access metadata by allowing self.metadata in WikiCorpus to be set by a parameter. However, Dictionary() raises "TypeError: decoding to str: need a bytes-like object, list found" if metadata is returned. So, introduced a dictionary_mode parameter in get_texts() so that metadata bypasses the dictionary, and goes directly to the user. 
--- gensim/corpora/wikicorpus.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 86a030d1d9..4fbef614f0 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -636,7 +636,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.lower = lower if dictionary is None: - self.dictionary = Dictionary(self.get_texts()) + self.dictionary = Dictionary(self.get_texts(dictionary_mode=True)) else: self.dictionary = dictionary @@ -644,7 +644,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction def input(self): return self.fname - def get_texts(self): + def get_texts(self, dictionary_mode=False): """Iterate over the dump, yielding a list of tokens for each article that passed the length and namespace filtering. @@ -654,6 +654,12 @@ def get_texts(self): ----- This iterates over the **texts**. If you want vectors, just use the standard corpus interface instead of this method: + + Parameters + ---------- + dictionary_mode : bool + If True, yields list of str. + If False, yield depends on self.metadata (see 'Yields' below). 
Examples -------- @@ -699,7 +705,7 @@ def get_texts(self): continue articles += 1 positions += len(tokens) - if self.metadata: + if self.metadata and not dictionary_mode: yield (tokens, (pageid, title)) else: yield tokens From c2a35c361da709bb88b52625c2f4df22d6e5efb6 Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:19:56 +0530 Subject: [PATCH 4/9] Update wikicorpus.py --- gensim/corpora/wikicorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 4fbef614f0..ad77e35c00 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -657,7 +657,7 @@ def get_texts(self, dictionary_mode=False): Parameters ---------- - dictionary_mode : bool + dictionary_mode : bool, optional If True, yields list of str. If False, yield depends on self.metadata (see 'Yields' below). From 1aaefb6508f8668bd6d6db94c9da1d37add3fec4 Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:27:03 +0530 Subject: [PATCH 5/9] Update wikicorpus.py --- gensim/corpora/wikicorpus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index ad77e35c00..5e19b82d44 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -706,6 +706,7 @@ def get_texts(self, dictionary_mode=False): articles += 1 positions += len(tokens) if self.metadata and not dictionary_mode: + print('yielding metadata') yield (tokens, (pageid, title)) else: yield tokens From 79ee20e3086cbe52b871b45ba4e5db71a2fc0cfc Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:30:57 +0530 Subject: [PATCH 6/9] Update wikicorpus.py --- gensim/corpora/wikicorpus.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 5e19b82d44..8c5df5fb62 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -705,10 +705,12 @@ def 
get_texts(self, dictionary_mode=False): continue articles += 1 positions += len(tokens) + print('dictionary_mode: ', dictionary_mode) if self.metadata and not dictionary_mode: print('yielding metadata') yield (tokens, (pageid, title)) else: + print('not yielding metadata') yield tokens except KeyboardInterrupt: From 151ce19525561356ca61d2336cccbde35d6de706 Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:37:13 +0530 Subject: [PATCH 7/9] Update wikicorpus.py --- gensim/corpora/wikicorpus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 8c5df5fb62..05f6167c26 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -634,7 +634,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.token_min_len = token_min_len self.token_max_len = token_max_len self.lower = lower - + + print('version updated') if dictionary is None: self.dictionary = Dictionary(self.get_texts(dictionary_mode=True)) else: From ce6ebfa8409ace8fdd35043bdccee84019daa7dd Mon Sep 17 00:00:00 2001 From: Kumar Neelabh Date: Thu, 26 Nov 2020 19:58:13 +0530 Subject: [PATCH 8/9] Update wikicorpus.py --- gensim/corpora/wikicorpus.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 05f6167c26..86afb39271 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -614,7 +614,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction where the callable returns an XML element are processed, returning None allows filtering out some articles based on customised rules. metadata: bool, optional - if True - write article titles to corpus + Whether to write article titles to serialized corpus. 
Warnings -------- @@ -635,7 +635,6 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.token_max_len = token_max_len self.lower = lower - print('version updated') if dictionary is None: self.dictionary = Dictionary(self.get_texts(dictionary_mode=True)) else: @@ -706,12 +705,9 @@ def get_texts(self, dictionary_mode=False): continue articles += 1 positions += len(tokens) - print('dictionary_mode: ', dictionary_mode) if self.metadata and not dictionary_mode: - print('yielding metadata') yield (tokens, (pageid, title)) else: - print('not yielding metadata') yield tokens except KeyboardInterrupt: From 92d64e820ab0148de9dce631a6d40dc641ee7a9e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 29 Jun 2021 16:05:45 +0900 Subject: [PATCH 9/9] Apply suggestions from code review --- gensim/corpora/wikicorpus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 86afb39271..54c721eed2 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -634,7 +634,6 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction self.token_min_len = token_min_len self.token_max_len = token_max_len self.lower = lower - if dictionary is None: self.dictionary = Dictionary(self.get_texts(dictionary_mode=True)) else: