From a48ec39346059d5f95db8d2a9103a9c7bc3b4cab Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 17:25:23 +0530
Subject: [PATCH 1/9] Update wikicorpus.py

Let users obtain article metadata (e.g. title) if they need it. Added a `metadata` argument to WikiCorpus.__init__() to specify whether metadata is needed. Previously, self.metadata was hard-coded to False and could not be toggled through the constructor.
---
 gensim/corpora/wikicorpus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 8c3c94b5ff..dea4ad20d0 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -576,7 +576,7 @@ class WikiCorpus(TextCorpus):
     """
     def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
+                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, metadata=False):
         """Initialize the corpus.
 
         Unless a dictionary is provided, this scans the corpus once,
@@ -621,7 +621,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
-        self.metadata = False
+        self.metadata = metadata
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes

From a133ea765c6b84248aeb29bd46f04bf0e572cb1e Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 17:43:23 +0530
Subject: [PATCH 2/9] Update wikicorpus.py

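Make Wikipedia corpus metadata accessible: document the new `metadata` argument in the __init__() docstring and wrap the over-long signature line.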
---
 gensim/corpora/wikicorpus.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index dea4ad20d0..86a030d1d9 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -576,7 +576,8 @@ class WikiCorpus(TextCorpus):
     """
     def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, metadata=False):
+                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, 
+                 metadata=False):
         """Initialize the corpus.
 
         Unless a dictionary is provided, this scans the corpus once,
@@ -612,6 +613,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
             If set, each XML article element will be passed to this callable before being processed. Only articles
             where the callable returns an XML element are processed, returning None allows filtering out
             some articles based on customised rules.
+        metadata: bool, optional
+                if True - write article titles to corpus
 
         Warnings
         --------

From 52b8ffca5d0264aef28c39a80fa0ac9ccd89360a Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:11:11 +0530
Subject: [PATCH 3/9] Update wikicorpus.py

Allow users to access metadata by letting self.metadata in WikiCorpus be set by a parameter. However, Dictionary() raises "TypeError: decoding to str: need a bytes-like object, list found" when get_texts() yields (tokens, metadata) tuples instead of plain token lists. So, introduce a dictionary_mode parameter in get_texts(): when it is True, only the tokens are yielded, so the dictionary can still be built with metadata enabled, while regular callers continue to receive the metadata.
---
 gensim/corpora/wikicorpus.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 86a030d1d9..4fbef614f0 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -636,7 +636,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.lower = lower
 
         if dictionary is None:
-            self.dictionary = Dictionary(self.get_texts())
+            self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
         else:
             self.dictionary = dictionary
 
@@ -644,7 +644,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
     def input(self):
         return self.fname
 
-    def get_texts(self):
+    def get_texts(self, dictionary_mode=False):
         """Iterate over the dump, yielding a list of tokens for each article that passed
         the length and namespace filtering.
 
@@ -654,6 +654,12 @@ def get_texts(self):
         -----
         This iterates over the **texts**. If you want vectors, just use the standard corpus interface
         instead of this method:
+        
+        Parameters
+        ----------
+        dictionary_mode : bool
+            If True, yields list of str.
+            If False, yield depends on self.metadata (see 'Yields' below).
 
         Examples
         --------
@@ -699,7 +705,7 @@ def get_texts(self):
                         continue
                     articles += 1
                     positions += len(tokens)
-                    if self.metadata:
+                    if self.metadata and not dictionary_mode:
                         yield (tokens, (pageid, title))
                     else:
                         yield tokens

From c2a35c361da709bb88b52625c2f4df22d6e5efb6 Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:19:56 +0530
Subject: [PATCH 4/9] Update wikicorpus.py

---
 gensim/corpora/wikicorpus.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 4fbef614f0..ad77e35c00 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -657,7 +657,7 @@ def get_texts(self, dictionary_mode=False):
         
         Parameters
         ----------
-        dictionary_mode : bool
+        dictionary_mode : bool, optional
             If True, yields list of str.
             If False, yield depends on self.metadata (see 'Yields' below).
 

From 1aaefb6508f8668bd6d6db94c9da1d37add3fec4 Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:27:03 +0530
Subject: [PATCH 5/9] Update wikicorpus.py

---
 gensim/corpora/wikicorpus.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index ad77e35c00..5e19b82d44 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -706,6 +706,7 @@ def get_texts(self, dictionary_mode=False):
                     articles += 1
                     positions += len(tokens)
                     if self.metadata and not dictionary_mode:
+                        print('yielding metadata')
                         yield (tokens, (pageid, title))
                     else:
                         yield tokens

From 79ee20e3086cbe52b871b45ba4e5db71a2fc0cfc Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:30:57 +0530
Subject: [PATCH 6/9] Update wikicorpus.py

---
 gensim/corpora/wikicorpus.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 5e19b82d44..8c5df5fb62 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -705,10 +705,12 @@ def get_texts(self, dictionary_mode=False):
                         continue
                     articles += 1
                     positions += len(tokens)
+                    print('dictionary_mode: ', dictionary_mode)
                     if self.metadata and not dictionary_mode:
                         print('yielding metadata')
                         yield (tokens, (pageid, title))
                     else:
+                        print('not yielding metadata')
                         yield tokens
 
         except KeyboardInterrupt:

From 151ce19525561356ca61d2336cccbde35d6de706 Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:37:13 +0530
Subject: [PATCH 7/9] Update wikicorpus.py

---
 gensim/corpora/wikicorpus.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 8c5df5fb62..05f6167c26 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -634,7 +634,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-
+        
+        print('version updated')
         if dictionary is None:
             self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
         else:

From ce6ebfa8409ace8fdd35043bdccee84019daa7dd Mon Sep 17 00:00:00 2001
From: Kumar Neelabh <neelabh.plus@gmail.com>
Date: Thu, 26 Nov 2020 19:58:13 +0530
Subject: [PATCH 8/9] Update wikicorpus.py

---
 gensim/corpora/wikicorpus.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 05f6167c26..86afb39271 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -614,7 +614,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
             where the callable returns an XML element are processed, returning None allows filtering out
             some articles based on customised rules.
         metadata: bool, optional
-                if True - write article titles to corpus
+                Whether to write article titles to the serialized corpus.
 
         Warnings
         --------
@@ -635,7 +635,6 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_max_len = token_max_len
         self.lower = lower
         
-        print('version updated')
         if dictionary is None:
             self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
         else:
@@ -706,12 +705,9 @@ def get_texts(self, dictionary_mode=False):
                         continue
                     articles += 1
                     positions += len(tokens)
-                    print('dictionary_mode: ', dictionary_mode)
                     if self.metadata and not dictionary_mode:
-                        print('yielding metadata')
                         yield (tokens, (pageid, title))
                     else:
-                        print('not yielding metadata')
                         yield tokens
 
         except KeyboardInterrupt:

From 92d64e820ab0148de9dce631a6d40dc641ee7a9e Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Tue, 29 Jun 2021 16:05:45 +0900
Subject: [PATCH 9/9] Apply suggestions from code review

---
 gensim/corpora/wikicorpus.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 86afb39271..54c721eed2 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -634,7 +634,6 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-        
         if dictionary is None:
             self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
         else: