Make Wikipedia corpus metadata accessible #3007

Open · wants to merge 9 commits into base: develop

Changes from 5 commits
20 changes: 15 additions & 5 deletions gensim/corpora/wikicorpus.py
@@ -576,7 +576,8 @@ class WikiCorpus(TextCorpus):
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
- token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
+ token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None,
+ metadata=False):
"""Initialize the corpus.

Unless a dictionary is provided, this scans the corpus once,
@@ -612,6 +613,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
If set, each XML article element will be passed to this callable before being processed. Only articles
where the callable returns an XML element are processed, returning None allows filtering out
some articles based on customised rules.
+ metadata: bool, optional
+     if True - write article titles to corpus
Owner:

What "corpus"? Please make the docstring more explicit, less cryptic (and properly capitalized and punctuated, like the others).

Owner:

You can edit PRs in place – no need to open a new PR for each change.

Author (@kumarneelabh13, Nov 26, 2020):

Thanks for being patient with my first PR (ever!).
Updated documentation. Updated PR description.

> What "corpus"? Please make the docstring more explicit, less cryptic (and properly capitalized and punctuated, like the others).

- Updated documentation (copied an existing comment describing the 'metadata' parameter).


Warnings
--------
@@ -621,7 +624,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
self.fname = fname
self.filter_namespaces = filter_namespaces
self.filter_articles = filter_articles
- self.metadata = False
+ self.metadata = metadata
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
@@ -633,15 +636,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
self.lower = lower

if dictionary is None:
- self.dictionary = Dictionary(self.get_texts())
+ self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
else:
self.dictionary = dictionary

@property
def input(self):
return self.fname

- def get_texts(self):
+ def get_texts(self, dictionary_mode=False):
"""Iterate over the dump, yielding a list of tokens for each article that passed
the length and namespace filtering.

@@ -651,6 +654,12 @@ def get_texts(self):
-----
This iterates over the **texts**. If you want vectors, just use the standard corpus interface
instead of this method:

+ Parameters
+ ----------
+ dictionary_mode : bool, optional
+     If True, always yield plain token lists (lists of str), ignoring `self.metadata`; used internally while building the dictionary.
+     If False, what is yielded depends on `self.metadata` (see 'Yields' below).

Examples
--------
@@ -696,7 +705,8 @@ def get_texts(self):
continue
articles += 1
positions += len(tokens)
- if self.metadata:
+ if self.metadata and not dictionary_mode:
+     print('yielding metadata')
yield (tokens, (pageid, title))
else:
yield tokens
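The pattern this diff introduces — the `Dictionary` is always built from plain token lists via `dictionary_mode=True`, while `metadata=True` makes ordinary `get_texts()` calls yield `(tokens, (pageid, title))` tuples — can be sketched with a self-contained toy class. `ToyCorpus` is illustrative only, not part of gensim:

```python
class ToyCorpus:
    """Toy stand-in for WikiCorpus, showing the metadata / dictionary_mode split."""

    def __init__(self, articles, metadata=False):
        # articles: iterable of (pageid, title, tokens) triples.
        self.articles = list(articles)
        self.metadata = metadata
        # The vocabulary must be built from plain token lists, so the
        # constructor requests dictionary_mode=True even when metadata=True.
        self.vocab = {tok
                      for tokens in self.get_texts(dictionary_mode=True)
                      for tok in tokens}

    def get_texts(self, dictionary_mode=False):
        for pageid, title, tokens in self.articles:
            if self.metadata and not dictionary_mode:
                # Same branch the PR adds to WikiCorpus.get_texts.
                yield (tokens, (pageid, title))
            else:
                yield tokens


wiki = ToyCorpus([(1, "Apple", ["fruit", "tree"])], metadata=True)
print(next(iter(wiki.get_texts())))   # (['fruit', 'tree'], (1, 'Apple'))
print(sorted(wiki.vocab))             # ['fruit', 'tree']
```

With the patch applied, the user-facing call would presumably be `WikiCorpus('enwiki-latest-pages-articles.xml.bz2', metadata=True)` (dump filename hypothetical): iterating `get_texts()` then yields `(tokens, (pageid, title))` tuples directly, instead of requiring users to flip `wiki.metadata = True` by hand after construction as before this PR.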