Add Manticore Search to the BEIR benchmarks #92

Open · wants to merge 1 commit into base: main
3 changes: 2 additions & 1 deletion beir/retrieval/evaluation.py
@@ -4,14 +4,15 @@
 from .search.dense import DenseRetrievalExactSearch as DRES
 from .search.dense import DenseRetrievalFaissSearch as DRFS
 from .search.lexical import BM25Search as BM25
+from .search.lexical.manticore_search import ManticoreLexicalSearch
 from .search.sparse import SparseSearch as SS
 from .custom_metrics import mrr, recall_cap, hole, top_k_accuracy

 logger = logging.getLogger(__name__)

 class EvaluateRetrieval:

-    def __init__(self, retriever: Union[Type[DRES], Type[DRFS], Type[BM25], Type[SS]] = None, k_values: List[int] = [1,3,5,10,100,1000], score_function: str = "cos_sim"):
+    def __init__(self, retriever: Union[Type[DRES], Type[DRFS], Type[BM25], Type[SS], ManticoreLexicalSearch] = None, k_values: List[int] = [1,3,5,10,100,1000], score_function: str = "cos_sim"):
         self.k_values = k_values
         self.top_k = max(k_values)
         self.retriever = retriever
107 changes: 107 additions & 0 deletions beir/retrieval/search/lexical/manticore_search.py
@@ -0,0 +1,107 @@
import time
import manticoresearch
from json import dumps
from tqdm import tqdm
from typing import Dict
from wasabi import msg
from urllib.parse import quote


class ManticoreLexicalSearch:

    ESC_CHARS = ['\\', "'", '!', '"', '$', '(', ')', '-', '/', '<', '@', '^', '|', '~']

    def __init__(
        self,
        index_name: str,
        host: str,
        store_indexes: bool = False
    ):
        self.store_indexes = store_indexes
        # Escape special characters in the index name
        for ch in self.ESC_CHARS:
            index_name = index_name.replace(ch, '_')
        self.index_name = "beir_benchmark_" + index_name
        # Initialize the Manticore client and create the benchmark index
        with manticoresearch.ApiClient( manticoresearch.Configuration(host=host) ) as api_client:
            self.__index_api = manticoresearch.IndexApi(api_client)
            self.__utils_api = manticoresearch.UtilsApi(api_client)
            body = quote("CREATE TABLE IF NOT EXISTS " + self.index_name +
                "(_id string, title text, body text) stopwords='en' stopwords_unstemmed='1'" +
                " html_strip='1' morphology='lemmatize_en_all' index_exact_words='1' index_field_lengths='1' ")
            self.__utils_api.sql(body)


    def clear(self):
        # Drop the existing benchmark index
        self.__utils_api.sql( quote("DROP TABLE IF EXISTS " + self.index_name) )


    def __index_exists(self) -> bool:
        # The table itself is always created in __init__, so this effectively
        # checks whether it has already been populated with documents
        req = "SELECT 1 FROM " + self.index_name + " LIMIT 1"
        resp = self.__utils_api.sql(req, raw_response=False)
        return bool(resp['hits']['hits'])


    def __prepare_query(self, query: str) -> str:
        # Escape special characters and convert the query to 'or' search mode
        for ch in self.ESC_CHARS:
            if ch == "'":
                repl = '\\'
            elif ch == '\\':
                repl = '\\\\\\'
            else:
                repl = '\\\\'
            query = query.replace(ch, repl + ch)
        if query.endswith('='):
            query = query[:-1] + '\\\\='
        # The /1 quorum suffix lets the quoted phrase match on any single term
        return '"{}"/1'.format(query)


    def index(self, corpus: Dict[str, Dict[str, str]], batch_size: int = 10000):
        msg.info("Indexing:")
        docs = list( corpus.items() )
        for i in range(0, len(corpus), batch_size):
            index_docs = [ {
                "insert": {
                    "index": self.index_name,
                    "doc": {
                        "_id": str(doc_id),
                        "title": doc["title"],
                        "body": doc["text"],
                    }
                }
            } for doc_id, doc in docs[i:i + batch_size] ]
            msg.info( "Batch {} with {} docs".format(i//batch_size + 1, len(index_docs)) )
            self.__index_api.bulk( '\n'.join( map(dumps, index_docs) ) )
        # Flush the RAM chunk to disk and give the flush time to settle, then
        # merge the index into a single disk chunk (cutoff=1), waiting for the
        # merge to complete (sync=1)
        self.__utils_api.sql("FLUSH RAMCHUNK " + self.index_name)
        time.sleep(5)
        self.__utils_api.sql( quote("OPTIMIZE INDEX " + self.index_name + " OPTION cutoff=1, sync=1; ") )


    def search(
        self,
        corpus: Dict[str, Dict[str, str]],
        queries: Dict[str, str],
        top_k: int,
        *args,
        **kwargs
    ) -> Dict[str, Dict[str, float]]:
        results = {}
        # Rank with BM25F over the title and body fields and return up to
        # top_k matches per query
        req_tmpl = "SELECT _id, WEIGHT() as w FROM " + self.index_name + " WHERE MATCH('@(title,body){}') " \
            "LIMIT " + str(top_k) + " OPTION ranker=expr('10000 * bm25f(1.2,0.75)'), " \
            "idf='plain,tfidf_unnormalized', max_matches=" + str(top_k)
        if not self.__index_exists():
            self.index(corpus)
        msg.info("Evaluating:")
        for qid, query in tqdm( queries.items() ):
            req = req_tmpl.format( self.__prepare_query(query) )
            resp = self.__utils_api.sql(req, raw_response=False)
            query_docs = { doc['_source']['_id']: doc['_source']['w']
                for doc in resp['hits']['hits'] if doc['_source']['w'] }
            if query_docs:
                results[qid] = query_docs
        if not self.store_indexes:
            self.clear()
        return results
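
For reviewers, here is a standalone restatement of the query transformation implemented by __prepare_query above, with a couple of worked inputs. This is an illustration-only sketch: the function mirrors the escaping loop and ESC_CHARS from the class, and the expected outputs in the comments are my reading of the code rather than output captured from a live server.

ESC_CHARS = ['\\', "'", '!', '"', '$', '(', ')', '-', '/', '<', '@', '^', '|', '~']

def prepare_query(query: str) -> str:
    # Mirror of ManticoreLexicalSearch.__prepare_query, for illustration
    for ch in ESC_CHARS:
        if ch == "'":
            repl = '\\'
        elif ch == '\\':
            repl = '\\\\\\'
        else:
            repl = '\\\\'
        query = query.replace(ch, repl + ch)
    if query.endswith('='):
        query = query[:-1] + '\\\\='
    return '"{}"/1'.format(query)

# A plain query is only quoted and given the /1 quorum suffix, which lets
# the phrase match on any single term ('or' mode):
print(prepare_query("what is BM25"))  # "what is BM25"/1
# Characters special to Manticore's query syntax get backslash-escaped:
print(prepare_query("rock-n-roll"))   # "rock\\-n\\-roll"/1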

156 changes: 156 additions & 0 deletions examples/retrieval/evaluation/lexical/evaluate_manticore_bm25.py
@@ -0,0 +1,156 @@
"""
This example shows how to evaluate Manticore BM25f model in BEIR.
To install and run Manticore server on your local machine, follow the instruction from this manual -
https://manual.manticoresearch.com/Installation

The code doesn't require GPU to run.

Usage: python evaluate_manticore_bm25.py
:option --data_dir -d: A folder path for downloaded dataset files.
:option --dataset-name -n: A dataset(s) to be used in the benchmark.
:option --host -h: Hostname and port your Manticore server is running on, e.g. localhost:9308
:option --outfile -o: Filepath to save benchmarking results
:option --store-datasets: Store downloaded dataset files after benchmarking is completed.
:option --store-indexes: Store created indexes after benchmarking is completed.
"""

import os
import shutil
import typer
from beir import util
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical.manticore_search import ManticoreLexicalSearch
from beir.retrieval.evaluation import EvaluateRetrieval
from pandas import DataFrame
from typing import List, Optional
from wasabi import msg


def load_datasets( data_dir: str, dataset_names: List[str] ) -> List[str]:
    """
    Download the necessary datasets

    :param data_dir: A folder path for downloaded files.
    :param dataset_names: A list of dataset names to be used in the benchmark.
    :return: A list of filepaths to the downloaded datasets.
    """
    print("Loading datasets:")
    url_tmpl = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip"
    data_paths = [
        util.download_and_unzip( url_tmpl.format(name), data_dir ) for name in dataset_names
    ]
    print("Done")
    return data_paths


def remove_datasets( data_dir: str, dataset_names: List[str] ):
    """
    Remove downloaded datasets

    :param data_dir: A folder path for downloaded dataset files.
    :param dataset_names: A list of dataset names to be removed.
    """
    for name in dataset_names:
        shutil.rmtree( os.path.join(data_dir, name) )
        os.remove( os.path.join(data_dir, name + ".zip") )


def save_results(metrics: List, outfile: str):
    """
    Save calculated metrics data

    :param metrics: A list of calculated metrics.
    :param outfile: Path to the output CSV file.
    """
    # Append to an existing file, or create a new one with a header row
    file_exists = os.path.isfile(outfile)
    DataFrame.from_records(metrics).to_csv(
        outfile, mode="a" if file_exists else "w", header=not file_exists, index=False
    )
    savepath = os.path.join( os.getcwd(), outfile )
    msg.good("Benchmarking results are saved to " + savepath)


def benchmark(
    data_dir: str = typer.Option( os.getcwd(), '--data-dir', '-d'),
    dataset_names: List[str] = typer.Option( [
        "msmarco",
        "scifact",
        "trec-covid",
        "nfcorpus",
        "nq",
        "fiqa",
        "arguana",
        "webis-touche2020",
        "dbpedia-entity",
        "scidocs",
        "fever",
        "climate-fever",
        "hotpotqa",
    ], '--dataset-name', '-n' ),
    host: str = typer.Option( "http://localhost:9308", '--host', '-h' ),
    outfile: Optional[str] = typer.Option(None, '--outfile', '-o'),
    store_datasets: bool = False,
    store_indexes: bool = False,
):
"""
Benchmark Manticore BM25 search relevance across a collection of BEIR datasets.

:param data_dir: A folder path for downloaded files. By default, set to the current script's folder.

:param dataset_names: A list of dataset names to be used in the benchmark. By default,
all the datasets available for download from the BEIR's leaderboard are used.

:param host: Hostname and port your Manticore server is running on. By default,
set to http://localhost:9308

:param store_datasets: Store downloaded dataset files after benchmarking is completed. By default,
set to False.

:param store_indexes: Store created indexes after benchmarking is completed. By default,
set to False.

:param outfile: File to save benchmark results. By default, set to None
"""
print("Benchmarking is started\n")
metrics = []
data_pathes = load_datasets(data_dir, dataset_names)
for i,name in enumerate(dataset_names):
print("\nDataset " + name + ":")
# Create an evaluation model for Manticore search
model = ManticoreLexicalSearch(
index_name=name,
host=host,
store_indexes=store_indexes,
)
# Msmarco is the only dataset using "dev" set for its evaluation
split_type = 'dev' if name == 'msmarco' else 'test'
# Extract corpus, queries and qrels from dataset.
corpus, queries, qrels = GenericDataLoader( data_pathes[i] ).load(split=split_type)
# Performing evaluations with the set of metrics given( NDCG and so on )
retriever = EvaluateRetrieval(model)
results = retriever.retrieve(corpus, queries)
ndcg, _map, recall, precision = retriever.evaluate(
qrels, results, retriever.k_values
)
metric = {"Dataset": name}
metric.update(ndcg)
metric.update(_map)
metric.update(recall)
metric.update(precision)
metrics.append(metric)
if not store_datasets:
remove_datasets(data_dir, dataset_names)
# Output benchmark results
if outfile is not None:
save_results(metrics, outfile)
print( "\n" + DataFrame(data=metrics).to_markdown(tablefmt='grid') + "\n" )
msg.good("Benchmarking is successfully finished")


if __name__ == "__main__":
    typer.run(benchmark)
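
For a quick review smoke test, the script can also be driven programmatically instead of through the CLI. A sketch under the assumption that a Manticore server is listening on localhost:9308; all arguments are passed explicitly because the typer defaults are only resolved when the function is invoked via typer.run:

from evaluate_manticore_bm25 import benchmark

# Roughly equivalent to:
#   python evaluate_manticore_bm25.py -d ./beir_data -n scifact -o results.csv
benchmark(
    data_dir="./beir_data",
    dataset_names=["scifact"],  # a small dataset keeps the smoke test short
    host="http://localhost:9308",
    outfile="results.csv",
    store_datasets=False,
    store_indexes=False,
)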

3 changes: 2 additions & 1 deletion setup.py
@@ -4,7 +4,8 @@
 readme = readme_file.read()

 optional_packages = {
-    "tf" : ['tensorflow>=2.2.0', 'tensorflow-text', 'tensorflow-hub']
+    "tf" : ['tensorflow>=2.2.0', 'tensorflow-text', 'tensorflow-hub'],
+    "manticore": ['manticoresearch==1.0.6', 'pandas', 'typer', 'wasabi']
 }

 setup(
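
With this extra in place, the benchmark dependencies should be installable alongside BEIR via pip's extras syntax, e.g. pip install -e ".[manticore]" from a source checkout; the default BEIR installation is unaffected.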
35 changes: 35 additions & 0 deletions tests/test_retrieval_lexical_manticore_bm25.py
@@ -0,0 +1,35 @@
import unittest
from beir.retrieval.search.lexical.manticore_search import ManticoreLexicalSearch
from beir.retrieval.evaluation import EvaluateRetrieval


class TestManticoreLexicalSearch(unittest.TestCase):

    def setUp(self) -> None:
        self.application_name = "Manticore_test"
        self.corpus = {
            "1": {"title": "this is a title for query 1", "text": "this is a text for query 1"},
            "2": {"title": "this is a title for query 2", "text": "this is a text for query 2"},
            "3": {"title": "this is a title for query 3", "text": "this is a text for query 3"},
        }
        self.queries = {"1": "this is query 1", "2": "this is query 2"}


    def test_or_bm25(self):
        self.model = ManticoreLexicalSearch("test", 'http://localhost:9308')
        retriever = EvaluateRetrieval(self.model)
        results = retriever.retrieve(corpus=self.corpus, queries=self.queries)
        self.assertEqual(
            {"1", "2"},
            set( results.keys() )
        )
        for query_id in results.keys():
            self.assertEqual(
                {"1", "2", "3"},
                set( results[query_id].keys() )
            )


    def tearDown(self) -> None:
        self.model.clear()
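
Note for reviewers: this is effectively an integration test, as it talks to a live server. A sketch of running just this case, assuming Manticore is listening on http://localhost:9308:

import unittest

# Load and run only the Manticore test case; it will error out
# if no Manticore server is reachable on localhost:9308
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.test_retrieval_lexical_manticore_bm25"
)
unittest.TextTestRunner(verbosity=2).run(suite)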