Draft: Baseline Testing #121

Draft pull request: wants to merge 8 commits into base: maumueller/framework
8 changes: 5 additions & 3 deletions benchmark/datasets.py
@@ -366,8 +366,8 @@ def __init__(self, nb_M=1000):
self.basedir = os.path.join(BASEDIR, "text2image1B")

self.private_nq = 30000
-self.private_qs_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/text2image1b/query.heldout.30K.fbin"
-self.private_gt_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp21/text2image1b/gt100-heldout.30K.fbin"
+self.private_qs_url = "https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/query.heldout.30K.fbin"
+self.private_gt_url = "https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/gt100-heldout.30K.fbin"

self.private_nq_large = 1000000
self.private_qs_large_url = "https://storage.yandexcloud.net/yr-secret-share/ann-datasets-5ac0659e27/T2I/query.private.1M.fbin"
@@ -871,7 +871,9 @@ def prepare(self, skip_data=False):

n_neighbors = 100

-nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", algorithm='brute').fit(data[:self.nb // 2])
+# TODO: This code may run out of memory. The workaround right now is to decrease the size, but the real
+# TODO: solution would be to use a batched version of the ground-truth computation.
+nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", algorithm='brute').fit(data[:self.nb // 6])
DD, II = nbrs.kneighbors(queries[self.nq // 2:])

nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", algorithm='brute').fit(data[self.nb // 2: ])
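The TODO above asks for a batched ground-truth computation. A minimal sketch of what that could look like (the function name and batch size are hypothetical, not part of this PR): brute-force `kneighbors` materializes an (n_queries × n_data) distance block, so splitting the query set bounds peak memory by the batch size.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def batched_ground_truth(data, queries, n_neighbors=100, batch_size=10_000):
    """Compute exact nearest neighbors batch-by-batch over the queries.

    The fit is cheap for algorithm='brute' (it just stores the data); the
    memory-heavy step is kneighbors, which we run on query slices so that
    peak memory scales with batch_size rather than len(queries).
    """
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean",
                            algorithm="brute").fit(data)
    DD, II = [], []
    for start in range(0, len(queries), batch_size):
        D, I = nbrs.kneighbors(queries[start:start + batch_size])
        DD.append(D)
        II.append(I)
    return np.vstack(DD), np.vstack(II)
```

The same slicing idea would also cover the two half-dataset fits above: compute against each half in query batches and merge the results.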
2 changes: 2 additions & 0 deletions benchmark/main.py
@@ -47,6 +47,7 @@ def run_worker(args, queue):
cpu_limit = "0-%d" % (multiprocessing.cpu_count() - 1)

if args.nodocker:
+print("Launching without docker...")
run_no_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
@@ -55,6 +56,7 @@
args.private_query, args.neurips23track)

else:
+print("Launching with docker...")
run_docker(definition, args.dataset, args.count,
args.runs, args.timeout, args.rebuild, cpu_limit, mem_limit,
args.t3, args.power_capture,
43 changes: 43 additions & 0 deletions m1/Dockerfile.all
@@ -0,0 +1,43 @@
FROM ubuntu:jammy

RUN apt-get update && apt-get install -y python3-numpy python3-scipy python3-pip build-essential git axel wget

RUN wget https://aka.ms/downloadazcopy-v10-linux && mv downloadazcopy-v10-linux azcopy.tgz && tar xzf azcopy.tgz --transform 's!^[^/]\+\($\|/\)!azcopy_folder\1!'
RUN cp azcopy_folder/azcopy /usr/bin

RUN pip3 install -U pip

WORKDIR /home/app
COPY requirements_py3.10.txt run_algorithm.py ./
RUN pip3 install -r requirements_py3.10.txt

ENTRYPOINT ["python3", "-u", "run_algorithm.py"]

##

RUN apt update && apt install -y wget swig
RUN wget https://repo.anaconda.com/archive/Anaconda3-2023.03-0-Linux-x86_64.sh
RUN bash Anaconda3-2023.03-0-Linux-x86_64.sh -b

ENV PATH /root/anaconda3/bin:$PATH
ENV CONDA_PREFIX /root/anaconda3/

RUN conda install -c pytorch faiss-cpu
COPY requirements_conda.txt ./

# conda doesn't like some of our packages, use pip
RUN python3 -m pip install -r requirements_conda.txt

COPY bow_id_selector.swig ./

RUN swig -c++ -python -I$CONDA_PREFIX/include -Ifaiss bow_id_selector.swig
RUN g++ -m64 -shared -O3 -g -fPIC bow_id_selector_wrap.cxx -o _bow_id_selector.so \
-I $( python -c "import distutils.sysconfig ; print(distutils.sysconfig.get_python_inc())" ) \
-I $CONDA_PREFIX/include $CONDA_PREFIX/lib/libfaiss.so -Ifaiss

## AVX version - -I $CONDA_PREFIX/include $CONDA_PREFIX/lib/libfaiss_avx2.so -Ifaiss

RUN python3 -c 'import faiss; print(faiss.IndexFlatL2); print(faiss.__version__)'



13 changes: 13 additions & 0 deletions m1/README.md
@@ -0,0 +1,13 @@

My notes for running on M1:

General troubleshooting:
* upgrade to a recent docker desktop
* install docker AFTER installing developer support
* make sure docker python package is at right version
* make sure docker-py is NOT installed (if so, uninstall it and reinstall the docker package)

faiss troubleshooting:
* QEMU may not be emulating AVX, so switch to building libfaiss.so instead of libfaiss_avx2.so
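A quick diagnostic for the docker-py point above (my own sketch, not part of the repo): the legacy docker-py package conflicts with the modern docker SDK, so list whatever is installed before reinstalling.

```shell
# List any installed Docker SDK packages; docker-py is the legacy name and
# shadows the modern `docker` package. If both appear, uninstall both and
# reinstall `docker` alone (pip3 uninstall -y docker-py docker && pip3 install docker).
out=$(pip3 list 2>/dev/null | grep -i '^docker' || echo "no docker SDK package found")
echo "$out"
```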


15 changes: 15 additions & 0 deletions m1/build.sh
@@ -0,0 +1,15 @@
#!/bin/bash

set -x
set -e

#export DOCKER_BUILDKIT=0

# all
# no platform flag: docker build --progress=plain -f ./Dockerfile.all -t neurips23-filter-faiss .

docker build --progress=plain --platform linux/amd64 -f ./Dockerfile.all -t neurips23-filter-faiss .

# try arm64 docker build --progress=plain --platform linux/arm64 -f ./Dockerfile.all -t neurips23-filter-faiss .


27 changes: 27 additions & 0 deletions m1/test.sh
@@ -0,0 +1,27 @@
#!/bin/bash

set -e
set -x

## faiss sparse

# Launch with just bash
#cd .. && docker run --entrypoint /bin/bash --platform linux/amd64 -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-filter-faiss

## after bash: python3 -m run_algorithm.py --dataset random-filter-s --algorithm faiss --module neurips23.filter.faiss.faiss --constructor FAISS --runs 1 --count 10 --neurips23track filter '["euclidean", {"indexkey": "IVF1024,SQ8"}]' '[{"nprobe": 1}]' '[{"nprobe": 2}]' '[{"nprobe": 4}]'

# WORKS! -
cd .. && docker run --platform linux/amd64 -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-filter-faiss --dataset random-filter-s --algorithm faiss --module neurips23.filter.faiss.faiss --constructor FAISS --runs 1 --count 10 --neurips23track filter '["euclidean", {"indexkey": "IVF1024,SQ8"}]' '[{"nprobe": 1}]' '[{"nprobe": 2}]' '[{"nprobe": 4}]'

# WITH TRACE! - cd .. && docker run -e QEMU_STRACE=1 --platform linux/amd64 -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-filter-faiss --dataset random-filter-s --algorithm faiss --module neurips23.filter.faiss.faiss --constructor FAISS --runs 1 --count 10 --neurips23track filter '["euclidean", {"indexkey": "IVF1024,SQ8"}]' '[{"nprobe": 1}]' '[{"nprobe": 2}]' '[{"nprobe": 4}]'

## linscan

#args ['--dataset', 'sparse-small', '--algorithm', 'linscan', '--module', 'neurips23.sparse.linscan.linscan', '--constructor', 'Linscan', '--runs', '5', '--count', '10', '--neurips23track', 'sparse', '["ip", {}]', '[{"budget": 1}]', '[{"budget": 0.5}]', '[{"budget": 0.4}]', '[{"budget": 0.3}]', '[{"budget": 0.25}]', '[{"budget": 0.2}]', '[{"budget": 0.15}]', '[{"budget": 0.1}]', '[{"budget": 0.075}]', '[{"budget": 0.05}]']

# platform flag - cd .. && docker run --platform linux/amd64 -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-sparse-linscan '--dataset' 'sparse-small' '--algorithm' 'linscan' '--module' 'neurips23.sparse.linscan.linscan' '--constructor' 'Linscan' '--runs' '5' '--count' '10' '--neurips23track' 'sparse' '["ip", {}]' '[{"budget": 1}]', '[{"budget": 0.5}]' '[{"budget": 0.4}]' '[{"budget": 0.3}]' '[{"budget": 0.25}]', '[{"budget": 0.2}]' '[{"budget": 0.15}]' '[{"budget": 0.1}]' '[{"budget": 0.075}]' '[{"budget": 0.05}]'

# WORKS! - cd .. && docker run -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-sparse-linscan '--dataset' 'sparse-small' '--algorithm' 'linscan' '--module' 'neurips23.sparse.linscan.linscan' '--constructor' 'Linscan' '--runs' '5' '--count' '10' '--neurips23track' 'sparse' '["ip", {}]' '[{"budget": 1}]' '[{"budget": 0.5}]' '[{"budget": 0.4}]' '[{"budget": 0.3}]' '[{"budget": 0.25}]' '[{"budget": 0.2}]' '[{"budget": 0.15}]' '[{"budget": 0.1}]' '[{"budget": 0.075}]' '[{"budget": 0.05}]'

# sparse linscan baseline
docker run -it -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/benchmark:/home/app/benchmark -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/data:/home/app/data -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/results:/home/app/results -v /Users/cuongwilliams/Projects/BigANN/big-ann-benchmarks/neurips23:/home/app/neurips23 neurips23-sparse-linscan '--dataset' 'sparse-full' '--algorithm' 'linscan' '--module' 'neurips23.sparse.linscan.linscan' '--constructor' 'Linscan' '--runs' '5' '--count' '10' '--neurips23track' 'sparse' '["ip", {}]' '[{"budget": 5}]' '[{"budget": 15}]' '[{"budget": 35}]' '[{"budget": 50}]' '[{"budget": 52.5}]' '[{"budget": 55}]' '[{"budget": 57.5}]' '[{"budget": 60}]' '[{"budget": 90}]' '[{"budget": 500}]'
1 change: 1 addition & 0 deletions requirements_py3.10.txt
@@ -9,3 +9,4 @@ scipy==1.10.1
scikit-learn
jinja2==3.1.2
pandas==2.0.0
+urllib3<=2