diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index b6a6412d..583a14e9 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -36,6 +36,12 @@ jobs: - algorithm: httpann_example dataset: random-range-xs library: httpann_example + - algorithm: ngt-t1 + dataset: random-range-xs + library: ngt + - algorithm: ngt-t2 + dataset: random-range-xs + library: ngt - algorithm : kst_ann_t1 dataset: random-xs library: kst_ann_t1 diff --git a/algos.yaml b/algos.yaml index ff406b7a..9c387639 100644 --- a/algos.yaml +++ b/algos.yaml @@ -228,6 +228,30 @@ random-xs: args: - [ 0.2, 0.8, 1.0 ] query-args: [ ] + ngt-t1: + # ngt-t1: random-xs + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 10, "s": 600, "b": 10000, "rs": 1000, "ri": 100, "rx": 10}] + query-args: | + [{"epsilon":0.10, "edge":0, "blob":500}] + ngt-t2: + # ngt-t2: random-xs + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 10, "s": 600, "b": 10000, "rs": 1000, "ri": 100, "rx": 10}] + query-args: | + [{"epsilon":0.10, "edge":0, "blob":500, "expansion":2.0}] bbann: docker-tag: billion-scale-benchmark-bbann module: benchmark.algorithms.bbann @@ -429,6 +453,49 @@ deep-1B: {"Ls":70, "BW":4, "T":16}, {"Ls":80, "BW":4, "T":16}, {"Ls":100, "BW":4, "T":16}] + ngt-t1: + # ngt-t1: deep-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 1000000, "s": 100, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1128/Deep1BDataset-1000000000.tar"}] + query-args: | + [{"epsilon":0.10, "edge":600, "blob":200}, + {"epsilon":0.15, "edge":600, "blob":200}, + 
{"epsilon":0.20, "edge":600, "blob":200}, + {"epsilon":0.24, "edge":600, "blob":200}, + {"epsilon":0.26, "edge":600, "blob":200}, + {"epsilon":0.28, "edge":600, "blob":200}, + {"epsilon":0.30, "edge":600, "blob":200}, + {"epsilon":0.32, "edge":600, "blob":200}, + {"epsilon":0.35, "edge":600, "blob":200}, + {"epsilon":0.40, "edge":600, "blob":200}] + ngt-t2: + # ngt-t2: deep-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t2 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 1000000, "s": 100, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1128/Deep1BDataset-1000000000.tar"}] + query-args: | + [{"epsilon":0.16, "edge":100, "blob":500, "expansion":4.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":5.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":6.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":7.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":8.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":9.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":10.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":11.0}, + {"epsilon":0.16, "edge":100, "blob":500, "expansion":12.0}] msspacev-1B: kst_ann_t1: docker-tag: billion-scale-benchmark-kst_ann_t1 @@ -537,6 +604,51 @@ msspacev-1B: {"Ls":110, "BW":4, "T":16}, {"Ls":120, "BW":4, "T":16}, {"Ls":130, "BW":4, "T":16}] + ngt-t1: + # ngt-t1: msspacev-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 400000, "s": 500, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1115/MSSPACEV1B-1000000000.tar"}] + query-args: | + [{"epsilon":0.10, "edge":0, "blob":500}, + {"epsilon":0.15, "edge":0, "blob":500}, + {"epsilon":0.18, "edge":0, 
"blob":500}, + {"epsilon":0.20, "edge":0, "blob":500}, + {"epsilon":0.22, "edge":0, "blob":500}, + {"epsilon":0.24, "edge":0, "blob":500}, + {"epsilon":0.26, "edge":0, "blob":500}, + {"epsilon":0.30, "edge":0, "blob":500}, + {"epsilon":0.32, "edge":0, "blob":500}, + {"epsilon":0.34, "edge":0, "blob":500}, + {"epsilon":0.36, "edge":0, "blob":500}] + ngt-t2: + # ngt-t2: msspacev-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t2 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 400000, "s": 500, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1115/MSSPACEV1B-1000000000.tar"}] + query-args: | + [{"epsilon":0.18, "edge":0, "blob":500, "expansion":2}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":3}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":4}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":5}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":6}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":7}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":8}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":9}, + {"epsilon":0.18, "edge":0, "blob":500, "expansion":10}] + bbann: docker-tag: billion-scale-benchmark-bbann module: benchmark.algorithms.bbann @@ -691,6 +803,49 @@ msturing-1B: {"Ls":70, "BW":4, "T":16}, {"Ls":80, "BW":4, "T":16}, {"Ls":100, "BW":4, "T":16}] + ngt-t1: + # ngt-t1: msturing-1B: + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 400000, "s": 200, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1112/MSTuringANNS-1000000000.tar"}] + query-args: | + [{"epsilon":0.10, "edge":0, "blob":500}, + {"epsilon":0.12, "edge":0, "blob":500}, + {"epsilon":0.14, "edge":0, "blob":500}, + {"epsilon":0.16, 
"edge":0, "blob":500}, + {"epsilon":0.18, "edge":0, "blob":500}, + {"epsilon":0.20, "edge":0, "blob":500}, + {"epsilon":0.22, "edge":0, "blob":500}, + {"epsilon":0.24, "edge":0, "blob":500}, + {"epsilon":0.26, "edge":0, "blob":500}, + {"epsilon":0.30, "edge":0, "blob":500}] + ngt-t2: + # ngt-t2: msturing-1B: + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t2 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 400000, "s": 200, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1112/MSTuringANNS-1000000000.tar"}] + query-args: | + [{"epsilon":0.20, "edge":0, "blob":500, "expansion":1.0}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":1.2}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":1.4}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":1.6}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":1.8}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":2.0}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":2.2}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":2.4}, + {"epsilon":0.20, "edge":0, "blob":500, "expansion":2.6}] bigann-1B: kst_ann_t1: docker-tag: billion-scale-benchmark-kst_ann_t1 @@ -847,6 +1002,56 @@ bigann-1B: {"Ls":70, "BW":4, "T":16}, {"Ls":80, "BW":4, "T":16}, {"Ls":100, "BW":4, "T":16}] + ngt-t1: + # ngt-t1: bigann-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t1 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 500000, "s": 500, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1119/BigANNDataset-1000000000.tar"}] + query-args: | + [{"epsilon":0.10, "edge":600, "blob":200}, + {"epsilon":0.20, "edge":600, "blob":200}, + {"epsilon":0.25, "edge":600, "blob":200}, + {"epsilon":0.27, "edge":600, "blob":200}, + {"epsilon":0.29, "edge":600, 
"blob":200}, + {"epsilon":0.30, "edge":600, "blob":200}, + {"epsilon":0.31, "edge":600, "blob":200}, + {"epsilon":0.32, "edge":600, "blob":200}, + {"epsilon":0.33, "edge":600, "blob":200}, + {"epsilon":0.34, "edge":600, "blob":200}, + {"epsilon":0.35, "edge":600, "blob":200}, + {"epsilon":0.40, "edge":600, "blob":200}, + {"epsilon":0.45, "edge":600, "blob":200}, + {"epsilon":0.50, "edge":600, "blob":200}] + ngt-t2: + # ngt-t2: bigann-1B + docker-tag: billion-scale-benchmark-ngt + module: benchmark.algorithms.ngt_t2 + constructor: NGT + base-args: ["@metric"] + run-groups: + base: + args: | + [{"q": 500000, "s": 500, "b": 1000000, "rs": 1000000, "ri": 600, "rx": 10, + "index": "https://ngtf-rlab.east.edge.storage-yahoo.jp/neurips21/indices/1119/BigANNDataset-1000000000.tar"}] + query-args: | + [{"epsilon":0.18, "edge":600, "blob":1000, "expansion":2}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":3}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":4}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":5}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":6}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":7}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":8}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":9}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":10}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":11}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":12}, + {"epsilon":0.18, "edge":600, "blob":1000, "expansion":13}] ssnpp-1B: kota-t2: docker-tag: billion-scale-benchmark-kota diff --git a/benchmark/algorithms/ngt_t1.py b/benchmark/algorithms/ngt_t1.py new file mode 100644 index 00000000..16a55c5d --- /dev/null +++ b/benchmark/algorithms/ngt_t1.py @@ -0,0 +1,122 @@ +from __future__ import absolute_import +import numpy as np +import os +import time +import subprocess +from benchmark.algorithms.base import BaseANN +from benchmark.datasets import DATASETS, download_accelerated + 
import ngtpy


class NGT(BaseANN):
    """Track-T1 benchmark wrapper around ``ngtpy.QuantizedBlobIndex``.

    Index-time parameters are read from ``params`` (keys ``q``, ``s``, ``b``,
    ``rs``, ``ri``, ``rx`` and the optional ``index`` download URL).
    Query-time parameters are supplied later through
    :meth:`set_query_arguments`.
    """

    def __init__(self, metric, params):
        """Store the build parameters; no index is opened or built here.

        Args:
            metric: Distance metric name handed over by the framework
                (stored, not otherwise used by this class).
            params: Dict of build parameters; missing keys fall back to the
                defaults below.
        """
        self._params = params
        self._metric = metric
        self._quantization = params.get("q", 1024)
        self._quantization_sample = params.get("s", 100)
        self._blob = params.get("b", 10000)
        self._num_of_r_samples = params.get("rs", 1000)
        self._num_of_r_iterations = params.get("ri", 10)
        self._r_step = params.get("rx", 2)
        self._ngt_root = "ngt"
        self._ngt_index_root = self._ngt_root + "/indexes/"
        self._is_open = False
        self._index = None
        self._results = None
        # Query-time state starts at the same defaults set_query_arguments()
        # uses, so __str__ is safe to call before any query arguments are set.
        self._epsilon = 0.1
        self._edge_size = 0
        self._exploration_size = 120
        self._exact_result_expansion = 0.0

    def setIndexPath(self, dataset):
        """Compute (and create) the directory that holds this configuration's index.

        Sets ``self._path`` (per-parameter directory) and ``self._index_path``
        (that directory joined with the dataset's short name).
        """
        # NOTE(review): "rx" (self._r_step) is not part of the directory name,
        # so two configurations differing only in rx share one index path.
        self._path = f"data/indices/trackT1/algo.NGT:q{self._quantization}-s{self._quantization_sample}-b{self._blob}-rs{self._num_of_r_samples}-ri{self._num_of_r_iterations}"
        os.makedirs(self._path, exist_ok=True)
        self._index_path = os.path.join(self._path, DATASETS[dataset]().short_name())

    @staticmethod
    def _quantizer_dimensions(dimension):
        """Return ``(pseudo_dimension, subvector_dimension)`` for a vector size.

        Raises:
            ValueError: if ``dimension`` exceeds 256, which this wrapper has
                no build configuration for.
        """
        if dimension <= 128:
            return 128, 2
        if dimension <= 256:
            return 256, 4
        raise ValueError(
            f"NGT: unsupported dataset dimension {dimension}; "
            "only dimensions up to 256 are configured")

    def _open_index(self):
        """Open the index at ``self._index_path`` unless it is already open."""
        if self._is_open:
            return
        print("NGT: opening the index...")
        self._index = ngtpy.QuantizedBlobIndex(self._index_path)
        self._is_open = True

    def fit(self, dataset):
        """Build the index for ``dataset`` by running ``build.sh``, then open it.

        Raises:
            ValueError: if the dataset dimension is larger than 256.
        """
        ds = DATASETS[dataset]()
        # Validate the dimension first so an unsupported dataset fails with a
        # clear message instead of an unbound-variable error further down.
        pseudo_dimension, subvector_dimension = self._quantizer_dimensions(ds.d)
        self.setIndexPath(dataset)
        build_script = self._ngt_root + '/build.sh'
        print("NGT dataset:", dataset)
        print("NGT dataset str:", str(ds))
        print("NGT distance:", ds.distance())
        print("NGT dimension:", ds.d)
        print("NGT type:", ds.dtype)
        print("NGT nb:", ds.nb)
        print("NGT dataset file name:", ds.get_dataset_fn())
        print("NGT quantization (q):", self._quantization)
        print("NGT quantization sample (s):", self._quantization_sample)
        print("NGT blob (b):", self._blob)
        print("NGT # of r samples (rs):", self._num_of_r_samples)
        print("NGT # of r iterations (ri):", self._num_of_r_iterations)
        print("NGT ngt root:", self._ngt_root)
        print("NGT index path:", self._index_path)
        print("NGT build.sh:", build_script)
        args = ['/bin/bash', build_script,
                '--root=' + self._ngt_root,
                '--object=' + ds.get_dataset_fn(),
                '--benchmark=' + self._index_path,
                '-d=' + str(pseudo_dimension),
                '-D=' + str(subvector_dimension),
                '-f', '-m=KMEANS_QUANTIZATION', '-n=' + str(ds.nb),
                '-q=' + str(self._quantization),
                '-E=' + str(self._quantization_sample),
                '-b=' + str(self._blob),
                '-r=' + str(self._num_of_r_samples),
                '-R=' + str(self._num_of_r_iterations),
                '-X=' + str(self._r_step)]
        print(args)
        exit_status = subprocess.call(args)
        if exit_status != 0:
            # The exit conventions of build.sh are not visible from here, so
            # surface the status rather than abort; a genuinely failed build
            # will fail when the index is opened just below.
            print(f"NGT: WARNING: build.sh exited with status {exit_status}")

        self._open_index()

    def load_index(self, dataset):
        """Open a previously built index, downloading it first if configured.

        Returns:
            True once the index is open; False when no index exists on disk
            and ``params`` carries no ``index`` URL to fetch one from.

        Raises:
            subprocess.CalledProcessError: if extracting the archive fails.
        """
        self.setIndexPath(dataset)

        if not os.path.exists(self._index_path + "/grp"):
            if "index" not in self._params:
                return False
            archive = self._index_path + ".tar"
            if not os.path.exists(archive):
                print(f"NGT: dowinloading the index... index={self._params['index']}->{self._index_path}")
                download_accelerated(self._params["index"], archive, quiet=True)
            args = ['tar', 'xf', archive, "-C", self._path]
            print(args)
            try:
                subprocess.check_call(args)
            finally:
                # As before, the archive is discarded whether or not the
                # extraction worked, so a broken download is never reused.
                if os.path.exists(archive):
                    print(f"NGT: removing {archive}")
                    os.remove(archive)

        self._open_index()
        # The original fell off the end here and returned None, which a caller
        # testing truthiness cannot tell apart from the "not available" False
        # above (presumably the runner rebuilds in that case - confirm against
        # the BaseANN contract).
        return True

    def set_query_arguments(self, query_args):
        """Apply the search-time parameters from ``query_args`` to the open index.

        Recognised keys: ``epsilon``, ``edge`` and ``blob``. An ``expansion``
        key is ignored: exact-result expansion is fixed at 0.0 for track T1,
        which is the only difference from the T2 wrapper.
        """
        self._epsilon = query_args.get("epsilon", 0.1)
        self._edge_size = query_args.get("edge", 0)
        self._exploration_size = query_args.get("blob", 120)
        self._exact_result_expansion = 0.0
        self._index.set(epsilon=self._epsilon, blob_epsilon=0.0, edge_size=self._edge_size,
                        exploration_size=self._exploration_size,
                        exact_result_expansion=self._exact_result_expansion)
        print(f"NGT: epsilon={self._epsilon} edge={self._edge_size} blob={self._exploration_size}")

    def query(self, X, n):
        """Run a batch search for ``n`` results per row of ``X``; results are kept on ``self``."""
        self._results = ngtpy.BatchResults()
        self._index.batchSearch(X, self._results, n)

    def range_query(self, X, radius):
        """Range search is not implemented; this only logs the call."""
        print("NGT: range_query")

    def get_results(self):
        """Return the IDs produced by the most recent :meth:`query` call."""
        return self._results.getIDs()

    def get_range_results(self):
        """Return ``(index, ids, distances)`` read from the last batch results."""
        return self._results.getIndex(), self._results.getIndexedIDs(), self._results.getIndexedDistances()

    def __str__(self):
        # The "-b" tag appears twice (build blob, then query blob); the text is
        # kept unchanged because it may be used to label stored results.
        return f"NGT-T1:q{self._quantization}-b{self._blob}-rs{self._num_of_r_samples}-ri{self._num_of_r_iterations}-e{self._epsilon:.3f}-b{self._exploration_size}"
class NGT(BaseANN):
    """Track-T2 benchmark wrapper around ``ngtpy.QuantizedBlobIndex``.

    Index-time parameters are read from ``params`` (keys ``q``, ``s``, ``b``,
    ``rs``, ``ri``, ``rx`` and the optional ``index`` download URL).
    Query-time parameters, including the exact-result ``expansion``, are
    supplied later through :meth:`set_query_arguments`.
    """

    def __init__(self, metric, params):
        """Store the build parameters; no index is opened or built here.

        Args:
            metric: Distance metric name handed over by the framework
                (stored, not otherwise used by this class).
            params: Dict of build parameters; missing keys fall back to the
                defaults below.
        """
        self._params = params
        self._metric = metric
        self._quantization = params.get("q", 1024)
        self._quantization_sample = params.get("s", 100)
        self._blob = params.get("b", 10000)
        self._num_of_r_samples = params.get("rs", 1000)
        self._num_of_r_iterations = params.get("ri", 10)
        self._r_step = params.get("rx", 2)
        self._ngt_root = "ngt"
        self._ngt_index_root = self._ngt_root + "/indexes/"
        self._is_open = False
        self._index = None
        self._results = None
        # Query-time state starts at the same defaults set_query_arguments()
        # uses, so __str__ is safe to call before any query arguments are set.
        self._epsilon = 0.1
        self._edge_size = 0
        self._exploration_size = 120
        self._exact_result_expansion = 2.0

    def setIndexPath(self, dataset):
        """Compute (and create) the directory that holds this configuration's index.

        Sets ``self._path`` (per-parameter directory) and ``self._index_path``
        (that directory joined with the dataset's short name).
        """
        # NOTE(review): "rx" (self._r_step) is not part of the directory name,
        # so two configurations differing only in rx share one index path.
        self._path = f"data/indices/trackT2/algo.NGT:q{self._quantization}-s{self._quantization_sample}-b{self._blob}-rs{self._num_of_r_samples}-ri{self._num_of_r_iterations}"
        os.makedirs(self._path, exist_ok=True)
        self._index_path = os.path.join(self._path, DATASETS[dataset]().short_name())

    @staticmethod
    def _quantizer_dimensions(dimension):
        """Return ``(pseudo_dimension, subvector_dimension)`` for a vector size.

        Raises:
            ValueError: if ``dimension`` exceeds 256, which this wrapper has
                no build configuration for.
        """
        if dimension <= 128:
            return 128, 2
        if dimension <= 256:
            return 256, 4
        raise ValueError(
            f"NGT: unsupported dataset dimension {dimension}; "
            "only dimensions up to 256 are configured")

    def _open_index(self):
        """Open the index at ``self._index_path`` unless it is already open."""
        if self._is_open:
            return
        print("NGT: opening the index...")
        self._index = ngtpy.QuantizedBlobIndex(self._index_path)
        self._is_open = True

    def fit(self, dataset):
        """Build the index for ``dataset`` by running ``build.sh``, then open it.

        Raises:
            ValueError: if the dataset dimension is larger than 256.
        """
        ds = DATASETS[dataset]()
        # Validate the dimension first so an unsupported dataset fails with a
        # clear message instead of an unbound-variable error further down.
        pseudo_dimension, subvector_dimension = self._quantizer_dimensions(ds.d)
        self.setIndexPath(dataset)
        build_script = self._ngt_root + '/build.sh'
        print("NGT dataset:", dataset)
        print("NGT dataset str:", str(ds))
        print("NGT distance:", ds.distance())
        print("NGT dimension:", ds.d)
        print("NGT type:", ds.dtype)
        print("NGT nb:", ds.nb)
        print("NGT dataset file name:", ds.get_dataset_fn())
        print("NGT quantization (q):", self._quantization)
        print("NGT quantization sample (s):", self._quantization_sample)
        print("NGT blob (b):", self._blob)
        print("NGT # of r samples (rs):", self._num_of_r_samples)
        print("NGT # of r iterations (ri):", self._num_of_r_iterations)
        print("NGT ngt root:", self._ngt_root)
        print("NGT index path:", self._index_path)
        print("NGT build.sh:", build_script)
        args = ['/bin/bash', build_script,
                '--root=' + self._ngt_root,
                '--object=' + ds.get_dataset_fn(),
                '--benchmark=' + self._index_path,
                '-d=' + str(pseudo_dimension),
                '-D=' + str(subvector_dimension),
                '-f', '-m=KMEANS_QUANTIZATION', '-n=' + str(ds.nb),
                '-q=' + str(self._quantization),
                '-E=' + str(self._quantization_sample),
                '-b=' + str(self._blob),
                '-r=' + str(self._num_of_r_samples),
                '-R=' + str(self._num_of_r_iterations),
                '-X=' + str(self._r_step)]
        print(args)
        exit_status = subprocess.call(args)
        if exit_status != 0:
            # The exit conventions of build.sh are not visible from here, so
            # surface the status rather than abort; a genuinely failed build
            # will fail when the index is opened just below.
            print(f"NGT: WARNING: build.sh exited with status {exit_status}")

        self._open_index()

    def load_index(self, dataset):
        """Open a previously built index, downloading it first if configured.

        Returns:
            True once the index is open; False when no index exists on disk
            and ``params`` carries no ``index`` URL to fetch one from.

        Raises:
            subprocess.CalledProcessError: if extracting the archive fails.
        """
        self.setIndexPath(dataset)

        if not os.path.exists(self._index_path + "/grp"):
            if "index" not in self._params:
                return False
            archive = self._index_path + ".tar"
            if not os.path.exists(archive):
                print(f"NGT: dowinloading the index... index={self._params['index']}->{self._index_path}")
                download_accelerated(self._params["index"], archive, quiet=True)
            args = ['tar', 'xf', archive, "-C", self._path]
            print(args)
            try:
                subprocess.check_call(args)
            finally:
                # As before, the archive is discarded whether or not the
                # extraction worked, so a broken download is never reused.
                if os.path.exists(archive):
                    print(f"NGT: removing {archive}")
                    os.remove(archive)

        self._open_index()
        # The original fell off the end here and returned None, which a caller
        # testing truthiness cannot tell apart from the "not available" False
        # above (presumably the runner rebuilds in that case - confirm against
        # the BaseANN contract).
        return True

    def set_query_arguments(self, query_args):
        """Apply the search-time parameters from ``query_args`` to the open index.

        Recognised keys: ``epsilon``, ``edge``, ``blob`` and ``expansion``.
        The configurable exact-result ``expansion`` (default 2.0) is the only
        difference from the T1 wrapper, which pins it to 0.0.
        """
        self._epsilon = query_args.get("epsilon", 0.1)
        self._edge_size = query_args.get("edge", 0)
        self._exploration_size = query_args.get("blob", 120)
        self._exact_result_expansion = query_args.get("expansion", 2.0)
        self._index.set(epsilon=self._epsilon, blob_epsilon=0.0, edge_size=self._edge_size,
                        exploration_size=self._exploration_size,
                        exact_result_expansion=self._exact_result_expansion)
        print(f"NGT: epsilon={self._epsilon} edge={self._edge_size} blob={self._exploration_size}")

    def query(self, X, n):
        """Run a batch search for ``n`` results per row of ``X``; results are kept on ``self``."""
        self._results = ngtpy.BatchResults()
        self._index.batchSearch(X, self._results, n)

    def range_query(self, X, radius):
        """Range search is not implemented; this only logs the call."""
        print("NGT: range_query")

    def get_results(self):
        """Return the IDs produced by the most recent :meth:`query` call."""
        return self._results.getIDs()

    def get_range_results(self):
        """Return ``(index, ids, distances)`` read from the last batch results."""
        return self._results.getIndex(), self._results.getIndexedIDs(), self._results.getIndexedDistances()

    def __str__(self):
        # The "-b" tag appears twice (build blob, then query blob); the text is
        # kept unchanged because it may be used to label stored results.
        return f"NGT-T2:q{self._quantization}-b{self._blob}-rs{self._num_of_r_samples}-ri{self._num_of_r_iterations}-e{self._epsilon:.3f}-b{self._exploration_size}-x{self._exact_result_expansion}"
# Image for the NGT benchmark entries: builds NGT (neurips21 branch) from
# source and installs its Python bindings on top of the shared base image.
FROM billion-scale-benchmark

# Refresh the package index and install in a single layer, so a cached
# `apt-get update` layer is never combined with a later install list.
RUN apt-get update && apt-get install -y git cmake liblapack-dev bc
RUN pip3 install wheel pybind11
RUN git clone -b neurips21 https://github.com/yahoojapan/NGT.git
RUN cd NGT && mkdir -p build && cd build && cmake ..
RUN cd NGT/build && make -j 8 && make install
# Stage build.sh under ./ngt; `&&` (not `;`) makes a failed mkdir fail the
# image build instead of being masked by the exit status of cp.
RUN mkdir -p ngt && cp NGT/bin/build.sh ngt/
RUN ldconfig
RUN cd NGT/python && python3 setup.py bdist_wheel
RUN pip3 install NGT/python/dist/ngt-*-linux_x86_64.whl