Harshasi/final runbook (#177)
* new classes, methods and config

* add runbook

* edit runbook_gen

* add final runbook

* remove data file in GT after calculation is done

* h5py libver latest

* set h5py dataset directly with 2D np array

* set k=10 and run_count to 1

* add final runbook to data_export

* reduce streaming runtime to 1hr

* try different configs

* add 8GB memory limit for streaming

* add README instructions

* make download_gt work

* update streaming runtime
harsha-simhadri authored Oct 18, 2023
1 parent f5bb90b commit 8674ce9
Showing 11 changed files with 4,099 additions and 22 deletions.
30 changes: 30 additions & 0 deletions benchmark/datasets.py
@@ -193,6 +193,13 @@ def get_dataset_iterator(self, bs=512, split=(1,0)):
             j1 = min(j0 + bs, i1)
             yield sanitize(x[j0:j1])
 
+    def get_data_in_range(self, start, end):
+        assert start >= 0
+        assert end <= self.nb
+        filename = self.get_dataset_fn()
+        x = xbin_mmap(filename, dtype=self.dtype, maxn=self.nb)
+        return x[start:end]
+
     def search_type(self):
         return "knn"
 
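The new `get_data_in_range` helper serves an arbitrary row range straight from the memory-mapped base file, so callers need not materialize the full base set. A minimal usage sketch (the dataset name comes from the registry entry added below; the shape comment assumes the 30M Turing slice):

```python
from benchmark.datasets import DATASETS

# Fetch rows [1_000_000, 2_000_000) of the base set. xbin_mmap memory-maps
# the .fbin file, so only the pages actually touched are read from disk.
ds = DATASETS['msturing-30M-clustered']()
chunk = ds.get_data_in_range(1_000_000, 2_000_000)
print(chunk.shape)  # (1000000, 100) for this dataset
```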
@@ -434,6 +441,28 @@ def distance(self):
 
     def prepare(self, skip_data=False, original_size=10 ** 9):
         return super().prepare(skip_data, original_size = self.nb)
 
+class MSTuringClustered30M(DatasetCompetitionFormat):
+    def __init__(self):
+        self.nb = 29998994
+        self.d = 100
+        self.nq = 10000
+        self.dtype = "float32"
+        self.ds_fn = "30M-clustered64.fbin"
+        self.qs_fn = "testQuery10K.fbin"
+        self.gt_fn = "clu_msturing30M_gt100"
+
+        self.base_url = "https://comp21storage.blob.core.windows.net/publiccontainer/comp23/clustered_data/msturing-30M-clustered/"
+        self.basedir = os.path.join(BASEDIR, "MSTuring-30M-clustered")
+
+        self.private_gt_url = None
+        self.private_qs_url = None
+
+    def distance(self):
+        return "euclidean"
+
+    def prepare(self, skip_data=False, original_size=10 ** 9):
+        return super().prepare(skip_data, original_size = self.nb)
+
 class MSSPACEV1B(DatasetCompetitionFormat):
     def __init__(self, nb_M=1000):
@@ -984,6 +1013,7 @@ def __str__(self):
     'msturing-1M': lambda : MSTuringANNS(1),
 
     'msturing-10M-clustered': lambda: MSTuringClustered10M(),
+    'msturing-30M-clustered': lambda: MSTuringClustered30M(),
 
     'msspacev-1B': lambda : MSSPACEV1B(1000),
     'msspacev-100M': lambda : MSSPACEV1B(100),
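Once registered in `DATASETS`, the new 30M slice behaves like any other competition dataset. A sketch of the expected flow, assuming the usual `DatasetCompetitionFormat` download machinery:

```python
from benchmark.datasets import DATASETS

ds = DATASETS['msturing-30M-clustered']()
ds.prepare()               # downloads base, query, and ground-truth files into ds.basedir
print(ds.nb, ds.d, ds.nq)  # 29998994 100 10000
print(ds.distance())       # euclidean
```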
9 changes: 3 additions & 6 deletions benchmark/results.py
@@ -6,7 +6,6 @@
 import re
 import traceback
 
-
 def get_result_filename(dataset=None, count=None, definition=None,
                         query_arguments=None, neurips23track=None, runbook_path=None):
     d = ['results']
@@ -41,9 +40,7 @@ def get_result_filename(dataset=None, count=None, definition=None,
 
 def add_results_to_h5py(f, search_type, results, count, suffix = ''):
     if search_type == "knn" or search_type == "knn_filtered":
-        neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i')
-        for i, idxs in enumerate(results):
-            neighbors[i] = idxs
+        neighbors = f.create_dataset('neighbors' + suffix, (len(results), count), 'i', data = results)
     elif search_type == "range":
         lims, D, I = results
         f.create_dataset('neighbors' + suffix, data=I)
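Per the commit message ("set h5py dataset directly with 2D np array"), the per-row copy loop is replaced by a single `create_dataset` call that ingests the whole 2D array. A self-contained sketch contrasting the two (toy shapes and hypothetical file names, not the harness's real data):

```python
import h5py
import numpy as np

results = np.random.randint(0, 10**6, size=(10_000, 10))  # toy (n_queries, k) ids

# Old approach: allocate the dataset, then copy one row per iteration.
with h5py.File('old.hdf5', 'w') as f:
    neighbors = f.create_dataset('neighbors', (len(results), 10), 'i')
    for i, idxs in enumerate(results):
        neighbors[i] = idxs

# New approach: hand h5py the whole array in one call. libver='latest'
# opts into the newest HDF5 file-format features, as store_results now does.
with h5py.File('new.hdf5', 'w', libver='latest') as f:
    f.create_dataset('neighbors', data=results)
```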
@@ -59,7 +56,7 @@ def store_results(dataset, count, definition, query_arguments,
     head, tail = os.path.split(fn)
     if not os.path.isdir(head):
         os.makedirs(head)
-    f = h5py.File(fn, 'w')
+    f = h5py.File(name=fn, mode='w', libver='latest')
     for k, v in attrs.items():
         f.attrs[k] = v
 
@@ -83,7 +80,7 @@ def load_all_results(dataset=None, count=None, neurips23track=None, runbook_path=None):
         if os.path.splitext(fn)[-1] != '.hdf5':
             continue
         try:
-            f = h5py.File(os.path.join(root, fn), 'r+')
+            f = h5py.File(name=os.path.join(root, fn), mode='r+', libver='latest')
             properties = dict(f.attrs)
             yield properties, f
             f.close()
6 changes: 4 additions & 2 deletions benchmark/runner.py
@@ -100,7 +100,7 @@ def run(definition, dataset, count, run_count, rebuild,
         algo.set_query_arguments(*query_arguments)
         if neurips23track == 'streaming':
             descriptor, results = custom_runner.run_task(
-                algo, ds, distance, 1, run_count, search_type, private_query, runbook)
+                algo, ds, distance, count, 1, search_type, private_query, runbook)
         else:
             descriptor, results = custom_runner.run_task(
                 algo, ds, distance, count, run_count, search_type, private_query)
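The streaming branch previously passed `1` where `count` (k, the number of neighbors) belongs and forwarded `run_count` into the single-run slot; consistent with the commit message ("set k=10 and run_count to 1"), it now searches with the requested `count` and exactly one run. The signature this implies is an inference from the two call sites above, not code shown in this diff:

```python
# Presumed shape of the streaming runner's entry point (assumption):
def run_task(algo, ds, distance, count, run_count, search_type,
             private_query, runbook=None):
    ...
```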
@@ -116,9 +116,11 @@ def run(definition, dataset, count, run_count, rebuild,
             X = ds.get_private_queries()
             power_stats = power_capture.run(algo, X, distance, count,
                                             run_count, search_type, descriptor)
+        print('start store results')
         store_results(dataset, count, definition,
                       query_arguments, descriptor,
                       results, search_type, neurips23track, runbook_path)
+        print('end store results')
     finally:
         algo.done()
 
@@ -263,7 +265,7 @@ def run_docker(definition, dataset, count, runs, timeout, rebuild,
 
     client = docker.from_env()
     if mem_limit is None:
-        mem_limit = psutil.virtual_memory().available
+        mem_limit = psutil.virtual_memory().available if neurips23track != 'streaming' else (8*1024*1024*1024)
 
     # ready the container object invoked later in this function
     container = None
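With this change the streaming track caps the container at 8 GB instead of granting all free host memory, matching the DRAM limit announced in the README below. A sketch of the effect using docker-py; the image, command, and `track` variable are placeholders, and the real `run_docker` passes many more arguments:

```python
import docker
import psutil

GB = 1024 * 1024 * 1024
track = 'streaming'  # placeholder for the neurips23track argument

# Streaming runs get a hard 8 GB cgroup cap; other tracks keep the old
# behaviour of using whatever memory is currently available on the host.
mem_limit = 8 * GB if track == 'streaming' else psutil.virtual_memory().available

client = docker.from_env()
container = client.containers.run('ubuntu:22.04', 'true',
                                  mem_limit=mem_limit, detach=True)
```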
5 changes: 4 additions & 1 deletion benchmark/streaming/compute_gt.py
@@ -26,7 +26,7 @@ def gt_dir(ds, runbook_path):
     return os.path.join(ds.basedir, str(ds.nb), runbook_filename)
 
 def output_gt(ds, ids, step, gt_cmdline, runbook_path):
-    data = ds.get_dataset()
+    data = ds.get_data_in_range(0, ds.nb)
     data_slice = data[ids]
 
     dir = gt_dir(ds, runbook_path)
@@ -52,6 +52,9 @@ def output_gt(ds, ids, step, gt_cmdline, runbook_path):
     gt_cmdline += ' --tags_file ' + tags_file
     print("Executing cmdline: ", gt_cmdline)
     os.system(gt_cmdline)
+    print("Removing data file")
+    rm_cmdline = "rm " + data_file
+    os.system(rm_cmdline)
 
 
 def main():
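The new cleanup shells out to `rm` so the data file exported for ground-truth computation does not pile up on disk once the DiskANN tool has consumed it. A shell-free equivalent for reference (a sketch with a placeholder path, not what the commit itself does):

```python
import os

data_file = 'data/step5.fbin'  # placeholder for the file written by output_gt
if os.path.exists(data_file):
    os.remove(data_file)  # same effect as os.system("rm " + data_file), no subshell
```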
5 changes: 4 additions & 1 deletion data_export.py
@@ -95,7 +95,10 @@ def cleaned_run_metric(run_metrics):
         print(f"Looking at track:{track}, dataset:{dataset_name}")
         dataset = DATASETS[dataset_name]()
         if track == 'streaming':
-            for runbook_path in ['neurips23/streaming/simple_runbook.yaml', 'neurips23/streaming/clustered_runbook.yaml', 'neurips23/streaming/delete_runbook.yaml']:
+            for runbook_path in ['neurips23/streaming/simple_runbook.yaml',
+                                 'neurips23/streaming/clustered_runbook.yaml',
+                                 'neurips23/streaming/delete_runbook.yaml',
+                                 'neurips23/streaming/final_runbook.yaml']:
                 results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
                 run_metrics = compute_metrics_all_runs(dataset, dataset_name, results, args.recompute, \
                                                        args.sensors, args.search_times, args.private_query, \
11 changes: 8 additions & 3 deletions neurips23/README.md
@@ -23,7 +23,7 @@ The Practical Vector Search challenge at NeurIPS 2023 has four different tasks:
 The tags are from a vocabulary of 200386 possible tags.
 The 100,000 queries consist of one image embedding and one or two tags that must appear in the database elements to be considered.
 
-**Task Streaming:** This task uses 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the "runbook" provided - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour. In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset. Entries will be ranked by average recall over queries at all check points. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements.
+**Task Streaming:** This task uses a 10M slice of the MS Turing data set released in the previous challenge. The index starts with zero points and must implement the provided "runbook" - a sequence of insertion operations, deletion operations, and search commands (roughly 4:4:1 ratio) - within a time bound of 1 hour and a DRAM limit of 8GB. Entries will be ranked by average recall over queries at all checkpoints. The intention is for the algorithm to process the operations and maintain a compact index over the active points rather than index the entire anticipated set of points and use tombstones or flags to mark active elements. ~~In the final run, we will use a different runbook, and possibly a different data set, to avoid participants over-fitting to this dataset.~~ The final run will use `msturing-30M-clustered`, a 30M slice of the MSTuring dataset, and the `final_runbook.yaml` runbook.
 
 **Task Out-Of-Distribution:** Yandex Text-to-Image 10M represents a cross-modal dataset where the database and query vectors have different distributions in the shared vector space.
 The base set is a 10M subset of the Yandex visual search database of 200-dimensional image embeddings which are produced with the Se-ResNext-101 model.
@@ -46,6 +46,7 @@ The baselines were run on an Azure Standard D8lds v5 (8 vcpus, 16 GiB memory) machine
 |Sparse | Linear Scan | 101 | `python3 run.py --dataset sparse-full --algorithm linscan --neurips23track sparse` |
 |Filter | faiss | 3200 | `python3 run.py --dataset yfcc-10M --algorithm faiss --neurips23track filter` |
 |Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/delete_runbook.yaml` |
+|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/final_runbook.yaml` |
 |OOD | DiskANN | 4882 | `python3 run.py --dataset text2image-10M --algorithm diskann --neurips23track ood` |
 
 
@@ -110,13 +111,17 @@ For the competition dataset, run commands mentioned in the table above, for example:
 python run.py --neurips23track filter --algorithm faiss --dataset yfcc-10M
 python run.py --neurips23track sparse --algorithm linscan --dataset sparse-full
 python run.py --neurips23track ood --algorithm diskann --dataset text2image-10M
+# preliminary runbook for testing
 python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/streaming/delete_runbook.yaml
+# final runbook for evaluation
+python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/streaming/final_runbook.yaml
 ```
 
 For the streaming track, the runbook specifies the order of operations to be executed by the algorithms. To download the ground truth for every search operation (requires the azcopy tool in your binary path):
 ```
-python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
-python benchmark/streaming/download_gt.py --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
+python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/final_runbook.yaml --dataset msturing-30M-clustered
 ```
 Alternately, to compute ground truth for an arbitrary runbook, [clone and build the DiskANN repo](https://github.com/Microsoft/DiskANN) and use the command line tool to compute ground truth at various search checkpoints. The `--gt_cmdline_tool` argument points to the directory with the DiskANN command line tools.
 ```
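For orientation, a streaming runbook is a YAML file of numbered steps, each an insert, delete, or search command over row ranges of the base set. A sketch of walking one in Python; the step schema (`operation`, `start`, `end`) is assumed from `simple_runbook.yaml` and should be treated as illustrative:

```python
import yaml

with open('neurips23/streaming/final_runbook.yaml') as f:
    runbook = yaml.safe_load(f)['msturing-30M-clustered']

# Steps are integer keys; metadata entries such as max_pts are not steps.
for step in sorted(k for k in runbook if isinstance(k, int)):
    op = runbook[step]
    if op['operation'] in ('insert', 'delete'):
        print(step, op['operation'], 'rows', op['start'], 'to', op['end'])
    else:  # search steps are the checkpoints where recall is measured
        print(step, 'search checkpoint')
```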
16 changes: 15 additions & 1 deletion neurips23/streaming/diskann/config.yaml
@@ -83,4 +83,18 @@ msturing-10M-clustered:
         args: |
           [{"R":64, "L":50, "insert_threads":16, "consolidate_threads":16}]
         query-args: |
-          [{"Ls":100, "T":16}]
\ No newline at end of file
+          [{"Ls":100, "T":16}]
+msturing-30M-clustered:
+  diskann:
+    docker-tag: neurips23-streaming-diskann
+    module: neurips23.streaming.diskann.diskann-str
+    constructor: diskann
+    base-args: ["@metric"]
+    run-groups:
+      base:
+        args: |
+          [{"R":32, "L":50, "insert_threads":16, "consolidate_threads":16},
+           {"R":32, "L":70, "insert_threads":16, "consolidate_threads":16},
+           {"R":50, "L":50, "insert_threads":16, "consolidate_threads":16}]
+        query-args: |
+          [{"Ls":70, "T":16}]