Re-organize runbooks and add wiki replace runbook (#312)
* re-organized runbooks, added generic function for generation, added wiki replace runbook

* amended README

* updated data_export.py and neurips23 README with new paths

* fixed bug in readme instructions, added new 1M wiki runbook

* commit to switch branches

* fixed bug in replace tag boundary calculation

* added gt url option to generator

* changed driver to allow replace-only, made replace-only and replace-delete versions of wiki-35M and wiki-1M runbooks

* commit to switch

* moved runbooks folder, updated README correspondingly

* moved runbooks folder
magdalendobson authored Oct 18, 2024
1 parent 0ce0eaa commit 3c35f54
Showing 26 changed files with 8,625 additions and 159 deletions.
1 change: 1 addition & 0 deletions benchmark/streaming/load_runbook.py
@@ -30,6 +30,7 @@ def load_runbook(dataset_name, max_pts, runbook_file):
if entry['tags_start'] < 0 or entry ['tags_start'] >= max_pts:
raise Exception('Start of indices to be replaced out of range in runbook')
if entry['tags_end'] < 0 or entry ['tags_end'] > max_pts:
print(entry['tags_end'])
raise Exception('End of indices to be replaced out of range in runbook')
if entry['ids_start'] < 0 or entry ['ids_start'] >= max_pts:
raise Exception('Start of indices to replace out of range in runbook')
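For reference, here is a minimal sketch (illustrative, not taken from the repository) of a `replace` step that satisfies the bounds checks above; the field names follow the schema emitted by `gen_expiration_time_runbook.py` added later in this commit, where the tag range and the id range always have the same length:
```
# Hypothetical 'replace' step: the points currently tagged [tags_start, tags_end)
# are overwritten with the dataset vectors at indices [ids_start, ids_end).
max_pts = 1000
entry = {
    'operation': 'replace',
    'tags_start': 0,
    'tags_end': 250,
    'ids_start': 750,
    'ids_end': 1000,
}
# Equal-length ranges, as produced by the generator in this commit.
assert entry['tags_end'] - entry['tags_start'] == entry['ids_end'] - entry['ids_start']
# Bounds mirrored from the checks above: starts lie in [0, max_pts), ends in [0, max_pts].
assert 0 <= entry['tags_start'] < max_pts and 0 <= entry['tags_end'] <= max_pts
assert 0 <= entry['ids_start'] < max_pts
```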
23 changes: 12 additions & 11 deletions data_export.py
@@ -96,17 +96,18 @@ def cleaned_run_metric(run_metrics):
dataset = DATASETS[dataset_name]()
runbook_paths = [None]
if track == 'streaming':
runbook_paths = ['neurips23/streaming/simple_runbook.yaml',
'neurips23/streaming/simple_replace_runbook.yaml',
'neurips23/streaming/random_replace_runbook.yaml',
'neurips23/streaming/clustered_replace_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/clustered_runbook.yaml',
'neurips23/streaming/delete_runbook.yaml',
'neurips23/streaming/final_runbook.yaml',
'neurips23/streaming/msturing-10M_slidingwindow_runbook.yaml',
'neurips23/streaming/wikipedia-35M_expirationtime_runbook.yaml',
'neurips23/streaming/msmarco-100M_expirationtime_runbook.yaml']
runbook_paths = ['neurips23/runbooks/streaming/simple_runbook.yaml',
'neurips23/runbooks/streaming/simple_replace_runbook.yaml',
'neurips23/runbooks/streaming/random_replace_runbook.yaml',
'neurips23/runbooks/streaming/clustered_replace_runbook.yaml',
'neurips23/runbooks/streaming/clustered_runbook.yaml',
'neurips23/runbooks/streaming/clustered_runbook.yaml',
'neurips23/runbooks/streaming/delete_runbook.yaml',
'neurips23/runbooks/streaming/final_runbook.yaml',
'neurips23/runbooks/streaming/msturing-10M_slidingwindow_runbook.yaml',
'neurips23/runbooks/streaming/wikipedia-35M_expirationtime_runbook.yaml',
'neurips23/runbooks/streaming/wikipedia-35M_expiration_time_replace_runbook.yaml',
'neurips23/runbooks/streaming/msmarco-100M_expirationtime_runbook.yaml']
for runbook_path in runbook_paths:
print("Looking for runbook ", runbook_path)
results = load_all_results(dataset_name, neurips23track=track, runbook_path=runbook_path)
20 changes: 10 additions & 10 deletions neurips23/README.md
@@ -46,8 +46,8 @@ The baselines were run on an Azure Standard D8lds v5 (8 vcpus, 16 GiB memory) machine
|---------|-------------|-----------------------------|---------|
|Sparse | Linear Scan | 101 | `python3 run.py --dataset sparse-full --algorithm linscan --neurips23track sparse` |
|Filter | faiss | 3200 | `python3 run.py --dataset yfcc-10M --algorithm faiss --neurips23track filter` |
|Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/delete_runbook.yaml` |
|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/streaming/final_runbook.yaml` |
|Streaming| DiskANN | 0.924 (recall@10), 23 mins | `python3 run.py --dataset msturing-10M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/runbooks/delete_runbook.yaml` |
|Streaming| DiskANN | 0.883 (recall@10), 45 mins | `python3 run.py --dataset msturing-30M-clustered --algorithm diskann --neurips23track streaming --runbook_path neurips23/runbooks/final_runbook.yaml` |
|OOD | DiskANN | 4882 | `python3 run.py --dataset text2image-10M --algorithm diskann --neurips23track ood` |

## For_Participants
@@ -99,7 +99,7 @@ Test the benchmark and baseline using the algorithm's definition file on small t
python run.py --neurips23track filter --algorithm faiss --dataset random-filter-s
python run.py --neurips23track sparse --algorithm linscan --dataset sparse-small
python run.py --neurips23track ood --algorithm diskann --dataset random-xs
python run.py --neurips23track streaming --algorithm diskann --dataset random-xs --runbook_path neurips23/streaming/simple_runbook.yaml
python run.py --neurips23track streaming --algorithm diskann --dataset random-xs --runbook_path neurips23/runbooks/simple_runbook.yaml
```

For the competition datasets, run the commands mentioned in the table above, for example:
@@ -108,22 +108,22 @@ python run.py --neurips23track filter --algorithm faiss --dataset yfcc-10M
python run.py --neurips23track sparse --algorithm linscan --dataset sparse-full
python run.py --neurips23track ood --algorithm diskann --dataset text2image-10M
# preliminary runbook for testing
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/streaming/delete_runbook.yaml
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-10M-clustered --runbook_path neurips23/runbooks/delete_runbook.yaml
# Final runbook for evaluation
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/streaming/final_runbook.yaml
python run.py --neurips23track streaming --algorithm diskann --dataset msturing-30M-clustered --runbook_path neurips23/runbooks/final_runbook.yaml
```

For the streaming track, the runbook specifies the order of operations to be executed by the algorithm. To download the ground truth for every search operation (requires the azcopy tool in your binary path):
```
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/simple_runbook.yaml --dataset msspacev-10M
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/delete_runbook.yaml --dataset msturing-10M-clustered
python -m benchmark.streaming.download_gt --runbook_file neurips23/streaming/final_runbook.yaml --dataset msturing-30M-clustered
python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/simple_runbook.yaml --dataset msspacev-10M
python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/delete_runbook.yaml --dataset msturing-10M-clustered
python -m benchmark.streaming.download_gt --runbook_file neurips23/runbooks/final_runbook.yaml --dataset msturing-30M-clustered
```
Alternatively, to compute ground truth for an arbitrary runbook, [clone and build the DiskANN repo](https://github.com/Microsoft/DiskANN) and use its command-line tool to compute ground truth at the various search checkpoints. The `--gt_cmdline_tool` argument takes the path to the built DiskANN `compute_groundtruth` tool.
```
python benchmark/streaming/compute_gt.py --dataset msspacev-10M --runbook neurips23/streaming/simple_runbook.yaml --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth
python benchmark/streaming/compute_gt.py --dataset msspacev-10M --runbook neurips23/runbooks/simple_runbook.yaml --gt_cmdline_tool ~/DiskANN/build/apps/utils/compute_groundtruth
```
Consider also the examples in runbooks [here]]neurips23/streaming/clustered_runbook.yaml) and [here](neurips23/streaming/delete_runbook.yaml). The datasets here are [generated](neurips23/streaming/clustered_data_gen.py) by clustering the original dataset with k-means and packing points in the same cluster into contiguous indices. Then insertions are then performed one cluster at a time. This runbook tests if an indexing algorithm can adapt to data draft. The `max_pts` entry for the dataset in the runbook indicates an upper bound on the number of active points that the index must support during the runbook execution.
Consider also the example runbooks [here](neurips23/runbooks/clustered_runbook.yaml) and [here](neurips23/runbooks/delete_runbook.yaml). The datasets here are [generated](neurips23/runbooks/clustered_data_gen.py) by clustering the original dataset with k-means and packing points in the same cluster into contiguous indices. Insertions are then performed one cluster at a time. This runbook tests whether an indexing algorithm can adapt to data drift. The `max_pts` entry for the dataset in the runbook indicates an upper bound on the number of active points that the index must support during runbook execution.
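For orientation, the snippet below is a hypothetical miniature runbook (not one of the shipped files), built the same way as the generators under `neurips23/runbooks/`: numbered timesteps map to insert/search/delete operations, and the dataset block carries the `max_pts` bound described above.
```
import yaml

# Hypothetical runbook for the small 'random-xs' dataset; the real runbooks under
# neurips23/runbooks/ follow the same layout and are consumed via --runbook_path.
runbook = {
    'random-xs': {
        'max_pts': 10000,  # upper bound on simultaneously active points
        1: {'operation': 'insert', 'start': 0,    'end': 5000},
        2: {'operation': 'search'},
        3: {'operation': 'insert', 'start': 5000, 'end': 10000},
        4: {'operation': 'delete', 'start': 0,    'end': 5000},
        5: {'operation': 'search'},
    }
}
with open('my_runbook.yaml', 'w') as f:
    yaml.dump(runbook, f, default_flow_style=False)
```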


To make the results available for post-processing, change permissions of the results folder
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
176 changes: 176 additions & 0 deletions neurips23/runbooks/gen_expiration_time_runbook.py
@@ -0,0 +1,176 @@
import yaml
import os
import random

'''
dataset_name: dataset key as specified in benchmark/datasets.py
dataset_size: size of datasets
max_t: number of timesteps
runbook_filename: name to save the runbook to
ratios: tuple of three numbers indicating proportion of deletes/replaces assigned to each timestep
timesteps: how long to wait before deleting for each ratio
seed: seed given to random generator
do_replace: whether to include replace in runbook or not
'''
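# For example, with ratios = (0, 4, 18), as in several of the calls below:
# random.randint(0, ratios[2]) draws one of 19 equally likely values, so a batch
# lands in the first bucket (never deleted; if do_replace, replaced roughly halfway
# through the remaining timesteps) with probability 1/19, in the second bucket
# (deleted after timesteps[1] steps, replaced after timesteps[1]//2 steps) with
# probability 3/19, and in the third bucket (deleted after timesteps[2] steps,
# replaced after timesteps[2]//2 steps) with probability 15/19.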
def gen_exp_time_runbook(dataset_name, dataset_size, max_t, runbook_filename, ratios, timesteps, seed = 0, do_replace = False, gt_url = None, do_delete = True):
random.seed(seed)
data = {dataset_name: {}}

max_num_points=0
num_points=0

batch_size = dataset_size//max_t
to_delete=[[] for _ in range(max_t)]
to_replace=[[] for _ in range(max_t)]

t=1

for i in range(max_t):
if do_replace:
fraction = random.uniform(.5, .9)
else:
fraction = 1.0
start = i*batch_size
end = start + int(fraction*batch_size)
ids_start = end
ids_end = (i+1)*batch_size
tags_start = i*batch_size
tags_end = tags_start + (ids_end - ids_start)
replace_info = (tags_start, tags_end, ids_start, ids_end)
delete_info = (tags_start, end)
data[dataset_name][t]={
'operation': 'insert',
'start': i*(batch_size),
'end': end
}
t+=1

num_points+=int(fraction*batch_size)

max_num_points=max(max_num_points,num_points)


data_type = random.randint(0, ratios[2])
if do_delete:
if data_type <= ratios[0]:
pass
elif data_type > ratios[0] and data_type < ratios[1]:
if (i+timesteps[1] < max_t):
to_delete[i+timesteps[1]].append(delete_info)
else:
if (i+timesteps[2] < max_t):
to_delete[i+timesteps[2]].append(delete_info)

if do_replace:
if data_type <= ratios[0]:
remaining_steps = (max_t - t)//2
to_replace[i+remaining_steps].append(replace_info)
# with probability 1/19, the points get replaced at t_max-t/2 steps
elif data_type > ratios[0] and data_type < ratios[1]:
if (i + timesteps[1]//2 < max_t):
to_replace[i+timesteps[1]//2].append(replace_info)
# with probability 3/19, the points get replaced after 50 steps
else:
if (i + timesteps[2]//2 < max_t):
to_replace[i+timesteps[2]//2].append(replace_info)
# with probability 15/19, the points get replaced after 10 steps

for (start, end) in to_delete[i]:
data[dataset_name][t]={
'operation': 'delete',
'start': start,
'end': end
}
t+=1
num_points-=batch_size

for (tags_start, tags_end, ids_start, ids_end) in to_replace[i]:
data[dataset_name][t] ={
'operation' : 'replace',
'tags_start': tags_start,
'tags_end': tags_end,
'ids_start': ids_start,
'ids_end': ids_end
}
t += 1

data[dataset_name][t]={
'operation': 'search',
}
t+=1

data[dataset_name]["max_pts"]=max_num_points

if gt_url is not None:
data[dataset_name]["gt_url"] = gt_url

with open(runbook_filename, 'w') as outfile:
yaml.dump(data, outfile, default_flow_style=False)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 809
dataset_file = 'wikipedia-35M_expirationtime_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 35000000
max_t = 350
gt_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/wikipedia-35M_expirationtime_runbook.yaml"
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, gt_url)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 1232
dataset_file = 'wikipedia-1M_expiration_time_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gt_url = "https://comp21storage.z5.web.core.windows.net/wiki-cohere-35M/wikipedia-1M_expiration_time_runbook.yaml/"
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, gt_url)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 10001
dataset_file = 'wikipedia-35M_expiration_time_replace_only_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 8000000 #only use a prefix of the dataset
max_t = 80
gt_url = None
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, gt_url, False)

ratios = (0, 4, 18)
timesteps = (0, 100, 20)
seed = 754
dataset_file = 'wikipedia-1M_expiration_time_replace_only_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gt_url = None
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, gt_url, False)

ratios = (3, 8, 18)
timesteps = (0, 300, 50)
seed = 22
dataset_file = 'wikipedia-35M_expiration_time_replace_delete_runbook.yaml'
dataset_name = 'wikipedia-35M'
dataset_size = 35000000
max_t = 350
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, None)

ratios = (1, 8, 18)
timesteps = (0, 100, 20)
seed = 56
dataset_file = 'wikipedia-1M_expiration_time_replace_delete_runbook.yaml'
dataset_name = 'wikipedia-1M'
dataset_size = 1000000
max_t = 100
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, True, None)

ratios = (0, 6, 25)
timesteps = (0, 200, 50)
seed = 809
dataset_file = 'msmarco-100M_expirationtime_runbook.yaml'
dataset_name = 'msmarco-100M'
dataset_size = 101070374
max_t = 1000
gen_exp_time_runbook(dataset_name, dataset_size, max_t, dataset_file, ratios, timesteps, seed, False, None)
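
# Illustrative usage (command assumed, not part of the original script): once a
# generated YAML is placed under neurips23/runbooks/streaming/, it can be passed
# to run.py just like the runbooks listed in the README, e.g.
#   python3 run.py --neurips23track streaming --algorithm diskann \
#       --dataset wikipedia-35M \
#       --runbook_path neurips23/runbooks/streaming/wikipedia-35M_expirationtime_runbook.yaml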

File renamed without changes.
File renamed without changes.