
Commit

Merge pull request #22 from sparks-baird/ramseys_branch
fixing packing generation submitit
sgbaird authored Feb 28, 2023
2 parents 7e3095e + 2814265 commit 4e7d329
Showing 8 changed files with 215 additions and 55 deletions.
33 changes: 30 additions & 3 deletions notebooks/particle_packing/1.0-sgb-collect-from-mongodb.ipynb
@@ -24,7 +24,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 688693/688693 [02:41<00:00, 4271.45it/s]\n"
"100%|██████████| 688693/688693 [00:41<00:00, 16617.13it/s]\n"
]
}
],
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -56,7 +56,7 @@
" dtype='object')"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -67,6 +67,33 @@
"df.to_csv(\"../../data/external/particle_packing_sobol.csv\")\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"54.97133601808166"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"runtime\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
24 changes: 22 additions & 2 deletions notebooks/particle_packing/1.4-sgb-model-usage.ipynb
@@ -23,6 +23,26 @@
"fpath = \"../../models/particle_packing/surrogate_models.pkl\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pystow import ensure\n",
"key = \n",
"\n",
"class ParticlePackingSurrogate(object):\n",
" def __init__(self, fpath):\n",
" if fpath is None:\n",
" \n",
" self.models = models\n",
"\n",
"with open()\n",
"\n",
"def predict(mu1, mu2, mu3, std1, std2, std3, comp1, comp2, comp3, num_particles, safety_factor):"
]
},
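The cell above was committed as a work-in-progress stub: `key =` has no value, the `if fpath is None:` branch is empty, and `with open()` is unfinished. A minimal sketch of how the loader could be completed is shown below; the `pystow.ensure` URL, the model-dictionary layout, and the body of `predict` are assumptions for illustration, not part of the commit.

```python
import pickle

from pystow import ensure


class ParticlePackingSurrogate:
    """Load pickled surrogate models and predict packing fraction."""

    def __init__(self, fpath=None):
        if fpath is None:
            # hypothetical: fetch the published pickle on first use and cache it
            # under ~/.data/particle-packing/ (URL is a placeholder)
            fpath = ensure(
                "particle-packing", url="https://example.org/surrogate_models.pkl"
            )
        with open(fpath, "rb") as f:
            self.models = pickle.load(f)

    def predict(
        self, mu1, mu2, mu3, std1, std2, std3,
        comp1, comp2, comp3, num_particles, safety_factor,
    ):
        # hypothetical: the real feature order and model keys depend on how
        # surrogate_models.pkl was trained
        X = [[mu1, mu2, mu3, std1, std2, std3, comp1, comp2, comp3,
              num_particles, safety_factor]]
        return self.models["packing_fraction"].predict(X)
```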
{
"cell_type": "code",
"execution_count": 14,
@@ -63,7 +83,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "matsci-opt-benchmarks",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -82,7 +102,7 @@
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "01883adffc5ff99e80740fdb2688c7d7f1b5220f2274814f600fbe3b3887f376"
"hash": "e19409612ae03435658261429b42da85155baf08f8f23ce7d549f0541569b2a3"
}
}
},
Binary file modified reports/particle_packing/Datainbrief.docx
Binary file not shown.
177 changes: 142 additions & 35 deletions scripts/particle_packing/packing_generation_submitit.py
@@ -6,11 +6,15 @@
from time import time
from uuid import uuid4

import pandas as pd
import pymongo
import requests
import torch
from ax.modelbridge.factory import get_sobol
from ax.service.ax_client import AxClient
from my_secrets import MONGODB_API_KEY
from my_secrets import MONGODB_API_KEY, MONGODB_PASSWORD, MONGODB_USERNAME
from submitit import AutoExecutor
from tqdm import tqdm

from matsci_opt_benchmarks.particle_packing.utils.data import get_parameters
from matsci_opt_benchmarks.particle_packing.utils.packing_generation import evaluate
@@ -44,7 +48,7 @@
std_names_out,
orig_mean_names,
orig_std_names,
) = get_parameters(remove_composition_degeneracy=True, remove_scaling_degeneracy=True)
) = get_parameters(remove_composition_degeneracy=False, remove_scaling_degeneracy=False)

parameters.append({"name": "num_particles", "type": "range", "bounds": [100, 1000]})
parameters.append({"name": "safety_factor", "type": "range", "bounds": [1.0, 2.5]})
@@ -54,28 +58,119 @@
parameters=parameters,
objective_name="packing_fraction",
minimize=False,
parameter_constraints=["std1 <= std2", "comp1 + comp2 <= 1.0"],
parameter_constraints=["std1 <= std2", "std2 <= std3"],
)
search_space = ax_client.experiment.search_space
m = get_sobol(search_space, fallback_to_sample_polytope=True, seed=SEED)
gr = m.gen(n=num_samples)
param_df = gr.param_df.copy()
# param_df["num_particles"] = 1000
# https://github.com/facebook/Ax/issues/740
# https://github.com/facebook/Ax/issues/1439
fallback = False
m = get_sobol(search_space, fallback_to_sample_polytope=fallback, seed=SEED)
# https://github.com/facebook/Ax/issues/1439
torch.manual_seed(SEED)
# increase max_rs_draws if you get an error about too many random draws
# as a last resort, switch fallback to True (above)
max_rs_draws = 1000000
gr = m.gen(n=num_samples, model_gen_options={"max_rs_draws": max_rs_draws})
# gr.param_df not working https://github.com/facebook/Ax/issues/1437
# param_df = gr.param_df.copy()
param_df = pd.DataFrame([arm.parameters for arm in gr.arms])


app_name = "data-oeodi"
url = f"https://us-east-1.aws.data.mongodb-api.com/app/{app_name}/endpoint/data/v1/action/insertOne" # noqa: E501
collection_name = "sobol"
database_name = "particle-packing"
dataSource = "Cluster0"
cluster_uri = "cluster0.n03mvdg"

# to find this string, click connect to your MongoDB cluster on the website
# also needed to go to "Network Access", click "Add IP address", click "Allow access
# from anywhere", and add
client = pymongo.MongoClient(
f"mongodb+srv://{MONGODB_USERNAME}:{MONGODB_PASSWORD}@{cluster_uri}.mongodb.net/?retryWrites=true&w=majority" # noqa: E501
)
db = client[database_name]
collection = db[collection_name]

posts = collection.find({})
results = [post for post in tqdm(posts)]

parameter_names = [
"mu1",
"mu2",
"mu3",
"std1",
"std2",
"std3",
"comp1",
"comp2",
"comp3",
"num_particles",
"safety_factor",
]

if len(results) > 0:
mongo_df = pd.DataFrame(results)
mongo_param_df: pd.DataFrame = mongo_df[parameter_names]

# remove the entries that are already in the database, including repeats
# the repeats are necessary for the variance calculation
# this is sort of a setdiff

mongo_param_df = mongo_param_df.groupby(
mongo_param_df.columns.tolist(), as_index=False
).size() # type: ignore
mongo_param_df["group_id"] = (
mongo_param_df[parameter_names]
.round(6)
.apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)
param_df["group_id"] = (
param_df[parameter_names]
.round(6)
.apply(lambda row: "_".join(row.values.astype(str)), axis=1)
)

# pd.concat((param_df["group_id"], mongo_param_df["group_id"])).drop_duplicates()

param_df = param_df[~param_df["group_id"].isin(mongo_param_df["group_id"])]
param_df = param_df[parameter_names]

mongo_param_df = mongo_param_df[(num_repeats - mongo_param_df["size"]) > 0]

# repeat the rows of mongo_param_df based on the size column
# iterate through rows of mongo_param_df
sub_dfs = []
for index, row in mongo_param_df.iterrows():
sub_dfs.append(
pd.concat([row[parameter_names]] * row["size"], axis=1, ignore_index=True).T
)

if len(sub_dfs) > 0:
mongo_param_df = pd.concat(sub_dfs, axis=0, ignore_index=True)[parameter_names]
else:
mongo_param_df = pd.DataFrame(columns=parameter_names)

# repeat the rows of param_df num_repeats times
param_df = pd.concat([param_df] * num_repeats, ignore_index=True)

if len(results) > 0:
param_df = pd.concat([param_df, mongo_param_df]) # type: ignore

param_df["util_dir"] = path.join(
"src", "matsci_opt_benchmarks", "particle_packing", "utils"
)
param_df["data_dir"] = path.join("data", "interim", "particle_packing")
parameter_sets = param_df.to_dict(orient="records")
parameter_sets = parameter_sets * num_repeats
# parameter_sets = parameter_sets * num_repeats
shuffle(parameter_sets)

if dummy:
parameter_sets = parameter_sets[:10]
# parameter_sets = parameter_sets[:10]
batch_size = 5
else:
batch_size = 700

url = "https://data.mongodb-api.com/app/data-plyju/endpoint/data/v1/action/insertOne" # noqa: E501
batch_size = 1400


def mongodb_evaluate(parameter_set, verbose=False):
Expand All @@ -99,9 +194,9 @@ def mongodb_evaluate(parameter_set, verbose=False):

payload = json.dumps(
{
"collection": "sobol",
"database": "particle-packing",
"dataSource": "matsci-opt-benchmarks",
"collection": collection_name,
"database": database_name,
"dataSource": dataSource,
"document": results,
}
)
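The request that sends this payload sits in the collapsed part of the diff. For orientation, a sketch of how an insertOne payload is typically posted to the Data API endpoint built above, assuming the standard `api-key` header; the error handling is illustrative only.

```python
import requests

# hypothetical continuation of mongodb_evaluate (the actual call is in the
# collapsed portion of the file): POST the document to the Data API endpoint
headers = {
    "Content-Type": "application/json",
    "api-key": MONGODB_API_KEY,
}
response = requests.post(url, headers=headers, data=payload)
response.raise_for_status()  # fail loudly if the insert was rejected
```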
@@ -138,21 +233,49 @@ def chunks(lst, n):
# use `myallocation` command to see available account/partition combos
# account = "sparks"
# partition = "kingspeak"
account = "owner-guest"
partition = "kingspeak-guest"
# account = "owner-guest"
# partition = "kingspeak-guest"
account = "sparks"
partition = "notchpeak-shared-freecycle" # to allow for node sharing
executor = AutoExecutor(folder=log_folder)
executor.update_parameters(
timeout_min=walltime_min,
slurm_nodes=None,
slurm_partition=partition,
# slurm_cpus_per_task=1,
slurm_additional_parameters={"ntasks": 1, "account": account},
)

# mongodb_evaluate(parameter_sets[0], verbose=True)

# sbatch array
jobs = executor.map_array(mongodb_evaluate_batch, parameter_batch_sets)
# jobs = executor.map_array(mongodb_evaluate, parameter_sets)
print("Submitted jobs")

results = [job.result() for job in jobs]

1 + 1

# %% Code Graveyard
# import pymongo
# from urllib.parse import quote_plus
# password needs to be URL encoded
# client = pymongo.MongoClient(
# f"mongodb+srv://{USERNAME}:{quote_plus(PASSWORD)}@matsci-opt-benchmarks.ehu7qrh.mongodb.net/?retryWrites=true&w=majority"# noqa: E501
# )
# collection = client["particle-packing"]["sobol"]
# collection.insert_one(result)

# import cloudpickle as pickle

# param_df = param_df[~param_df.isin(mongo_param_df[parameter_names]).all(1)]
# param_df = param_df[~param_df.isin(mongo_param_df).all(1)]

# setdiff between two dataframes
# https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-

# param_df["num_particles"] = 1000


# job_ids = [job.job_id for job in jobs]
# # https://www.hpc2n.umu.se/documentation/batchsystem/job-dependencies
# job_ids_str = ":".join(job_ids) # e.g. "3937257_0:3937257_1:..."
@@ -179,19 +302,3 @@ def chunks(lst, n):
# print( f"Waiting for submission jobs ({job_ids_str}) to complete before running
# collector job ({collector_job.job_id}). Pickled results file saved to
# {slurm_savepath} after all jobs have run." )

results = [job.result() for job in jobs]

1 + 1

# %% Code Graveyard
# import pymongo
# from urllib.parse import quote_plus
# password needs to be URL encoded
# client = pymongo.MongoClient(
# f"mongodb+srv://{USERNAME}:{quote_plus(PASSWORD)}@matsci-opt-benchmarks.ehu7qrh.mongodb.net/?retryWrites=true&w=majority"# noqa: E501
# )
# collection = client["particle-packing"]["sobol"]
# collection.insert_one(result)

# import cloudpickle as pickle
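A central piece of the new script logic is the deduplication block above: parameter sets already in MongoDB (including deliberate repeats, which are kept for the variance estimate) are dropped from the fresh Sobol batch by comparing rounded, string-joined `group_id` keys, the "sort of a setdiff" the comment mentions. A toy illustration of the same idea with made-up two-parameter data:

```python
import pandas as pd


def make_group_id(df, cols, ndigits=6):
    """Join rounded parameter values into one comparable key per row."""
    return df[cols].round(ndigits).apply(
        lambda row: "_".join(row.values.astype(str)), axis=1
    )


cols = ["mu1", "std1"]  # toy parameter names
planned = pd.DataFrame({"mu1": [1.0, 2.0, 3.0], "std1": [0.1, 0.2, 0.3]})
already_run = pd.DataFrame({"mu1": [2.0], "std1": [0.2]})

planned["group_id"] = make_group_id(planned, cols)
already_run["group_id"] = make_group_id(already_run, cols)

# keep only the parameter sets that are not yet in the database
todo = planned[~planned["group_id"].isin(already_run["group_id"])][cols]
print(todo)  # rows (1.0, 0.1) and (3.0, 0.3) remain
```

For the per-row repetition step, `df.loc[df.index.repeat(counts)]` is a common pandas alternative to concatenating each row in a loop.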
1 change: 1 addition & 0 deletions setup.cfg
@@ -58,6 +58,7 @@ install_requires =
matbench
torch
zenodo-client
ray[tune]
kaleido


3 changes: 2 additions & 1 deletion src/matsci_opt_benchmarks/particle_packing/core.py
@@ -4,9 +4,10 @@

import ray
import torch
from boppf.utils.ax import optimize_ppf
from psutil import cpu_count

from matsci_opt_benchmarks.particle_packing.utils.ax import optimize_ppf


class BOPPF:
def __init__(