diff --git a/notebooks/particle_packing/1.0-sgb-collect-from-mongodb.ipynb b/notebooks/particle_packing/1.0-sgb-collect-from-mongodb.ipynb index a88f229..a5949c7 100644 --- a/notebooks/particle_packing/1.0-sgb-collect-from-mongodb.ipynb +++ b/notebooks/particle_packing/1.0-sgb-collect-from-mongodb.ipynb @@ -24,7 +24,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 688693/688693 [02:41<00:00, 4271.45it/s]\n" + "100%|██████████| 688693/688693 [00:41<00:00, 16617.13it/s]\n" ] } ], @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -56,7 +56,7 @@ " dtype='object')" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -67,6 +67,33 @@ "df.to_csv(\"../../data/external/particle_packing_sobol.csv\")\n", "df.columns" ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "54.97133601808166" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"runtime\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/particle_packing/1.4-sgb-model-usage.ipynb b/notebooks/particle_packing/1.4-sgb-model-usage.ipynb index 67fe0c8..1480932 100644 --- a/notebooks/particle_packing/1.4-sgb-model-usage.ipynb +++ b/notebooks/particle_packing/1.4-sgb-model-usage.ipynb @@ -23,6 +23,26 @@ "fpath = \"../../models/particle_packing/surrogate_models.pkl\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pystow import ensure\n", + "key = \n", + "\n", + "class ParticlePackingSurrogate(object):\n", + " def __init__(self, fpath):\n", + " if fpath is None:\n", + " \n", + " self.models = models\n", + "\n", + "with open()\n", + "\n", + "def predict(mu1, mu2, mu3, std1, std2, std3, comp1, comp2, comp3, num_particles, safety_factor):" + ] + }, { "cell_type": "code", "execution_count": 14, @@ -63,7 +83,7 @@ ], "metadata": { "kernelspec": { - "display_name": "matsci-opt-benchmarks", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -82,7 +102,7 @@ "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "01883adffc5ff99e80740fdb2688c7d7f1b5220f2274814f600fbe3b3887f376" + "hash": "e19409612ae03435658261429b42da85155baf08f8f23ce7d549f0541569b2a3" } } }, diff --git a/reports/particle_packing/Datainbrief.docx b/reports/particle_packing/Datainbrief.docx index 19909e6..3bc6b25 100644 Binary files a/reports/particle_packing/Datainbrief.docx and b/reports/particle_packing/Datainbrief.docx differ diff --git a/scripts/particle_packing/packing_generation_submitit.py b/scripts/particle_packing/packing_generation_submitit.py index 28691bd..5076d43 100644 --- a/scripts/particle_packing/packing_generation_submitit.py +++ b/scripts/particle_packing/packing_generation_submitit.py @@ -6,11 +6,15 @@ from time import time from uuid import uuid4 +import pandas as pd +import pymongo import requests +import torch from ax.modelbridge.factory import get_sobol from ax.service.ax_client import AxClient -from my_secrets import MONGODB_API_KEY +from my_secrets import MONGODB_API_KEY, MONGODB_PASSWORD, MONGODB_USERNAME from submitit import AutoExecutor +from tqdm import tqdm from matsci_opt_benchmarks.particle_packing.utils.data import get_parameters from 
matsci_opt_benchmarks.particle_packing.utils.packing_generation import evaluate @@ -44,7 +48,7 @@ std_names_out, orig_mean_names, orig_std_names, -) = get_parameters(remove_composition_degeneracy=True, remove_scaling_degeneracy=True) +) = get_parameters(remove_composition_degeneracy=False, remove_scaling_degeneracy=False) parameters.append({"name": "num_particles", "type": "range", "bounds": [100, 1000]}) parameters.append({"name": "safety_factor", "type": "range", "bounds": [1.0, 2.5]}) @@ -54,28 +58,119 @@ parameters=parameters, objective_name="packing_fraction", minimize=False, - parameter_constraints=["std1 <= std2", "comp1 + comp2 <= 1.0"], + parameter_constraints=["std1 <= std2", "std2 <= std3"], ) search_space = ax_client.experiment.search_space -m = get_sobol(search_space, fallback_to_sample_polytope=True, seed=SEED) -gr = m.gen(n=num_samples) -param_df = gr.param_df.copy() -# param_df["num_particles"] = 1000 +# https://github.com/facebook/Ax/issues/740 +# https://github.com/facebook/Ax/issues/1439 +fallback = False +m = get_sobol(search_space, fallback_to_sample_polytope=fallback, seed=SEED) +# https://github.com/facebook/Ax/issues/1439 +torch.manual_seed(SEED) +# increase max_rs_draws if you get an error about too many random draws +# as a last resort, switch fallback to True (above) +max_rs_draws = 1000000 +gr = m.gen(n=num_samples, model_gen_options={"max_rs_draws": max_rs_draws}) +# gr.param_df not working https://github.com/facebook/Ax/issues/1437 +# param_df = gr.param_df.copy() +param_df = pd.DataFrame([arm.parameters for arm in gr.arms]) + + +app_name = "data-oeodi" +url = f"https://us-east-1.aws.data.mongodb-api.com/app/{app_name}/endpoint/data/v1/action/insertOne" # noqa: E501 +# noqa: E501 +collection_name = "sobol" +database_name = "particle-packing" +dataSource = "Cluster0" +cluster_uri = "cluster0.n03mvdg" + +# to find this string, click connect to your MongoDB cluster on the website +# also needed to go to "Network Access", click "Add IP address", click "Allow access +# from anywhere", and add +client = pymongo.MongoClient( + f"mongodb+srv://{MONGODB_USERNAME}:{MONGODB_PASSWORD}@{cluster_uri}.mongodb.net/?retryWrites=true&w=majority" # noqa: E501 +) +db = client[database_name] +collection = db[collection_name] + +posts = collection.find({}) +results = [post for post in tqdm(posts)] + +parameter_names = [ + "mu1", + "mu2", + "mu3", + "std1", + "std2", + "std3", + "comp1", + "comp2", + "comp3", + "num_particles", + "safety_factor", +] + +if len(results) > 0: + mongo_df = pd.DataFrame(results) + mongo_param_df: pd.DataFrame = mongo_df[parameter_names] + + # remove the entries that are already in the database, including repeats + # the repeats are necessary for the variance calculation + # this is sort of a setdiff + + mongo_param_df = mongo_param_df.groupby( + mongo_param_df.columns.tolist(), as_index=False + ).size() # type: ignore + mongo_param_df["group_id"] = ( + mongo_param_df[parameter_names] + .round(6) + .apply(lambda row: "_".join(row.values.astype(str)), axis=1) + ) + param_df["group_id"] = ( + param_df[parameter_names] + .round(6) + .apply(lambda row: "_".join(row.values.astype(str)), axis=1) + ) + + # pd.concat((param_df["group_id"], mongo_param_df["group_id"])).drop_duplicates() + + param_df = param_df[~param_df["group_id"].isin(mongo_param_df["group_id"])] + param_df = param_df[parameter_names] + + mongo_param_df = mongo_param_df[(num_repeats - mongo_param_df["size"]) > 0] + + # repeat the rows of mongo_param_df based on the size column + # iterate 
through rows of mongo_param_df + sub_dfs = [] + for index, row in mongo_param_df.iterrows(): + sub_dfs.append( + pd.concat([row[parameter_names]] * row["size"], axis=1, ignore_index=True).T + ) + + if len(sub_dfs) > 0: + mongo_param_df = pd.concat(sub_dfs, axis=0, ignore_index=True)[parameter_names] + else: + mongo_param_df = pd.DataFrame(columns=parameter_names) + +# repeat the rows of param_df num_repeats times +param_df = pd.concat([param_df] * num_repeats, ignore_index=True) + +if len(results) > 0: + param_df = pd.concat([param_df, mongo_param_df]) # type: ignore + param_df["util_dir"] = path.join( "src", "matsci_opt_benchmarks", "particle_packing", "utils" ) param_df["data_dir"] = path.join("data", "interim", "particle_packing") parameter_sets = param_df.to_dict(orient="records") -parameter_sets = parameter_sets * num_repeats +# parameter_sets = parameter_sets * num_repeats shuffle(parameter_sets) if dummy: - parameter_sets = parameter_sets[:10] + # parameter_sets = parameter_sets[:10] batch_size = 5 else: - batch_size = 700 - -url = "https://data.mongodb-api.com/app/data-plyju/endpoint/data/v1/action/insertOne" # noqa: E501 + batch_size = 1400 def mongodb_evaluate(parameter_set, verbose=False): @@ -99,9 +194,9 @@ def mongodb_evaluate(parameter_set, verbose=False): payload = json.dumps( { - "collection": "sobol", - "database": "particle-packing", - "dataSource": "matsci-opt-benchmarks", + "collection": collection_name, + "database": database_name, + "dataSource": dataSource, "document": results, } ) @@ -138,21 +233,49 @@ def chunks(lst, n): # use `myallocation` command to see available account/partition combos # account = "sparks" # partition = "kingspeak" -account = "owner-guest" -partition = "kingspeak-guest" +# account = "owner-guest" +# partition = "kingspeak-guest" +account = "sparks" +partition = "notchpeak-shared-freecycle" # to allow for node sharing executor = AutoExecutor(folder=log_folder) executor.update_parameters( timeout_min=walltime_min, - slurm_nodes=None, slurm_partition=partition, - # slurm_cpus_per_task=1, slurm_additional_parameters={"ntasks": 1, "account": account}, ) +# mongodb_evaluate(parameter_sets[0], verbose=True) + # sbatch array jobs = executor.map_array(mongodb_evaluate_batch, parameter_batch_sets) # jobs = executor.map_array(mongodb_evaluate, parameter_sets) print("Submitted jobs") + +results = [job.result() for job in jobs] + +1 + 1 + +# %% Code Graveyard +# import pymongo +# from urllib.parse import quote_plus +# password needs to be URL encoded +# client = pymongo.MongoClient( +# f"mongodb+srv://{USERNAME}:{quote_plus(PASSWORD)}@matsci-opt-benchmarks.ehu7qrh.mongodb.net/?retryWrites=true&w=majority"# noqa: E501 +# ) +# collection = client["particle-packing"]["sobol"] +# collection.insert_one(result) + +# import cloudpickle as pickle + +# param_df = param_df[~param_df.isin(mongo_param_df[parameter_names]).all(1)] +# param_df = param_df[~param_df.isin(mongo_param_df).all(1)] + +# setdiff between two dataframes +# https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe- + +# param_df["num_particles"] = 1000 + + # job_ids = [job.job_id for job in jobs] # # https://www.hpc2n.umu.se/documentation/batchsystem/job-dependencies # job_ids_str = ":".join(job_ids) # e.g. "3937257_0:3937257_1:..." @@ -179,19 +302,3 @@ def chunks(lst, n): # print( f"Waiting for submission jobs ({job_ids_str}) to complete before running # collector job ({collector_job.job_id}). Pickled results file saved to # {slurm_savepath} after all jobs have run." 
)
-
-results = [job.result() for job in jobs]
-
-1 + 1
-
-# %% Code Graveyard
-# import pymongo
-# from urllib.parse import quote_plus
-# password needs to be URL encoded
-# client = pymongo.MongoClient(
-# f"mongodb+srv://{USERNAME}:{quote_plus(PASSWORD)}@matsci-opt-benchmarks.ehu7qrh.mongodb.net/?retryWrites=true&w=majority"# noqa: E501
-# )
-# collection = client["particle-packing"]["sobol"]
-# collection.insert_one(result)
-
-# import cloudpickle as pickle
diff --git a/setup.cfg b/setup.cfg
index cafeb86..d7913e4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -58,6 +58,7 @@ install_requires =
     matbench
     torch
     zenodo-client
+    ray[tune]
     kaleido
diff --git a/src/matsci_opt_benchmarks/particle_packing/core.py b/src/matsci_opt_benchmarks/particle_packing/core.py
index 3a0f1d0..485f98b 100644
--- a/src/matsci_opt_benchmarks/particle_packing/core.py
+++ b/src/matsci_opt_benchmarks/particle_packing/core.py
@@ -4,9 +4,10 @@
 
 import ray
 import torch
-from boppf.utils.ax import optimize_ppf
 from psutil import cpu_count
 
+from matsci_opt_benchmarks.particle_packing.utils.ax import optimize_ppf
+
 
 class BOPPF:
     def __init__(
diff --git a/src/matsci_opt_benchmarks/particle_packing/utils/ax.py b/src/matsci_opt_benchmarks/particle_packing/utils/ax.py
index a5f7ef3..f6db1fe 100644
--- a/src/matsci_opt_benchmarks/particle_packing/utils/ax.py
+++ b/src/matsci_opt_benchmarks/particle_packing/utils/ax.py
@@ -12,7 +12,15 @@
 from ax.modelbridge.registry import Models
 from ax.service.ax_client import AxClient
 from ax.service.utils.instantiation import ObjectiveProperties
-from boppf.utils.data import (
+from botorch.acquisition import qExpectedImprovement
+from psutil import cpu_count
+from ray import tune
+from ray.tune import report
+from ray.tune.suggest.ax import AxSearch
+from sklearn.preprocessing import normalize
+from tqdm import tqdm
+
+from matsci_opt_benchmarks.particle_packing.utils.data import (
     MU3,
     SPLIT,
     default_frac_bnd,
@@ -22,14 +30,9 @@
     get_parameters,
     target_name,
 )
-from boppf.utils.particle_packing import particle_packing_simulation
-from botorch.acquisition import qExpectedImprovement
-from psutil import cpu_count
-from ray import tune
-from ray.tune import report
-from ray.tune.suggest.ax import AxSearch
-from sklearn.preprocessing import normalize
-from tqdm import tqdm
+from matsci_opt_benchmarks.particle_packing.utils.particle_packing import (
+    particle_packing_simulation,
+)
 
 logger = logging.getLogger(tune.__name__)
 logger.setLevel(
diff --git a/src/matsci_opt_benchmarks/particle_packing/utils/packing_generation.py b/src/matsci_opt_benchmarks/particle_packing/utils/packing_generation.py
index c5ced82..8d25f8b 100644
--- a/src/matsci_opt_benchmarks/particle_packing/utils/packing_generation.py
+++ b/src/matsci_opt_benchmarks/particle_packing/utils/packing_generation.py
@@ -212,13 +212,14 @@ def read_packing_fraction(data_dir, uid, packing_xyzd_fpath, box_length, final=F
 
 def evaluate(parameters):
-    mu3 = 3.0
+    # mu3 = 3.0
     # print("current working directory: ", os.getcwd())
-    means = [parameters[name] * mu3 for name in ["mu1_div_mu3", "mu2_div_mu3"]]
-    means.append(mu3)
+    # means = [parameters[name] * mu3 for name in ["mu1_div_mu3", "mu2_div_mu3"]]
+    # means.append(mu3)
+    means = [parameters[name] for name in ["mu1", "mu2", "mu3"]]
     stds = [parameters[name] for name in ["std1", "std2", "std3"]]
-    comps = [parameters[name] for name in ["comp1", "comp2"]]
-    comps.append(1 - sum(comps))
+    comps = [parameters[name] for name in ["comp1", "comp2", "comp3"]]
+    # comps.append(1 - sum(comps))
 
     num_particles = parameters["num_particles"]
     safety_factor = parameters.get("safety_factor", 2.0)
     util_dir = parameters.get("util_dir", ".")
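
Note on the de-duplication block in packing_generation_submitit.py: it is the least obvious part of the patch, so the sketch below isolates the rounded "group ID" setdiff idea with the same column names, separate from the MongoDB query and repeat-count bookkeeping. The helper names make_group_id and drop_already_submitted are illustrative only; this is a simplified reading of the logic in the script, not a drop-in replacement for it.

import pandas as pd

parameter_names = [
    "mu1", "mu2", "mu3", "std1", "std2", "std3",
    "comp1", "comp2", "comp3", "num_particles", "safety_factor",
]


def make_group_id(df: pd.DataFrame) -> pd.Series:
    # round to 6 decimals so floating-point noise doesn't defeat the string match
    return (
        df[parameter_names]
        .round(6)
        .apply(lambda row: "_".join(row.values.astype(str)), axis=1)
    )


def drop_already_submitted(param_df: pd.DataFrame, mongo_param_df: pd.DataFrame) -> pd.DataFrame:
    # keep only candidate rows whose group ID is not already logged in MongoDB
    seen = set(make_group_id(mongo_param_df))
    return param_df[~make_group_id(param_df).isin(seen)][parameter_names]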
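
Note on result upload: the script now writes each simulation result to MongoDB through the Atlas Data API rather than opening a driver connection from every compute node. A minimal sketch of that call is below; the endpoint, collection, database, and dataSource values come from the patch, while the insert_result name and the Content-Type/api-key headers are assumptions based on standard Data API usage and are not shown in the hunks above.

import json

import requests

from my_secrets import MONGODB_API_KEY

app_name = "data-oeodi"
url = f"https://us-east-1.aws.data.mongodb-api.com/app/{app_name}/endpoint/data/v1/action/insertOne"  # noqa: E501


def insert_result(document: dict) -> requests.Response:
    # one JSON document per simulation; mirrors the payload built in mongodb_evaluate()
    payload = json.dumps(
        {
            "collection": "sobol",
            "database": "particle-packing",
            "dataSource": "Cluster0",
            "document": document,
        }
    )
    headers = {"Content-Type": "application/json", "api-key": MONGODB_API_KEY}
    return requests.post(url, headers=headers, data=payload)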
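
Note on the new cell in 1.4-sgb-model-usage.ipynb: it is committed as an unfinished skeleton (key =, with open(), an empty predict). One way it might eventually be completed is sketched below, assuming the pickle at fpath stores a dict of fitted models with a scikit-learn-style predict; the class layout, the DataFrame-based predict signature, and the dict-of-models assumption are all guesses, and the pystow branch is left out because the download key is not given in the patch.

import pickle

import pandas as pd


class ParticlePackingSurrogate:
    def __init__(self, fpath="../../models/particle_packing/surrogate_models.pkl"):
        # assumes the pickle holds a dict mapping output names to fitted models
        with open(fpath, "rb") as f:
            self.models = pickle.load(f)

    def predict(self, mu1, mu2, mu3, std1, std2, std3,
                comp1, comp2, comp3, num_particles, safety_factor):
        # assemble the 11 inputs in the same order used by the Sobol script
        X = pd.DataFrame(
            [
                {
                    "mu1": mu1, "mu2": mu2, "mu3": mu3,
                    "std1": std1, "std2": std2, "std3": std3,
                    "comp1": comp1, "comp2": comp2, "comp3": comp3,
                    "num_particles": num_particles,
                    "safety_factor": safety_factor,
                }
            ]
        )
        # one prediction per surrogate output (e.g., packing fraction and runtime)
        return {name: model.predict(X) for name, model in self.models.items()}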