From 3501db4ec27bf34367eeeb14cfe953b52bbef50c Mon Sep 17 00:00:00 2001 From: Xinzijian Liu Date: Tue, 10 Sep 2024 15:23:48 +0800 Subject: [PATCH] Add HDF5 support for trajs and model_devis (#259) ## Summary by CodeRabbit - **New Features** - Introduced new optional arguments for improved data handling and multitasking capabilities. - Added support for HDF5 formatted data in various modules. - Enhanced flexibility in input handling for multiple data formats. - **Bug Fixes** - Improved robustness in handling validation data structures. - **Documentation** - Updated documentation to clarify new parameters and their intended use. --------- Signed-off-by: zjgemi Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- dpgen2/entrypoint/args.py | 8 +++ dpgen2/entrypoint/submit.py | 5 +- dpgen2/exploration/render/traj_render.py | 7 ++- .../exploration/render/traj_render_lammps.py | 21 +++++-- .../convergence_check_stage_scheduler.py | 6 +- dpgen2/exploration/scheduler/scheduler.py | 8 ++- .../exploration/scheduler/stage_scheduler.py | 11 ++-- dpgen2/exploration/selector/conf_selector.py | 8 ++- .../selector/conf_selector_frame.py | 8 ++- dpgen2/flow/dpgen_loop.py | 4 +- dpgen2/op/__init__.py | 1 + dpgen2/op/run_relax.py | 60 +++++++++++++------ dpgen2/op/select_confs.py | 6 +- pyproject.toml | 2 +- 14 files changed, 116 insertions(+), 39 deletions(-) diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py index 4ba69ea2..df11ff7f 100644 --- a/dpgen2/entrypoint/args.py +++ b/dpgen2/entrypoint/args.py @@ -359,6 +359,7 @@ def run_diffcsp_args(): doc_gen_tasks = "Number of DiffCSP generation tasks" doc_gen_command = "Command for DiffCSP generation" doc_relax_group_size = "Group size for relaxation" + doc_use_hdf5 = "Use HDF5 to store trajs and model_devis" return [ Argument( "gen_tasks", @@ -380,6 +381,13 @@ def run_diffcsp_args(): default=100, doc=doc_relax_group_size, ), + Argument( + "use_hdf5", + bool, + optional=True, + default=False, + doc=doc_use_hdf5, + ), ] diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py index a6911ef0..76f944c6 100644 --- a/dpgen2/entrypoint/submit.py +++ b/dpgen2/entrypoint/submit.py @@ -111,6 +111,7 @@ RunDPTrain, RunLmp, RunRelax, + RunRelaxHDF5, SelectConfs, ) from dpgen2.op.caly_evo_step_merge import ( @@ -167,6 +168,7 @@ def make_concurrent_learning_op( upload_python_packages: Optional[List[os.PathLike]] = None, valid_data: Optional[S3Artifact] = None, train_optional_files: Optional[List[str]] = None, + explore_config: Optional[dict] = None, ): if train_style in ("dp", "dp-dist"): prep_run_train_op = PrepRunDPTrain( @@ -234,7 +236,7 @@ def make_concurrent_learning_op( "prep-run-diffcsp", DiffCSPGen, PrepRelax, - RunRelax, + RunRelaxHDF5 if explore_config["use_hdf5"] else RunRelax, # type: ignore prep_config=prep_explore_config, run_config=run_explore_config, upload_python_packages=upload_python_packages, @@ -552,6 +554,7 @@ def workflow_concurrent_learning( upload_python_packages=upload_python_packages, valid_data=valid_data, train_optional_files=train_optional_files, + explore_config=explore_config, ) scheduler = make_naive_exploration_scheduler(config) diff --git a/dpgen2/exploration/render/traj_render.py b/dpgen2/exploration/render/traj_render.py index eb7296b6..5c9f0c41 100644 --- a/dpgen2/exploration/render/traj_render.py +++ b/dpgen2/exploration/render/traj_render.py @@ -15,6 +15,9 @@ import dpdata import numpy as np +from dflow.python.opio import ( + HDF5Dataset, +) from ..deviation import ( DeviManager, @@ -30,7 +33,7 @@ class TrajRender(ABC): @abstractmethod def get_model_devi( self, - files: List[Path], + files: Union[List[Path], List[HDF5Dataset]], ) -> DeviManager: r"""Get model deviations from recording files. @@ -48,7 +51,7 @@ def get_model_devi( @abstractmethod def get_confs( self, - traj: List[Path], + traj: Union[List[Path], List[HDF5Dataset]], id_selected: List[List[int]], type_map: Optional[List[str]] = None, conf_filters: Optional["ConfFilters"] = None, diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py index d51ec040..28eb07f6 100644 --- a/dpgen2/exploration/render/traj_render_lammps.py +++ b/dpgen2/exploration/render/traj_render_lammps.py @@ -1,4 +1,7 @@ import json +from io import ( + StringIO, +) from pathlib import ( Path, ) @@ -12,6 +15,9 @@ import dpdata import numpy as np +from dflow.python.opio import ( + HDF5Dataset, +) from dpgen2.utils import ( setup_ele_temp, @@ -42,7 +48,7 @@ def __init__( def get_model_devi( self, - files: List[Path], + files: Union[List[Path], List[HDF5Dataset]], ) -> DeviManager: ntraj = len(files) @@ -53,7 +59,10 @@ def get_model_devi( return model_devi def _load_one_model_devi(self, fname, model_devi): - dd = np.loadtxt(fname) + if isinstance(fname, HDF5Dataset): + dd = fname.get_data() + else: + dd = np.loadtxt(fname) if len(np.shape(dd)) == 1: # In case model-devi.out is 1-dimensional dd = dd.reshape((1, len(dd))) @@ -92,7 +101,7 @@ def set_ele_temp(self, system, ele_temp): def get_confs( self, - trajs: List[Path], + trajs: Union[List[Path], List[HDF5Dataset]], id_selected: List[List[int]], type_map: Optional[List[str]] = None, conf_filters: Optional["ConfFilters"] = None, @@ -108,7 +117,11 @@ def get_confs( ms = dpdata.MultiSystems(type_map=type_map) for ii in range(ntraj): if len(id_selected[ii]) > 0: - ss = dpdata.System(trajs[ii], fmt=traj_fmt, type_map=type_map) + if isinstance(trajs[ii], HDF5Dataset): + traj = StringIO(trajs[ii].get_data()) # type: ignore + else: + traj = trajs[ii] + ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map) ss.nopbc = self.nopbc if ele_temp: self.set_ele_temp(ss, ele_temp[ii]) diff --git a/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py b/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py index f19442f3..8ab8662f 100644 --- a/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py +++ b/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py @@ -5,11 +5,15 @@ List, Optional, Tuple, + Union, ) from dflow.python import ( FatalError, ) +from dflow.python.opio import ( + HDF5Dataset, +) from dpgen2.exploration.report import ( ExplorationReport, @@ -67,7 +71,7 @@ def reached_max_iteration(self): def plan_next_iteration( self, report: Optional[ExplorationReport] = None, - trajs: Optional[List[Path]] = None, + trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None, ) -> Tuple[bool, Optional[BaseExplorationTaskGroup], Optional[ConfSelector]]: if self.complete(): raise FatalError("Cannot plan because the stage has completed.") diff --git a/dpgen2/exploration/scheduler/scheduler.py b/dpgen2/exploration/scheduler/scheduler.py index b895ca40..ff55fa23 100644 --- a/dpgen2/exploration/scheduler/scheduler.py +++ b/dpgen2/exploration/scheduler/scheduler.py @@ -5,12 +5,16 @@ List, Optional, Tuple, + Union, ) import numpy as np from dflow.python import ( FatalError, ) +from dflow.python.opio import ( + HDF5Dataset, +) from dpgen2.exploration.report import ( ExplorationReport, @@ -110,7 +114,7 @@ def force_stage_complete(self): def plan_next_iteration( self, report: Optional[ExplorationReport] = None, - trajs: Optional[List[Path]] = None, + trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None, ) -> Tuple[bool, Optional[ExplorationTaskGroup], Optional[ConfSelector]]: """ Make the plan for the next DPGEN iteration. @@ -119,7 +123,7 @@ def plan_next_iteration( ---------- report : ExplorationReport The exploration report of this iteration. - trajs : List[Path] + trajs : Union[List[Path], List[HDF5Dataset]] A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration. Returns diff --git a/dpgen2/exploration/scheduler/stage_scheduler.py b/dpgen2/exploration/scheduler/stage_scheduler.py index 9ad04d09..18fe5593 100644 --- a/dpgen2/exploration/scheduler/stage_scheduler.py +++ b/dpgen2/exploration/scheduler/stage_scheduler.py @@ -8,6 +8,11 @@ from typing import ( List, Tuple, + Union, +) + +from dflow.python.opio import ( + HDF5Dataset, ) from dpgen2.exploration.report import ( @@ -87,7 +92,7 @@ def get_reports(self) -> List[ExplorationReport]: def plan_next_iteration( self, report: ExplorationReport, - trajs: List[Path], + trajs: Union[List[Path], List[HDF5Dataset]], ) -> Tuple[bool, ExplorationTaskGroup, ConfSelector]: """ Make the plan for the next iteration of the stage. @@ -96,11 +101,9 @@ def plan_next_iteration( Parameters ---------- - hist_reports : List[ExplorationReport] - The historical exploration report of the stage. If this is the first iteration of the stage, this list is empty. report : ExplorationReport The exploration report of this iteration. - confs : List[Path] + trajs : Union[List[Path], List[HDF5Dataset]] A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration. Returns diff --git a/dpgen2/exploration/selector/conf_selector.py b/dpgen2/exploration/selector/conf_selector.py index df00afd3..f24a7d31 100644 --- a/dpgen2/exploration/selector/conf_selector.py +++ b/dpgen2/exploration/selector/conf_selector.py @@ -10,9 +10,13 @@ Optional, Set, Tuple, + Union, ) import dpdata +from dflow.python.opio import ( + HDF5Dataset, +) from dpgen2.exploration.report import ( ExplorationReport, @@ -29,8 +33,8 @@ class ConfSelector(ABC): @abstractmethod def select( self, - trajs: List[Path], - model_devis: List[Path], + trajs: Union[List[Path], List[HDF5Dataset]], + model_devis: Union[List[Path], List[HDF5Dataset]], type_map: Optional[List[str]] = None, optional_outputs: Optional[List[Path]] = None, ) -> Tuple[List[Path], ExplorationReport]: diff --git a/dpgen2/exploration/selector/conf_selector_frame.py b/dpgen2/exploration/selector/conf_selector_frame.py index 74eee689..fc116f88 100644 --- a/dpgen2/exploration/selector/conf_selector_frame.py +++ b/dpgen2/exploration/selector/conf_selector_frame.py @@ -9,10 +9,14 @@ List, Optional, Tuple, + Union, ) import dpdata import numpy as np +from dflow.python.opio import ( + HDF5Dataset, +) from dpgen2.exploration.render import ( TrajRender, @@ -52,8 +56,8 @@ def __init__( def select( self, - trajs: List[Path], - model_devis: List[Path], + trajs: Union[List[Path], List[HDF5Dataset]], + model_devis: Union[List[Path], List[HDF5Dataset]], type_map: Optional[List[str]] = None, optional_outputs: Optional[List[Path]] = None, ) -> Tuple[List[Path], ExplorationReport]: diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py index 0ff95abb..190a1090 100644 --- a/dpgen2/flow/dpgen_loop.py +++ b/dpgen2/flow/dpgen_loop.py @@ -9,6 +9,7 @@ from typing import ( List, Optional, + Union, ) import jsonpickle @@ -35,6 +36,7 @@ OPIO, Artifact, BigParameter, + HDF5Datasets, OPIOSign, PythonOPTemplate, Slices, @@ -91,7 +93,7 @@ def get_input_sign(cls): { "exploration_scheduler": BigParameter(ExplorationScheduler), "exploration_report": BigParameter(ExplorationReport), - "trajs": Artifact(List[Path]), + "trajs": Artifact(Union[List[Path], HDF5Datasets]), } ) diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py index a0b0b1a8..c79c3946 100644 --- a/dpgen2/op/__init__.py +++ b/dpgen2/op/__init__.py @@ -39,6 +39,7 @@ ) from .run_relax import ( RunRelax, + RunRelaxHDF5, ) from .select_confs import ( SelectConfs, diff --git a/dpgen2/op/run_relax.py b/dpgen2/op/run_relax.py index 8876eb14..672275d8 100644 --- a/dpgen2/op/run_relax.py +++ b/dpgen2/op/run_relax.py @@ -15,6 +15,7 @@ OPIO, Artifact, BigParameter, + HDF5Datasets, OPIOSign, ) @@ -54,6 +55,31 @@ def get_output_sign(cls): } ) + def write_traj(self, dump_str, traj_file): + traj_file.write_text(dump_str) + return traj_file + + def write_model_devi(self, devi, model_devi_file): + import numpy as np + + header = "%10s%19s%19s%19s%19s%19s%19s" % ( + "step", + "max_devi_v", + "min_devi_v", + "avg_devi_v", + "max_devi_f", + "min_devi_f", + "avg_devi_f", + ) + np.savetxt( + model_devi_file, + devi, + fmt=["%12d"] + ["%19.6e"] * 6, + delimiter="", + header=header, + ) + return model_devi_file + @OP.exec_sign_check def execute( self, @@ -168,29 +194,14 @@ def execute( forces_list[j] = forces virial_list[j] = virial / len(atype) traj_file = ip["task_path"] / ("traj.%s.dump" % fname) - traj_file.write_text(dump_str) + traj_file = self.write_traj(dump_str, traj_file) trajs.append(traj_file) devi = [np.array(step_list)] devi += list(calc_model_devi_v(np.array(virial_list))) devi += list(calc_model_devi_f(np.array(forces_list))) devi = np.vstack(devi).T - header = "%10s%19s%19s%19s%19s%19s%19s" % ( - "step", - "max_devi_v", - "min_devi_v", - "avg_devi_v", - "max_devi_f", - "min_devi_f", - "avg_devi_f", - ) model_devi_file = ip["task_path"] / ("model_devi.%s.out" % fname) - np.savetxt( - model_devi_file, - devi, - fmt=["%12d"] + ["%19.6e"] * 6, - delimiter="", - header=header, - ) + model_devi_file = self.write_model_devi(devi, model_devi_file) model_devis.append(model_devi_file) return OPIO( { @@ -215,3 +226,18 @@ def normalize_config(data={}): data = base.normalize_value(data, trim_pattern="_*") base.check_value(data, strict=False) return data + + +class RunRelaxHDF5(RunRelax): + @classmethod + def get_output_sign(cls): + output_sign = super().get_output_sign() + output_sign["trajs"] = Artifact(HDF5Datasets) + output_sign["model_devis"] = Artifact(HDF5Datasets) + return output_sign + + def write_traj(self, dump_str, traj_file): + return dump_str + + def write_model_devi(self, devi, model_devi_file): + return devi diff --git a/dpgen2/op/select_confs.py b/dpgen2/op/select_confs.py index fea0bf59..cddcca55 100644 --- a/dpgen2/op/select_confs.py +++ b/dpgen2/op/select_confs.py @@ -7,6 +7,7 @@ List, Set, Tuple, + Union, ) from dflow.python import ( @@ -15,6 +16,7 @@ Artifact, BigParameter, FatalError, + HDF5Datasets, OPIOSign, ) @@ -35,8 +37,8 @@ def get_input_sign(cls): { "conf_selector": ConfSelector, "type_map": List[str], - "trajs": Artifact(List[Path]), - "model_devis": Artifact(List[Path]), + "trajs": Artifact(Union[List[Path], HDF5Datasets]), + "model_devis": Artifact(Union[List[Path], HDF5Datasets]), "optional_outputs": Artifact(List[Path], optional=True), } ) diff --git a/pyproject.toml b/pyproject.toml index e327dec3..bd8c92f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ dependencies = [ 'numpy', 'dpdata>=0.2.20', - 'pydflow>=1.6.57', + 'pydflow>=1.8.88', 'dargs>=0.3.1', 'scipy', 'lbg',