From 3501db4ec27bf34367eeeb14cfe953b52bbef50c Mon Sep 17 00:00:00 2001
From: Xinzijian Liu <liuxin_zijian@163.com>
Date: Tue, 10 Sep 2024 15:23:48 +0800
Subject: [PATCH] Add HDF5 support for trajs and model_devis (#259)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
	- Introduced new optional arguments for improved data handling and
	  multitasking capabilities.
	- Added support for HDF5 formatted data in various modules.
	- Enhanced flexibility in input handling for multiple data formats.

- **Bug Fixes**
	- Improved robustness in handling validation data structures.

- **Documentation**
	- Updated documentation to clarify new parameters and their intended
	  use.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: zjgemi <liuxin_zijian@163.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 dpgen2/entrypoint/args.py                     |  8 +++
 dpgen2/entrypoint/submit.py                   |  5 +-
 dpgen2/exploration/render/traj_render.py      |  7 ++-
 .../exploration/render/traj_render_lammps.py  | 21 +++++--
 .../convergence_check_stage_scheduler.py      |  6 +-
 dpgen2/exploration/scheduler/scheduler.py     |  8 ++-
 .../exploration/scheduler/stage_scheduler.py  | 11 ++--
 dpgen2/exploration/selector/conf_selector.py  |  8 ++-
 .../selector/conf_selector_frame.py           |  8 ++-
 dpgen2/flow/dpgen_loop.py                     |  4 +-
 dpgen2/op/__init__.py                         |  1 +
 dpgen2/op/run_relax.py                        | 60 +++++++++++++------
 dpgen2/op/select_confs.py                     |  6 +-
 pyproject.toml                                |  2 +-
 14 files changed, 116 insertions(+), 39 deletions(-)

diff --git a/dpgen2/entrypoint/args.py b/dpgen2/entrypoint/args.py
index 4ba69ea2..df11ff7f 100644
--- a/dpgen2/entrypoint/args.py
+++ b/dpgen2/entrypoint/args.py
@@ -359,6 +359,7 @@ def run_diffcsp_args():
     doc_gen_tasks = "Number of DiffCSP generation tasks"
     doc_gen_command = "Command for DiffCSP generation"
     doc_relax_group_size = "Group size for relaxation"
+    doc_use_hdf5 = "Use HDF5 to store trajs and model_devis"
     return [
         Argument(
             "gen_tasks",
@@ -380,6 +381,13 @@ def run_diffcsp_args():
             default=100,
             doc=doc_relax_group_size,
         ),
+        Argument(
+            "use_hdf5",
+            bool,
+            optional=True,
+            default=False,
+            doc=doc_use_hdf5,
+        ),
     ]
 
 
diff --git a/dpgen2/entrypoint/submit.py b/dpgen2/entrypoint/submit.py
index a6911ef0..76f944c6 100644
--- a/dpgen2/entrypoint/submit.py
+++ b/dpgen2/entrypoint/submit.py
@@ -111,6 +111,7 @@
     RunDPTrain,
     RunLmp,
     RunRelax,
+    RunRelaxHDF5,
     SelectConfs,
 )
 from dpgen2.op.caly_evo_step_merge import (
@@ -167,6 +168,7 @@ def make_concurrent_learning_op(
     upload_python_packages: Optional[List[os.PathLike]] = None,
     valid_data: Optional[S3Artifact] = None,
     train_optional_files: Optional[List[str]] = None,
+    explore_config: Optional[dict] = None,
 ):
     if train_style in ("dp", "dp-dist"):
         prep_run_train_op = PrepRunDPTrain(
@@ -234,7 +236,7 @@ def make_concurrent_learning_op(
             "prep-run-diffcsp",
             DiffCSPGen,
             PrepRelax,
-            RunRelax,
+            RunRelaxHDF5 if (explore_config or {}).get("use_hdf5") else RunRelax,
             prep_config=prep_explore_config,
             run_config=run_explore_config,
             upload_python_packages=upload_python_packages,
@@ -552,6 +554,7 @@ def workflow_concurrent_learning(
         upload_python_packages=upload_python_packages,
         valid_data=valid_data,
         train_optional_files=train_optional_files,
+        explore_config=explore_config,
     )
     scheduler = make_naive_exploration_scheduler(config)
 
diff --git a/dpgen2/exploration/render/traj_render.py b/dpgen2/exploration/render/traj_render.py
index eb7296b6..5c9f0c41 100644
--- a/dpgen2/exploration/render/traj_render.py
+++ b/dpgen2/exploration/render/traj_render.py
@@ -15,6 +15,9 @@
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from ..deviation import (
     DeviManager,
@@ -30,7 +33,7 @@ class TrajRender(ABC):
     @abstractmethod
     def get_model_devi(
         self,
-        files: List[Path],
+        files: Union[List[Path], List[HDF5Dataset]],
     ) -> DeviManager:
         r"""Get model deviations from recording files.
 
@@ -48,7 +51,7 @@ def get_model_devi(
     @abstractmethod
     def get_confs(
         self,
-        traj: List[Path],
+        traj: Union[List[Path], List[HDF5Dataset]],
         id_selected: List[List[int]],
         type_map: Optional[List[str]] = None,
         conf_filters: Optional["ConfFilters"] = None,
diff --git a/dpgen2/exploration/render/traj_render_lammps.py b/dpgen2/exploration/render/traj_render_lammps.py
index d51ec040..28eb07f6 100644
--- a/dpgen2/exploration/render/traj_render_lammps.py
+++ b/dpgen2/exploration/render/traj_render_lammps.py
@@ -1,4 +1,7 @@
 import json
+from io import (
+    StringIO,
+)
 from pathlib import (
     Path,
 )
@@ -12,6 +15,9 @@
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.utils import (
     setup_ele_temp,
@@ -42,7 +48,7 @@ def __init__(
 
     def get_model_devi(
         self,
-        files: List[Path],
+        files: Union[List[Path], List[HDF5Dataset]],
     ) -> DeviManager:
         ntraj = len(files)
 
@@ -53,7 +59,10 @@ def get_model_devi(
         return model_devi
 
     def _load_one_model_devi(self, fname, model_devi):
-        dd = np.loadtxt(fname)
+        if isinstance(fname, HDF5Dataset):
+            dd = fname.get_data()
+        else:
+            dd = np.loadtxt(fname)
         if len(np.shape(dd)) == 1:  # In case model-devi.out is 1-dimensional
             dd = dd.reshape((1, len(dd)))
 
@@ -92,7 +101,7 @@ def set_ele_temp(self, system, ele_temp):
 
     def get_confs(
         self,
-        trajs: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
         id_selected: List[List[int]],
         type_map: Optional[List[str]] = None,
         conf_filters: Optional["ConfFilters"] = None,
@@ -108,7 +117,11 @@ def get_confs(
         ms = dpdata.MultiSystems(type_map=type_map)
         for ii in range(ntraj):
             if len(id_selected[ii]) > 0:
-                ss = dpdata.System(trajs[ii], fmt=traj_fmt, type_map=type_map)
+                if isinstance(trajs[ii], HDF5Dataset):
+                    traj = StringIO(trajs[ii].get_data())  # type: ignore
+                else:
+                    traj = trajs[ii]
+                ss = dpdata.System(traj, fmt=traj_fmt, type_map=type_map)
                 ss.nopbc = self.nopbc
                 if ele_temp:
                     self.set_ele_temp(ss, ele_temp[ii])
diff --git a/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py b/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py
index f19442f3..8ab8662f 100644
--- a/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py
+++ b/dpgen2/exploration/scheduler/convergence_check_stage_scheduler.py
@@ -5,11 +5,15 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 from dflow.python import (
     FatalError,
 )
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -67,7 +71,7 @@ def reached_max_iteration(self):
     def plan_next_iteration(
         self,
         report: Optional[ExplorationReport] = None,
-        trajs: Optional[List[Path]] = None,
+        trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None,
     ) -> Tuple[bool, Optional[BaseExplorationTaskGroup], Optional[ConfSelector]]:
         if self.complete():
             raise FatalError("Cannot plan because the stage has completed.")
diff --git a/dpgen2/exploration/scheduler/scheduler.py b/dpgen2/exploration/scheduler/scheduler.py
index b895ca40..ff55fa23 100644
--- a/dpgen2/exploration/scheduler/scheduler.py
+++ b/dpgen2/exploration/scheduler/scheduler.py
@@ -5,12 +5,16 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 import numpy as np
 from dflow.python import (
     FatalError,
 )
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -110,7 +114,7 @@ def force_stage_complete(self):
     def plan_next_iteration(
         self,
         report: Optional[ExplorationReport] = None,
-        trajs: Optional[List[Path]] = None,
+        trajs: Optional[Union[List[Path], List[HDF5Dataset]]] = None,
     ) -> Tuple[bool, Optional[ExplorationTaskGroup], Optional[ConfSelector]]:
         """
         Make the plan for the next DPGEN iteration.
@@ -119,7 +123,7 @@ def plan_next_iteration(
         ----------
         report : ExplorationReport
             The exploration report of this iteration.
-        trajs : List[Path]
+        trajs : Union[List[Path], List[HDF5Dataset]]
             A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration.
 
         Returns
diff --git a/dpgen2/exploration/scheduler/stage_scheduler.py b/dpgen2/exploration/scheduler/stage_scheduler.py
index 9ad04d09..18fe5593 100644
--- a/dpgen2/exploration/scheduler/stage_scheduler.py
+++ b/dpgen2/exploration/scheduler/stage_scheduler.py
@@ -8,6 +8,11 @@
 from typing import (
     List,
     Tuple,
+    Union,
+)
+
+from dflow.python.opio import (
+    HDF5Dataset,
 )
 
 from dpgen2.exploration.report import (
@@ -87,7 +92,7 @@ def get_reports(self) -> List[ExplorationReport]:
     def plan_next_iteration(
         self,
         report: ExplorationReport,
-        trajs: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
     ) -> Tuple[bool, ExplorationTaskGroup, ConfSelector]:
         """
         Make the plan for the next iteration of the stage.
@@ -96,11 +101,9 @@ def plan_next_iteration(
 
         Parameters
         ----------
-        hist_reports : List[ExplorationReport]
-            The historical exploration report of the stage. If this is the first iteration of the stage, this list is empty.
         report : ExplorationReport
             The exploration report of this iteration.
-        confs : List[Path]
+        trajs : Union[List[Path], List[HDF5Dataset]]
             A list of configurations generated during the exploration. May be used to generate new configurations for the next iteration.
 
         Returns
diff --git a/dpgen2/exploration/selector/conf_selector.py b/dpgen2/exploration/selector/conf_selector.py
index df00afd3..f24a7d31 100644
--- a/dpgen2/exploration/selector/conf_selector.py
+++ b/dpgen2/exploration/selector/conf_selector.py
@@ -10,9 +10,13 @@
     Optional,
     Set,
     Tuple,
+    Union,
 )
 
 import dpdata
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.report import (
     ExplorationReport,
@@ -29,8 +33,8 @@ class ConfSelector(ABC):
     @abstractmethod
     def select(
         self,
-        trajs: List[Path],
-        model_devis: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
+        model_devis: Union[List[Path], List[HDF5Dataset]],
         type_map: Optional[List[str]] = None,
         optional_outputs: Optional[List[Path]] = None,
     ) -> Tuple[List[Path], ExplorationReport]:
diff --git a/dpgen2/exploration/selector/conf_selector_frame.py b/dpgen2/exploration/selector/conf_selector_frame.py
index 74eee689..fc116f88 100644
--- a/dpgen2/exploration/selector/conf_selector_frame.py
+++ b/dpgen2/exploration/selector/conf_selector_frame.py
@@ -9,10 +9,14 @@
     List,
     Optional,
     Tuple,
+    Union,
 )
 
 import dpdata
 import numpy as np
+from dflow.python.opio import (
+    HDF5Dataset,
+)
 
 from dpgen2.exploration.render import (
     TrajRender,
@@ -52,8 +56,8 @@ def __init__(
 
     def select(
         self,
-        trajs: List[Path],
-        model_devis: List[Path],
+        trajs: Union[List[Path], List[HDF5Dataset]],
+        model_devis: Union[List[Path], List[HDF5Dataset]],
         type_map: Optional[List[str]] = None,
         optional_outputs: Optional[List[Path]] = None,
     ) -> Tuple[List[Path], ExplorationReport]:
diff --git a/dpgen2/flow/dpgen_loop.py b/dpgen2/flow/dpgen_loop.py
index 0ff95abb..190a1090 100644
--- a/dpgen2/flow/dpgen_loop.py
+++ b/dpgen2/flow/dpgen_loop.py
@@ -9,6 +9,7 @@
 from typing import (
     List,
     Optional,
+    Union,
 )
 
 import jsonpickle
@@ -35,6 +36,7 @@
     OPIO,
     Artifact,
     BigParameter,
+    HDF5Datasets,
     OPIOSign,
     PythonOPTemplate,
     Slices,
@@ -91,7 +93,7 @@ def get_input_sign(cls):
             {
                 "exploration_scheduler": BigParameter(ExplorationScheduler),
                 "exploration_report": BigParameter(ExplorationReport),
-                "trajs": Artifact(List[Path]),
+                "trajs": Artifact(Union[List[Path], HDF5Datasets]),
             }
         )
 
diff --git a/dpgen2/op/__init__.py b/dpgen2/op/__init__.py
index a0b0b1a8..c79c3946 100644
--- a/dpgen2/op/__init__.py
+++ b/dpgen2/op/__init__.py
@@ -39,6 +39,7 @@
 )
 from .run_relax import (
     RunRelax,
+    RunRelaxHDF5,
 )
 from .select_confs import (
     SelectConfs,
diff --git a/dpgen2/op/run_relax.py b/dpgen2/op/run_relax.py
index 8876eb14..672275d8 100644
--- a/dpgen2/op/run_relax.py
+++ b/dpgen2/op/run_relax.py
@@ -15,6 +15,7 @@
     OPIO,
     Artifact,
     BigParameter,
+    HDF5Datasets,
     OPIOSign,
 )
 
@@ -54,6 +55,31 @@ def get_output_sign(cls):
             }
         )
 
+    def write_traj(self, dump_str, traj_file):  # overridable trajectory sink
+        traj_file.write_text(dump_str)  # default: store the dump text in traj_file
+        return traj_file  # execute() appends this return value to its trajs list
+
+    def write_model_devi(self, devi, model_devi_file):
+        """Save ``devi`` as text in ``model_devi_file`` and return ``model_devi_file``."""
+        import numpy as np
+
+        columns = [
+            "max_devi_v",
+            "min_devi_v",
+            "avg_devi_v",
+            "max_devi_f",
+            "min_devi_f",
+            "avg_devi_f",
+        ]
+        # Header labels: a 10-wide "step" followed by one 19-wide label per column.
+        title = "%10s" % "step" + "".join("%19s" % name for name in columns)
+        # Row layout: an integer step, then scientific notation for each column.
+        formats = ["%12d"] + ["%19.6e"] * len(columns)
+
+        # The field widths space the columns; no extra delimiter is inserted.
+        np.savetxt(model_devi_file, devi, fmt=formats, delimiter="", header=title)
+        return model_devi_file
+
     @OP.exec_sign_check
     def execute(
         self,
@@ -168,29 +194,14 @@ def execute(
                 forces_list[j] = forces
                 virial_list[j] = virial / len(atype)
             traj_file = ip["task_path"] / ("traj.%s.dump" % fname)
-            traj_file.write_text(dump_str)
+            traj_file = self.write_traj(dump_str, traj_file)
             trajs.append(traj_file)
             devi = [np.array(step_list)]
             devi += list(calc_model_devi_v(np.array(virial_list)))
             devi += list(calc_model_devi_f(np.array(forces_list)))
             devi = np.vstack(devi).T
-            header = "%10s%19s%19s%19s%19s%19s%19s" % (
-                "step",
-                "max_devi_v",
-                "min_devi_v",
-                "avg_devi_v",
-                "max_devi_f",
-                "min_devi_f",
-                "avg_devi_f",
-            )
             model_devi_file = ip["task_path"] / ("model_devi.%s.out" % fname)
-            np.savetxt(
-                model_devi_file,
-                devi,
-                fmt=["%12d"] + ["%19.6e"] * 6,
-                delimiter="",
-                header=header,
-            )
+            model_devi_file = self.write_model_devi(devi, model_devi_file)
             model_devis.append(model_devi_file)
         return OPIO(
             {
@@ -215,3 +226,18 @@ def normalize_config(data={}):
         data = base.normalize_value(data, trim_pattern="_*")
         base.check_value(data, strict=False)
         return data
+
+
+class RunRelaxHDF5(RunRelax):  # RunRelax with HDF5Datasets trajs/model_devis outputs
+    @classmethod
+    def get_output_sign(cls):
+        output_sign = super().get_output_sign()  # start from the RunRelax signature
+        output_sign["trajs"] = Artifact(HDF5Datasets)
+        output_sign["model_devis"] = Artifact(HDF5Datasets)
+        return output_sign
+
+    def write_traj(self, dump_str, traj_file):
+        return dump_str  # hand back the dump text itself; traj_file is not written
+
+    def write_model_devi(self, devi, model_devi_file):
+        return devi  # hand back the data as-is; model_devi_file is not written
diff --git a/dpgen2/op/select_confs.py b/dpgen2/op/select_confs.py
index fea0bf59..cddcca55 100644
--- a/dpgen2/op/select_confs.py
+++ b/dpgen2/op/select_confs.py
@@ -7,6 +7,7 @@
     List,
     Set,
     Tuple,
+    Union,
 )
 
 from dflow.python import (
@@ -15,6 +16,7 @@
     Artifact,
     BigParameter,
     FatalError,
+    HDF5Datasets,
     OPIOSign,
 )
 
@@ -35,8 +37,8 @@ def get_input_sign(cls):
             {
                 "conf_selector": ConfSelector,
                 "type_map": List[str],
-                "trajs": Artifact(List[Path]),
-                "model_devis": Artifact(List[Path]),
+                "trajs": Artifact(Union[List[Path], HDF5Datasets]),
+                "model_devis": Artifact(Union[List[Path], HDF5Datasets]),
                 "optional_outputs": Artifact(List[Path], optional=True),
             }
         )
diff --git a/pyproject.toml b/pyproject.toml
index e327dec3..bd8c92f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,7 @@ classifiers = [
 dependencies = [
 	     'numpy',
 	     'dpdata>=0.2.20',
-	     'pydflow>=1.6.57',
+	     'pydflow>=1.8.88',
 	     'dargs>=0.3.1',
 	     'scipy',
 	     'lbg',