Skip to content

Commit

Permalink
Access through s3 or HSDS works, although the latter is quite slow.
Browse files Browse the repository at this point in the history
  • Loading branch information
elainethale committed Sep 30, 2021
1 parent 6795e78 commit 025020d
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 40 deletions.
24 changes: 14 additions & 10 deletions dsgrid/dataformat/enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ def persist(self, h5group):
@classmethod
def load(cls, h5group):
    """Load this enumeration from an open HDF5 group.

    Parameters
    ----------
    h5group : h5py.Group or h5pyd.Group
        Group containing a dataset named ``cls.dimension`` with ``id`` and
        ``name`` fields and a ``name`` attribute.

    Returns
    -------
    cls
        A new enumeration populated from the stored ids and names.
    """
    h5dset = h5group[cls.dimension]
    # Read the whole dataset into an in-memory ndarray first: field access
    # directly on an h5pyd dataset is not supported, so index fields on the
    # materialized array instead.
    h5dset_data = h5group[cls.dimension][...]  # workaround for h5pyd
    return cls(
        get_str(h5dset.attrs["name"]),
        [get_str(vid) for vid in h5dset_data["id"]],
        [get_str(vname) for vname in h5dset_data["name"]]
    )

@classmethod
Expand Down Expand Up @@ -193,9 +194,10 @@ def load(cls, h5group):
return MultiFuelEndUseEnumeration.load(h5group)

h5dset = h5group[cls.dimension]
h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
name = get_str(h5dset.attrs["name"])
ids = [get_str(vid) for vid in h5dset["id"]]
names = [get_str(vname) for vname in h5dset["name"]]
ids = [get_str(vid) for vid in h5dset_data["id"]]
names = [get_str(vname) for vname in h5dset_data["name"]]

if 'fuel' in h5dset.attrs:
return SingleFuelEndUseEnumeration(name, ids, names,
Expand Down Expand Up @@ -628,11 +630,12 @@ def persist(self, h5group):
@classmethod
def load(cls, h5group):
    """Load this enumeration (ids, names, and units) from an HDF5 group.

    Parameters
    ----------
    h5group : h5py.Group or h5pyd.Group
        Group containing a dataset named ``cls.dimension`` with ``id``,
        ``name``, and ``units`` fields and a ``name`` attribute.

    Returns
    -------
    cls
        A new enumeration populated from the stored ids, names, and units.
    """
    h5dset = h5group[cls.dimension]
    # Materialize the dataset: h5pyd does not support per-field indexing on
    # the dataset object itself, so pull everything into an ndarray first.
    h5dset_data = h5group[cls.dimension][...]  # workaround for h5pyd
    return cls(
        get_str(h5dset.attrs["name"]),
        [get_str(vid) for vid in h5dset_data["id"]],
        [get_str(vname) for vname in h5dset_data["name"]],
        [get_str(vunits) for vunits in h5dset_data["units"]]
    )

@classmethod
Expand Down Expand Up @@ -768,12 +771,13 @@ def load(cls, h5group):
fuel_enum = FuelEnumeration.load(h5group)

h5dset = h5group[cls.dimension]
h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
return cls(
get_str(h5dset.attrs["name"]),
[get_str(vid) for vid in h5dset["id"]],
[get_str(vname) for vname in h5dset["name"]],
[get_str(vid) for vid in h5dset_data["id"]],
[get_str(vname) for vname in h5dset_data["name"]],
fuel_enum,
[get_str(vfuel_id) for vfuel_id in h5dset["fuel_id"]]
[get_str(vfuel_id) for vfuel_id in h5dset_data["fuel_id"]]
)

@classmethod
Expand Down
39 changes: 26 additions & 13 deletions dsgrid/dataformat/sectordataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from itertools import repeat
import h5py
import logging
from h5pyd._hl.files import is_hdf5
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -92,7 +93,7 @@ def create(cls, enum, enum_ids, enum_scales=None):
return cls(datamap)

@classmethod
def load(cls,dataset):
def load(cls,dataset,hsds=False):
"""
Parameters
----------
Expand All @@ -103,9 +104,14 @@ def load(cls,dataset):
-------
Datamap
"""
assert isinstance(dataset,h5py.Dataset)
idx = dataset[:,"idx"]
scale = dataset[:,"scale"]
if hsds: # workaround for h5pyd
dset = dataset[...] # workaround for h5pyd
idx = dset["idx"]
scale = dset["scale"]
else:
assert isinstance(dataset,h5py.Dataset)
idx = dataset[:,"idx"]
scale = dataset[:,"scale"]
datamap = np.empty(len(idx), enum_datamap_dtype)
datamap["idx"] = idx
datamap["scale"] = scale
Expand Down Expand Up @@ -296,7 +302,7 @@ def append_element_to_dataset_dimension(dataset,new_elem_idx,enum_ids,enum,scali

class SectorDataset(object):

def __init__(self,datafile,sector_id,enduses,times):
def __init__(self,datafile,sector_id,enduses,times,hsds=False):
"""
Creates a SectorDataset object. Note that this does not read from
or write to datafile in any way, and should generally not be called directly.
Expand All @@ -321,6 +327,8 @@ def __init__(self,datafile,sector_id,enduses,times):
self.datafile = datafile
self.enduses = enduses
self.times = times

self.is_hsds = hsds

self.n_geos = 0 # data is inserted by geography

Expand Down Expand Up @@ -362,16 +370,17 @@ def new(cls,datafile,sector_id,enduses=None,times=None):
@classmethod
def load(cls, datafile, f, sector_id):
    """Load one sector's dataset metadata from an open datafile handle.

    Parameters
    ----------
    datafile : Datafile
        Owning datafile; supplies the full enduse/time/geo enumerations.
    f : h5py.File or h5pyd.File
        Open handle to the underlying HDF5 store.
    sector_id : str
        Key of the sector group under ``data/``.

    Returns
    -------
    SectorDataset
    """
    dgroup = f["data/" + sector_id]
    # h5pyd handles need the slower read-everything workaround in
    # Datamap.load; detect them by the handle's type.
    is_hsds = ("h5pyd" in str(type(f)))

    datamap = Datamap.load(dgroup["enduses"], hsds=is_hsds)
    enduses = datamap.get_subenum(datafile.enduse_enum)

    datamap = Datamap.load(dgroup["times"], hsds=is_hsds)
    times = datamap.get_subenum(datafile.time_enum)

    result = cls(datafile, sector_id, enduses, times, hsds=is_hsds)

    datamap = Datamap.load(dgroup["geographies"], hsds=is_hsds)
    result.n_geos = datamap.num_entries

    return result
Expand All @@ -384,7 +393,7 @@ def loadall(cls,datafile,f,_upgrade_class=None):
if _upgrade_class is not None:
yield sector_id, _upgrade_class.load_sectordataset(datafile,f,sector_id)
continue
assert isinstance(sector_group, h5py.Group)
# assert isinstance(sector_group, h5py.Group) # fails for h5pyd
yield sector_id, SectorDataset.load(datafile,f,sector_id)


Expand Down Expand Up @@ -543,7 +552,11 @@ def has_data(self,geo_id):
with H5Reader(self.datafile.h5path) as f:
dgroup = f["data/" + self.sector_id]

geo_idx, geo_scale = dgroup["geographies"][id_idx]
if self.is_hsds: # workaround for h5pyd
dset = dgroup["geographies"][...]
geo_idx, geo_scale = dset[id_idx]
else:
geo_idx, geo_scale = dgroup["geographies"][id_idx]

if geo_idx == NULL_IDX:
return False
Expand All @@ -552,7 +565,7 @@ def has_data(self,geo_id):
def get_datamap(self, dim_key):
    """Load and return the Datamap for one dimension of this sector.

    Parameters
    ----------
    dim_key : str
        Dimension group name under ``data/<sector_id>`` (e.g. ``"enduses"``,
        ``"times"``, ``"geographies"``).

    Returns
    -------
    Datamap
    """
    with H5Reader(self.datafile.h5path) as f:
        dgroup = f["data"][self.sector_id]
        # Pass the hsds flag so Datamap.load uses the h5pyd-safe read path
        # when this dataset was opened through HSDS.
        result = Datamap.load(dgroup[dim_key], hsds=self.is_hsds)
    return result

def get_data(self, dataset_geo_index):
Expand Down Expand Up @@ -589,7 +602,7 @@ def get_data(self, dataset_geo_index):
columns=self.enduses,
dtype="float32")

geo_datamap = Datamap.load(dgroup["geographies"])
geo_datamap = Datamap.load(dgroup["geographies"], hsds=self.is_hsds)
geo_ids = geo_datamap.ids(dataset_geo_index,self.datafile.geo_enum)
scalings = geo_datamap.scales(dataset_geo_index)

Expand Down
10 changes: 1 addition & 9 deletions notebooks/Browse one file.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
"outputs": [],
"source": [
"# Publicly available\n",
"dsgrid_oedi_base_path = \"s3://oedi-data-lake/dsgrid-2018-efs\" # Uses h5py\n",
"dsgrid_hsds_base_path = \"/nrel/dsgrid-2018-efs\" # Uses h5pyd\n",
"# Internal to NREL\n",
"dsgrid_nrel_base_path_windows = Path(\"//nrelnas01/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")\n",
"dsgrid_nrel_base_path_mac = Path(\"/Volumes/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")"
Expand Down Expand Up @@ -407,14 +407,6 @@
"total_energy = Datatable(new_datafile).data.sum()\n",
"print(f\"{p.name} describes {total_energy} {energy_units} of electricity\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2664d807",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
16 changes: 8 additions & 8 deletions notebooks/Visualize dsgrid model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@
"outputs": [],
"source": [
"# Publicly available\n",
"dsgrid_oedi_base_path = \"s3://oedi-data-lake/dsgrid-2018-efs\" # Uses h5py\n",
"dsgrid_hsds_base_path = \"/nrel/dsgrid-2018-efs\" # Uses h5pyd\n",
"# Internal to NREL\n",
"dsgrid_nrel_base_path_windows = Path(\"//nrelnas01/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")\n",
"dsgrid_nrel_base_path_mac = Path(\"/Volumes/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")"
Expand Down Expand Up @@ -167,7 +167,7 @@
" files = list(d)\n",
" is_s3 = str(datadir).startswith(\"s3://\")\n",
" if is_s3:\n",
"        files = [p.split(\"/\")[-1] for p in s3.glob(f\"{dsgrid_dataset_path}*.dsg\")]\n",
" \n",
" result = []\n",
" for name, color, filepath in tuple_list:\n",
Expand All @@ -180,7 +180,7 @@
" continue\n",
" result.append(LoadModelComponent(name,component_type=component_type,color=color))\n",
" result[-1].load_datafile(\n",
"            f\"{datadir}{filepath}\" if (is_hsds or is_s3) else datadir / filepath)\n",
" return result\n",
"\n",
"# Bottom-Up\n",
Expand Down Expand Up @@ -882,8 +882,8 @@
" Modeled = auto() # only dsgrid-modeled \"bottom-up\" and \"gap\" components\n",
"\n",
"if is_hsds or is_s3:\n",
"    all_components_model_path = f\"{dsgrid_base_path}/state_hourly_residuals/\"\n",
"    bottom_up_components_model_path = f\"{dsgrid_base_path}/dsgrid_site_energy_state_hourly/\"\n",
"else:\n",
" all_components_model_path = dsgrid_base_path / \"products\" / \"state_hourly_residuals\"\n",
" bottom_up_components_model_path = dsgrid_base_path / \"products\" / \"dsgrid_site_energy_state_hourly\""
Expand Down Expand Up @@ -948,7 +948,7 @@
"id": "ce6d9e0f",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a long time to run the first time, especially over HSDS or s3."
]
},
{
Expand Down Expand Up @@ -1145,7 +1145,7 @@
"id": "58ed7b5a",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a long time to run the first time, especially over HSDS or s3."
]
},
{
Expand Down Expand Up @@ -1357,7 +1357,7 @@
"id": "ea55380f",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a while to run over HSDS, s3 and/or slow network connections."
]
},
{
Expand Down

0 comments on commit 025020d

Please sign in to comment.