
Merge pull request #24 from dsgrid/eh/support_oedi
Supporting OEDI dataset
elainethale authored Sep 30, 2021
2 parents b8b5dd3 + 025020d commit df19cc0
Showing 12 changed files with 213 additions and 71 deletions.
11 changes: 8 additions & 3 deletions README.md
@@ -13,11 +13,16 @@ To get the basic package, run:
 pip install dsgrid-legacy-efs-api
 ```
 
-If you would like to run the example notebooks, install the required extra dependencies:
+If you would like to run the example notebooks and browse the files available
+through the Open Energy Data Initiative (OEDI), install the required extra
+dependencies:
 
 ```
-pip install dsgrid-legacy-efs-api[ntbks]
+pip install dsgrid-legacy-efs-api[ntbks,oedi]
 ```
 
 and also clone the repository. Then you should be able to run the .ipynb files
-in the dsgrid-legacy-efs-api/notebooks folder.
+in the dsgrid-legacy-efs-api/notebooks folder, which include functionality for
+directly browsing the OEDI [oedi-data-lake/dsgrid-2018-efs](https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2F) data files. If you would like
+to use the HSDS service, please see the configuration instructions at
+https://github.com/NREL/hsds-examples/.
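As quick orientation for the OEDI piece, here is a minimal sketch (not part of this commit) of listing the same data programmatically with s3fs, which the [oedi] extra pulls in; it assumes the OEDI data lake allows anonymous reads.

```python
# Illustrative sketch, not part of this commit: list the dsgrid-2018-efs
# files in the public OEDI data lake. Assumes anonymous read access.
import s3fs

fs = s3fs.S3FileSystem(anon=True)
for path in fs.ls("oedi-data-lake/dsgrid-2018-efs"):
    print(path)
```
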
11 changes: 8 additions & 3 deletions docs/source/overview.rst
@@ -26,15 +26,20 @@ To get the basic package, run:
 
     pip install dsgrid-legacy-efs-api
 
-If you would like to run the example notebooks, install the required extra
+If you would like to run the example notebooks and browse the files available
+through the Open Energy Data Initiative (OEDI), install the required extra
 dependencies:
 
 ::
 
-    pip install dsgrid-legacy-efs-api[ntbks]
+    pip install dsgrid-legacy-efs-api[ntbks,oedi]
 
 and also clone the repository. Then you should be able to run the .ipynb files
-in the dsgrid-legacy-efs-api/notebooks folder.
+in the dsgrid-legacy-efs-api/notebooks folder, which include functionality for
+directly browsing the OEDI `oedi-data-lake/dsgrid-2018-efs
+<https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2F>`__
+data files. If you would like to use the HSDS service, please see the
+configuration instructions at `https://github.com/NREL/hsds-examples/ <https://github.com/NREL/hsds-examples/>`__.
 
 Creating a new data file
 ~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion dsgrid/_version.py
@@ -4,7 +4,7 @@
 __description__ = ("Python API for accessing demand-side grid model (dsgrid) "
                    "data produced for the Electrification Futures Study (EFS)")
 __url__ = "https://github.com/dsgrid/dsgrid-legacy-efs-api"
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 __author__ = "NREL"
 __maintainer_email__ = "elaine.hale@nrel.gov"
 __license__ = "BSD-3"
6 changes: 3 additions & 3 deletions dsgrid/dataformat/datafile.py
@@ -10,11 +10,11 @@
 # Python 2
 from collections import Mapping
 
-
 import h5py
 
 from dsgrid import __version__ as VERSION
 from dsgrid import DSGridNotImplemented, DSGridValueError
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat import get_str
 from dsgrid.dataformat.enumeration import (
     SectorEnumeration, GeographyEnumeration,
@@ -110,7 +110,7 @@ def contains(self, an_enum):
     @classmethod
     def load(cls,filepath,upgrade=True,overwrite=False,new_filepath=None,**kwargs):
         # Version Handling
-        with h5py.File(filepath, "r") as f:
+        with H5Reader(filepath) as f:
             version = get_str(f.attrs.get("dsgrid", "0.1.0"))
 
         if StrictVersion(version) > StrictVersion(VERSION):
@@ -133,7 +133,7 @@ def load(cls,filepath,upgrade=True,overwrite=False,new_filepath=None,**kwargs):
 
         # Current version, old version data that is compatible with current code,
         # or old version data that is not compatible and not being upgraded
-        with h5py.File(filepath, "r") as f:
+        with H5Reader(filepath) as f:
             enum_group = f["enumerations"]
             result = cls(filepath,
                          SectorEnumeration.load(enum_group),
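
Net effect of the H5Reader swap in Datafile.load, sketched below with illustrative paths (the filenames are invented; only the prefix dispatch comes from this commit):

```python
# Sketch with invented filenames; the Datafile.load signature is unchanged.
from dsgrid.dataformat.datafile import Datafile

f_local = Datafile.load("commercial.dsg")                                   # local h5py.File
f_s3 = Datafile.load("s3://oedi-data-lake/dsgrid-2018-efs/commercial.dsg")  # s3fs-backed h5py
f_hsds = Datafile.load("/nrel/dsgrid-2018-efs/commercial.dsg")              # h5pyd via HSDS
```
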
4 changes: 2 additions & 2 deletions dsgrid/dataformat/datatable.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 import pandas as pd
-import h5py
 
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat.sectordataset import NULL_IDX
 
 logger = logging.getLogger(__name__)
@@ -18,7 +18,7 @@ def __init__(self,datafile,sort=True,verify_integrity=True):
         self.time_enum = datafile.time_enum
 
         self.data = []
-        with h5py.File(datafile.h5path, "r") as f:
+        with H5Reader(datafile.h5path) as f:
 
             for sectorname, sectordataset in datafile.sectordata.items():
 
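
Same substitution as in datafile.py; a hedged usage sketch, continuing the Datafile.load example above:

```python
# Sketch: Datatable construction is unchanged for callers; the internal reads
# now go through H5Reader, so `datafile` may point at any supported backend.
from dsgrid.dataformat.datatable import Datatable

table = Datatable(datafile)  # datafile: a loaded Datafile, as sketched above
```
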
24 changes: 14 additions & 10 deletions dsgrid/dataformat/enumeration.py
@@ -135,10 +135,11 @@ def persist(self, h5group):
     @classmethod
     def load(cls, h5group):
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]]
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]]
         )
 
     @classmethod
@@ -193,9 +194,10 @@ def load(cls, h5group):
             return MultiFuelEndUseEnumeration.load(h5group)
 
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         name = get_str(h5dset.attrs["name"])
-        ids = [get_str(vid) for vid in h5dset["id"]]
-        names = [get_str(vname) for vname in h5dset["name"]]
+        ids = [get_str(vid) for vid in h5dset_data["id"]]
+        names = [get_str(vname) for vname in h5dset_data["name"]]
 
         if 'fuel' in h5dset.attrs:
             return SingleFuelEndUseEnumeration(name, ids, names,
@@ -628,11 +630,12 @@ def persist(self, h5group):
     @classmethod
     def load(cls, h5group):
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]],
-            [get_str(vunits) for vunits in h5dset["units"]]
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]],
+            [get_str(vunits) for vunits in h5dset_data["units"]]
         )
 
     @classmethod
@@ -768,12 +771,13 @@ def load(cls, h5group):
             fuel_enum = FuelEnumeration.load(h5group)
 
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]],
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]],
             fuel_enum,
-            [get_str(vfuel_id) for vfuel_id in h5dset["fuel_id"]]
+            [get_str(vfuel_id) for vfuel_id in h5dset_data["fuel_id"]]
         )
 
     @classmethod
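
All four loaders apply the same workaround: materialize the compound dataset once with h5dset[...] and read fields from the resulting numpy structured array, which behaves identically under h5py and h5pyd. A standalone illustration (the dtype and values here are invented):

```python
# Standalone illustration of the workaround; dtype and values are invented.
import numpy as np

# h5dset[...] returns an in-memory numpy structured array like this one:
h5dset_data = np.array([(b"res", b"Residential"), (b"com", b"Commercial")],
                       dtype=[("id", "S8"), ("name", "S16")])
ids = [vid.decode() for vid in h5dset_data["id"]]          # ['res', 'com']
names = [vname.decode() for vname in h5dset_data["name"]]  # ['Residential', 'Commercial']
```
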
48 changes: 31 additions & 17 deletions dsgrid/dataformat/sectordataset.py
@@ -3,11 +3,13 @@
 from itertools import repeat
 import h5py
 import logging
+from h5pyd._hl.files import is_hdf5
 import numpy as np
 import pandas as pd
 
 from dsgrid import __version__ as VERSION
 from dsgrid import DSGridError, DSGridNotImplemented
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat.enumeration import (
     SectorEnumeration, GeographyEnumeration,
     EndUseEnumerationBase, TimeEnumeration)
@@ -91,7 +93,7 @@ def create(cls, enum, enum_ids, enum_scales=None):
         return cls(datamap)
 
     @classmethod
-    def load(cls,dataset):
+    def load(cls,dataset,hsds=False):
         """
         Parameters
         ----------
@@ -102,9 +104,14 @@ def load(cls,dataset):
         -------
         Datamap
         """
-        assert isinstance(dataset,h5py.Dataset)
-        idx = dataset[:,"idx"]
-        scale = dataset[:,"scale"]
+        if hsds: # workaround for h5pyd
+            dset = dataset[...] # workaround for h5pyd
+            idx = dset["idx"]
+            scale = dset["scale"]
+        else:
+            assert isinstance(dataset,h5py.Dataset)
+            idx = dataset[:,"idx"]
+            scale = dataset[:,"scale"]
         datamap = np.empty(len(idx), enum_datamap_dtype)
         datamap["idx"] = idx
         datamap["scale"] = scale
@@ -295,7 +302,7 @@ def append_element_to_dataset_dimension(dataset,new_elem_idx,enum_ids,enum,scaling
 
 class SectorDataset(object):
 
-    def __init__(self,datafile,sector_id,enduses,times):
+    def __init__(self,datafile,sector_id,enduses,times,hsds=False):
         """
         Creates a SectorDataset object. Note that this does not read from
         or write to datafile in any way, and should generally not be called directly.
@@ -320,6 +327,8 @@ def __init__(self,datafile,sector_id,enduses,times):
         self.datafile = datafile
         self.enduses = enduses
         self.times = times
+
+        self.is_hsds = hsds
 
         self.n_geos = 0 # data is inserted by geography
 
@@ -361,16 +370,17 @@ def new(cls,datafile,sector_id,enduses=None,times=None):
     @classmethod
     def load(cls,datafile,f,sector_id):
         dgroup = f["data/" + sector_id]
+        is_hsds = ("h5pyd" in str(type(f)))
 
-        datamap = Datamap.load(dgroup["enduses"])
+        datamap = Datamap.load(dgroup["enduses"],hsds=is_hsds)
         enduses = datamap.get_subenum(datafile.enduse_enum)
 
-        datamap = Datamap.load(dgroup["times"])
+        datamap = Datamap.load(dgroup["times"],hsds=is_hsds)
         times = datamap.get_subenum(datafile.time_enum)
 
-        result = cls(datafile,sector_id,enduses,times)
+        result = cls(datafile,sector_id,enduses,times,hsds=is_hsds)
 
-        datamap = Datamap.load(dgroup["geographies"])
+        datamap = Datamap.load(dgroup["geographies"],hsds=is_hsds)
         result.n_geos = datamap.num_entries
 
         return result
@@ -383,7 +393,7 @@ def loadall(cls,datafile,f,_upgrade_class=None):
             if _upgrade_class is not None:
                 yield sector_id, _upgrade_class.load_sectordataset(datafile,f,sector_id)
                 continue
-            assert isinstance(sector_group, h5py.Group)
+            # assert isinstance(sector_group, h5py.Group) # fails for h5pyd
             yield sector_id, SectorDataset.load(datafile,f,sector_id)
 
 
@@ -520,7 +530,7 @@ def __getitem__(self, geo_id):
 
         id_idx = self.datafile.geo_enum.ids.index(geo_id)
 
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data/" + self.sector_id]
             dset = dgroup["data"]
 
@@ -539,19 +549,23 @@ def has_data(self,geo_id):
     def has_data(self,geo_id):
         id_idx = self.datafile.geo_enum.ids.index(geo_id)
 
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data/" + self.sector_id]
 
-            geo_idx, geo_scale = dgroup["geographies"][id_idx]
+            if self.is_hsds: # workaround for h5pyd
+                dset = dgroup["geographies"][...]
+                geo_idx, geo_scale = dset[id_idx]
+            else:
+                geo_idx, geo_scale = dgroup["geographies"][id_idx]
 
         if geo_idx == NULL_IDX:
             return False
         return True
 
     def get_datamap(self,dim_key):
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data"][self.sector_id]
-            result = Datamap.load(dgroup[dim_key])
+            result = Datamap.load(dgroup[dim_key], hsds=self.is_hsds)
         return result
 
     def get_data(self, dataset_geo_index):
def get_data(self, dataset_geo_index):
Expand All @@ -577,7 +591,7 @@ def get_data(self, dataset_geo_index):
if (dataset_geo_index < 0) or (not dataset_geo_index < self.n_geos):
raise ValueError("dataset_geo_index must be in the range [0,{}), but is {}.".format(self.n_geos,dataset_geo_index))

with h5py.File(self.datafile.h5path, "r") as f:
with H5Reader(self.datafile.h5path) as f:

dgroup = f["data"][self.sector_id]
dset = dgroup["data"]
@@ -588,7 +602,7 @@ def get_data(self, dataset_geo_index):
                               columns=self.enduses,
                               dtype="float32")
 
-        geo_datamap = Datamap.load(dgroup["geographies"])
+        geo_datamap = Datamap.load(dgroup["geographies"], hsds=self.is_hsds)
         geo_ids = geo_datamap.ids(dataset_geo_index,self.datafile.geo_enum)
         scalings = geo_datamap.scales(dataset_geo_index)
 
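
The hsds flag comes from inspecting the open handle rather than the path, so SectorDataset.load works no matter how the file was opened; the check reduces to:

```python
# The type string of an open h5pyd.File contains "h5pyd", while a local
# h5py.File's does not, so this branches without requiring h5pyd at call time.
def is_hsds_handle(f):
    return "h5pyd" in str(type(f))
```
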
35 changes: 35 additions & 0 deletions dsgrid/helpers.py
@@ -1,7 +1,9 @@
+import h5py
 import numpy as np
 import pandas as pds
 import webcolors
 
+
 def multi_index(df, cols):
     result = df.copy()
     if len(cols) == 1:
@@ -13,6 +15,7 @@ def multi_index(df, cols):
         del result[col]
     return result
 
+
 def ensure_enum(cls, val):
     """
     Returns the instance of cls that corresponds to val. cls is expected to be
@@ -31,17 +34,49 @@ def ensure_enum(cls, val):
         return cls[val]
     return cls(val)
 
+
 def lighten_color(hex_color,fraction_to_white):
     rgb_color = np.array(webcolors.hex_to_rgb(hex_color))
     white = np.array([255,255,255])
     direction = white - rgb_color
     result = [int(round(x)) for x in list(rgb_color + direction * fraction_to_white)]
     return webcolors.rgb_to_hex(tuple(result))
 
+
 def palette(hex_color,n,max_fraction=0.75):
     result = []; step = max_fraction / float(n)
     for frac in [i * step for i in range(n)]:
         result.append(lighten_color(hex_color,frac))
     assert len(result) == n
     return result
+
+
+class H5Reader(object):
+    def __init__(self, filepath):
+        self.filepath = filepath
+        if self.is_hsds:
+            import h5pyd
+            self._f = h5pyd.File(filepath, mode="r", use_cache=False)
+        elif self.is_s3:
+            import s3fs
+            self._s3p = s3fs.S3FileSystem().open(filepath, 'rb')
+            self._f = h5py.File(self._s3p, mode="r")
+        else:
+            self._f = h5py.File(filepath, mode="r")
+
+    @property
+    def is_hsds(self):
+        return str(self.filepath).startswith("/nrel/")
+
+    @property
+    def is_s3(self):
+        return str(self.filepath).startswith("s3://")
+
+    def __enter__(self):
+        return self._f
+
+    def __exit__(self, exc, value, tb):
+        self._f.close()
+        if self.is_s3:
+            self._s3p.close()
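
H5Reader dispatches on the path prefix alone: /nrel/ paths open an HSDS domain with h5pyd, s3:// paths are opened through s3fs and handed to h5py as a file object, and everything else is a local h5py.File. A usage sketch (paths illustrative; the HSDS case assumes the configuration described at https://github.com/NREL/hsds-examples/):

```python
# Usage sketch; the paths are illustrative.
from dsgrid.helpers import H5Reader

with H5Reader("local_copy.dsg") as f:                                      # h5py.File
    print(f.attrs.get("dsgrid"))

with H5Reader("s3://oedi-data-lake/dsgrid-2018-efs/commercial.dsg") as f:  # s3fs + h5py
    print(list(f["enumerations"].keys()))
```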
