
Merge pull request #24 from dsgrid/eh/support_oedi
Supporting OEDI dataset
elainethale authored Sep 30, 2021
2 parents b8b5dd3 + 025020d commit df19cc0
Showing 12 changed files with 213 additions and 71 deletions.
11 changes: 8 additions & 3 deletions README.md
@@ -13,11 +13,16 @@ To get the basic package, run:
 pip install dsgrid-legacy-efs-api
 ```
 
-If you would like to run the example notebooks, install the required extra dependencies:
+If you would like to run the example notebooks and browse the files available
+through the Open Energy Data Initiative (OEDI), install the required extra
+dependencies:
 
 ```
-pip install dsgrid-legacy-efs-api[ntbks]
+pip install dsgrid-legacy-efs-api[ntbks,oedi]
 ```
 
 and also clone the repository. Then you should be able to run the .ipynb files
-in the dsgrid-legacy-efs-api/notebooks folder.
+in the dsgrid-legacy-efs-api/notebooks folder, which include functionality for
+directly browsing the OEDI [oedi-data-lake/dsgrid-2018-efs](https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2F) data files. If you would like
+to use the HSDS service, please see the configuration instructions at
+https://github.com/NREL/hsds-examples/.
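As quick orientation for the OEDI piece, here is a minimal sketch (not part of this commit) of listing the same data programmatically with s3fs, which the [oedi] extra pulls in; it assumes the OEDI data lake allows anonymous reads.

```python
# Illustrative sketch, not part of this commit: list the dsgrid-2018-efs
# files in the public OEDI data lake. Assumes anonymous read access.
import s3fs

fs = s3fs.S3FileSystem(anon=True)
for path in fs.ls("oedi-data-lake/dsgrid-2018-efs"):
    print(path)
```
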
11 changes: 8 additions & 3 deletions docs/source/overview.rst
@@ -26,15 +26,20 @@ To get the basic package, run:
 
     pip install dsgrid-legacy-efs-api
 
-If you would like to run the example notebooks, install the required extra
+If you would like to run the example notebooks and browse the files available
+through the Open Energy Data Initiative (OEDI), install the required extra
 dependencies:
 
 ::
 
-    pip install dsgrid-legacy-efs-api[ntbks]
+    pip install dsgrid-legacy-efs-api[ntbks,oedi]
 
 and also clone the repository. Then you should be able to run the .ipynb files
-in the dsgrid-legacy-efs-api/notebooks folder.
+in the dsgrid-legacy-efs-api/notebooks folder, which include functionality for
+directly browsing the OEDI `oedi-data-lake/dsgrid-2018-efs
+<https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=dsgrid-2018-efs%2F>`__
+data files. If you would like to use the HSDS service, please see the
+configuration instructions at `https://github.com/NREL/hsds-examples/ <https://github.com/NREL/hsds-examples/>`__.
 
 Creating a new data file
 ~~~~~~~~~~~~~~~~~~~~~~~~
2 changes: 1 addition & 1 deletion dsgrid/_version.py
@@ -4,7 +4,7 @@
 __description__ = ("Python API for accessing demand-side grid model (dsgrid) "
                    "data produced for the Electrification Futures Study (EFS)")
 __url__ = "https://github.com/dsgrid/dsgrid-legacy-efs-api"
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 __author__ = "NREL"
 __maintainer_email__ = "elaine.hale@nrel.gov"
 __license__ = "BSD-3"
6 changes: 3 additions & 3 deletions dsgrid/dataformat/datafile.py
@@ -10,11 +10,11 @@
 # Python 2
 from collections import Mapping
 
-
 import h5py
 
 from dsgrid import __version__ as VERSION
 from dsgrid import DSGridNotImplemented, DSGridValueError
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat import get_str
 from dsgrid.dataformat.enumeration import (
     SectorEnumeration, GeographyEnumeration,
@@ -110,7 +110,7 @@ def contains(self, an_enum):
     @classmethod
     def load(cls,filepath,upgrade=True,overwrite=False,new_filepath=None,**kwargs):
         # Version Handling
-        with h5py.File(filepath, "r") as f:
+        with H5Reader(filepath) as f:
             version = get_str(f.attrs.get("dsgrid", "0.1.0"))
 
         if StrictVersion(version) > StrictVersion(VERSION):
@@ -133,7 +133,7 @@ def load(cls,filepath,upgrade=True,overwrite=False,new_filepath=None,**kwargs):
 
         # Current version, old version data that is compatible with current code,
         # or old version data that is not compatible and not being upgraded
-        with h5py.File(filepath, "r") as f:
+        with H5Reader(filepath) as f:
             enum_group = f["enumerations"]
             result = cls(filepath,
                          SectorEnumeration.load(enum_group),
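
Net effect of the H5Reader swap in Datafile.load, sketched below with illustrative paths (the filenames are invented; only the prefix dispatch comes from this commit):

```python
# Sketch with invented filenames; the Datafile.load signature is unchanged.
from dsgrid.dataformat.datafile import Datafile

f_local = Datafile.load("commercial.dsg")                                   # local h5py.File
f_s3 = Datafile.load("s3://oedi-data-lake/dsgrid-2018-efs/commercial.dsg")  # s3fs-backed h5py
f_hsds = Datafile.load("/nrel/dsgrid-2018-efs/commercial.dsg")              # h5pyd via HSDS
```
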
4 changes: 2 additions & 2 deletions dsgrid/dataformat/datatable.py
@@ -2,8 +2,8 @@
 
 import numpy as np
 import pandas as pd
-import h5py
 
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat.sectordataset import NULL_IDX
 
 logger = logging.getLogger(__name__)
@@ -18,7 +18,7 @@ def __init__(self,datafile,sort=True,verify_integrity=True):
         self.time_enum = datafile.time_enum
 
         self.data = []
-        with h5py.File(datafile.h5path, "r") as f:
+        with H5Reader(datafile.h5path) as f:
 
             for sectorname, sectordataset in datafile.sectordata.items():
 
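
Same substitution as in datafile.py; a hedged usage sketch, continuing the Datafile.load example above:

```python
# Sketch: Datatable construction is unchanged for callers; the internal reads
# now go through H5Reader, so `datafile` may point at any supported backend.
from dsgrid.dataformat.datatable import Datatable

table = Datatable(datafile)  # datafile: a loaded Datafile, as sketched above
```
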
24 changes: 14 additions & 10 deletions dsgrid/dataformat/enumeration.py
@@ -135,10 +135,11 @@ def persist(self, h5group):
     @classmethod
     def load(cls, h5group):
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]]
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]]
         )
 
     @classmethod
@@ -193,9 +194,10 @@ def load(cls, h5group):
             return MultiFuelEndUseEnumeration.load(h5group)
 
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         name = get_str(h5dset.attrs["name"])
-        ids = [get_str(vid) for vid in h5dset["id"]]
-        names = [get_str(vname) for vname in h5dset["name"]]
+        ids = [get_str(vid) for vid in h5dset_data["id"]]
+        names = [get_str(vname) for vname in h5dset_data["name"]]
 
         if 'fuel' in h5dset.attrs:
             return SingleFuelEndUseEnumeration(name, ids, names,
@@ -628,11 +630,12 @@ def persist(self, h5group):
     @classmethod
     def load(cls, h5group):
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]],
-            [get_str(vunits) for vunits in h5dset["units"]]
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]],
+            [get_str(vunits) for vunits in h5dset_data["units"]]
         )
 
     @classmethod
@@ -768,12 +771,13 @@ def load(cls, h5group):
             fuel_enum = FuelEnumeration.load(h5group)
 
         h5dset = h5group[cls.dimension]
+        h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
         return cls(
             get_str(h5dset.attrs["name"]),
-            [get_str(vid) for vid in h5dset["id"]],
-            [get_str(vname) for vname in h5dset["name"]],
+            [get_str(vid) for vid in h5dset_data["id"]],
+            [get_str(vname) for vname in h5dset_data["name"]],
             fuel_enum,
-            [get_str(vfuel_id) for vfuel_id in h5dset["fuel_id"]]
+            [get_str(vfuel_id) for vfuel_id in h5dset_data["fuel_id"]]
         )
 
     @classmethod
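
All four loaders apply the same workaround: materialize the compound dataset once with h5dset[...] and read fields from the resulting numpy structured array, which behaves identically under h5py and h5pyd. A standalone illustration (the dtype and values here are invented):

```python
# Standalone illustration of the workaround; dtype and values are invented.
import numpy as np

# h5dset[...] returns an in-memory numpy structured array like this one:
h5dset_data = np.array([(b"res", b"Residential"), (b"com", b"Commercial")],
                       dtype=[("id", "S8"), ("name", "S16")])
ids = [vid.decode() for vid in h5dset_data["id"]]          # ['res', 'com']
names = [vname.decode() for vname in h5dset_data["name"]]  # ['Residential', 'Commercial']
```
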
48 changes: 31 additions & 17 deletions dsgrid/dataformat/sectordataset.py
@@ -3,11 +3,13 @@
 from itertools import repeat
 import h5py
 import logging
+from h5pyd._hl.files import is_hdf5
 import numpy as np
 import pandas as pd
 
 from dsgrid import __version__ as VERSION
 from dsgrid import DSGridError, DSGridNotImplemented
+from dsgrid.helpers import H5Reader
 from dsgrid.dataformat.enumeration import (
     SectorEnumeration, GeographyEnumeration,
     EndUseEnumerationBase, TimeEnumeration)
@@ -91,7 +93,7 @@ def create(cls, enum, enum_ids, enum_scales=None):
         return cls(datamap)
 
     @classmethod
-    def load(cls,dataset):
+    def load(cls,dataset,hsds=False):
         """
         Parameters
         ----------
@@ -102,9 +104,14 @@ def load(cls,dataset):
         -------
         Datamap
         """
-        assert isinstance(dataset,h5py.Dataset)
-        idx = dataset[:,"idx"]
-        scale = dataset[:,"scale"]
+        if hsds: # workaround for h5pyd
+            dset = dataset[...] # workaround for h5pyd
+            idx = dset["idx"]
+            scale = dset["scale"]
+        else:
+            assert isinstance(dataset,h5py.Dataset)
+            idx = dataset[:,"idx"]
+            scale = dataset[:,"scale"]
         datamap = np.empty(len(idx), enum_datamap_dtype)
         datamap["idx"] = idx
         datamap["scale"] = scale
@@ -295,7 +302,7 @@ def append_element_to_dataset_dimension(dataset,new_elem_idx,enum_ids,enum,scaling
 
 class SectorDataset(object):
 
-    def __init__(self,datafile,sector_id,enduses,times):
+    def __init__(self,datafile,sector_id,enduses,times,hsds=False):
         """
         Creates a SectorDataset object. Note that this does not read from
         or write to datafile in any way, and should generally not be called directly.
@@ -320,6 +327,8 @@ def __init__(self,datafile,sector_id,enduses,times):
         self.datafile = datafile
         self.enduses = enduses
         self.times = times
+
+        self.is_hsds = hsds
 
         self.n_geos = 0 # data is inserted by geography
 
@@ -361,16 +370,17 @@ def new(cls,datafile,sector_id,enduses=None,times=None):
     @classmethod
     def load(cls,datafile,f,sector_id):
         dgroup = f["data/" + sector_id]
+        is_hsds = ("h5pyd" in str(type(f)))
 
-        datamap = Datamap.load(dgroup["enduses"])
+        datamap = Datamap.load(dgroup["enduses"],hsds=is_hsds)
         enduses = datamap.get_subenum(datafile.enduse_enum)
 
-        datamap = Datamap.load(dgroup["times"])
+        datamap = Datamap.load(dgroup["times"],hsds=is_hsds)
         times = datamap.get_subenum(datafile.time_enum)
 
-        result = cls(datafile,sector_id,enduses,times)
+        result = cls(datafile,sector_id,enduses,times,hsds=is_hsds)
 
-        datamap = Datamap.load(dgroup["geographies"])
+        datamap = Datamap.load(dgroup["geographies"],hsds=is_hsds)
         result.n_geos = datamap.num_entries
 
         return result
@@ -383,7 +393,7 @@ def loadall(cls,datafile,f,_upgrade_class=None):
             if _upgrade_class is not None:
                 yield sector_id, _upgrade_class.load_sectordataset(datafile,f,sector_id)
                 continue
-            assert isinstance(sector_group, h5py.Group)
+            # assert isinstance(sector_group, h5py.Group) # fails for h5pyd
             yield sector_id, SectorDataset.load(datafile,f,sector_id)
 
 
@@ -520,7 +530,7 @@ def __getitem__(self, geo_id):
 
         id_idx = self.datafile.geo_enum.ids.index(geo_id)
 
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data/" + self.sector_id]
             dset = dgroup["data"]
 
@@ -539,19 +549,23 @@ def has_data(self,geo_id):
     def has_data(self,geo_id):
         id_idx = self.datafile.geo_enum.ids.index(geo_id)
 
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data/" + self.sector_id]
 
-            geo_idx, geo_scale = dgroup["geographies"][id_idx]
+            if self.is_hsds: # workaround for h5pyd
+                dset = dgroup["geographies"][...]
+                geo_idx, geo_scale = dset[id_idx]
+            else:
+                geo_idx, geo_scale = dgroup["geographies"][id_idx]
 
         if geo_idx == NULL_IDX:
             return False
         return True
 
     def get_datamap(self,dim_key):
-        with h5py.File(self.datafile.h5path, "r") as f:
+        with H5Reader(self.datafile.h5path) as f:
             dgroup = f["data"][self.sector_id]
-            result = Datamap.load(dgroup[dim_key])
+            result = Datamap.load(dgroup[dim_key], hsds=self.is_hsds)
         return result
 
     def get_data(self, dataset_geo_index):
def get_data(self, dataset_geo_index):
Expand All @@ -577,7 +591,7 @@ def get_data(self, dataset_geo_index):
if (dataset_geo_index < 0) or (not dataset_geo_index < self.n_geos):
raise ValueError("dataset_geo_index must be in the range [0,{}), but is {}.".format(self.n_geos,dataset_geo_index))

with h5py.File(self.datafile.h5path, "r") as f:
with H5Reader(self.datafile.h5path) as f:

dgroup = f["data"][self.sector_id]
dset = dgroup["data"]
@@ -588,7 +602,7 @@ def get_data(self, dataset_geo_index):
                               columns=self.enduses,
                               dtype="float32")
 
-        geo_datamap = Datamap.load(dgroup["geographies"])
+        geo_datamap = Datamap.load(dgroup["geographies"], hsds=self.is_hsds)
         geo_ids = geo_datamap.ids(dataset_geo_index,self.datafile.geo_enum)
         scalings = geo_datamap.scales(dataset_geo_index)
 
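
The hsds flag comes from inspecting the open handle rather than the path, so SectorDataset.load works no matter how the file was opened; the check reduces to:

```python
# The type string of an open h5pyd.File contains "h5pyd", while a local
# h5py.File's does not, so this branches without requiring h5pyd at call time.
def is_hsds_handle(f):
    return "h5pyd" in str(type(f))
```
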
35 changes: 35 additions & 0 deletions dsgrid/helpers.py
@@ -1,7 +1,9 @@
+import h5py
 import numpy as np
 import pandas as pds
 import webcolors
 
+
 def multi_index(df, cols):
     result = df.copy()
     if len(cols) == 1:
@@ -13,6 +15,7 @@ def multi_index(df, cols):
         del result[col]
     return result
 
+
 def ensure_enum(cls, val):
     """
     Returns the instance of cls that corresponds to val. cls is expected to be
@@ -31,17 +34,49 @@ def ensure_enum(cls, val):
         return cls[val]
     return cls(val)
 
+
 def lighten_color(hex_color,fraction_to_white):
     rgb_color = np.array(webcolors.hex_to_rgb(hex_color))
     white = np.array([255,255,255])
     direction = white - rgb_color
     result = [int(round(x)) for x in list(rgb_color + direction * fraction_to_white)]
     return webcolors.rgb_to_hex(tuple(result))
 
+
 def palette(hex_color,n,max_fraction=0.75):
     result = []; step = max_fraction / float(n)
     for frac in [i * step for i in range(n)]:
         result.append(lighten_color(hex_color,frac))
     assert len(result) == n
     return result
+
+
+class H5Reader(object):
+    def __init__(self, filepath):
+        self.filepath = filepath
+        if self.is_hsds:
+            import h5pyd
+            self._f = h5pyd.File(filepath, mode="r", use_cache=False)
+        elif self.is_s3:
+            import s3fs
+            self._s3p = s3fs.S3FileSystem().open(filepath, 'rb')
+            self._f = h5py.File(self._s3p, mode="r")
+        else:
+            self._f = h5py.File(filepath, mode="r")
+
+    @property
+    def is_hsds(self):
+        return str(self.filepath).startswith("/nrel/")
+
+    @property
+    def is_s3(self):
+        return str(self.filepath).startswith("s3://")
+
+    def __enter__(self):
+        return self._f
+
+    def __exit__(self, exc, value, tb):
+        self._f.close()
+        if self.is_s3:
+            self._s3p.close()
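
H5Reader dispatches on the path prefix alone: /nrel/ paths open an HSDS domain with h5pyd, s3:// paths are opened through s3fs and handed to h5py as a file object, and everything else is a local h5py.File. A usage sketch (paths illustrative; the HSDS case assumes the configuration described at https://github.com/NREL/hsds-examples/):

```python
# Usage sketch; the paths are illustrative.
from dsgrid.helpers import H5Reader

with H5Reader("local_copy.dsg") as f:                                      # h5py.File
    print(f.attrs.get("dsgrid"))

with H5Reader("s3://oedi-data-lake/dsgrid-2018-efs/commercial.dsg") as f:  # s3fs + h5py
    print(list(f["enumerations"].keys()))
```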
