Skip to content

Commit

Permalink
Access through s3 or HSDS works, although the latter is quite slow.
Browse files Browse the repository at this point in the history
  • Loading branch information
elainethale committed Sep 30, 2021
1 parent 6795e78 commit 025020d
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 40 deletions.
24 changes: 14 additions & 10 deletions dsgrid/dataformat/enumeration.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ def persist(self, h5group):
@classmethod
def load(cls, h5group):
    """Load this enumeration from an open HDF5 group.

    Parameters
    ----------
    h5group : h5py.Group or h5pyd.Group
        Group containing a dataset named ``cls.dimension`` with ``id`` and
        ``name`` fields and a ``name`` attribute.

    Returns
    -------
    cls
        A new enumeration populated from the stored ids and names.
    """
    h5dset = h5group[cls.dimension]
    # Read the whole dataset into an in-memory ndarray first: field access
    # directly on an h5pyd dataset is not supported, so index fields on the
    # materialized array instead.
    h5dset_data = h5group[cls.dimension][...]  # workaround for h5pyd
    return cls(
        get_str(h5dset.attrs["name"]),
        [get_str(vid) for vid in h5dset_data["id"]],
        [get_str(vname) for vname in h5dset_data["name"]]
    )

@classmethod
Expand Down Expand Up @@ -193,9 +194,10 @@ def load(cls, h5group):
return MultiFuelEndUseEnumeration.load(h5group)

h5dset = h5group[cls.dimension]
h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
name = get_str(h5dset.attrs["name"])
ids = [get_str(vid) for vid in h5dset["id"]]
names = [get_str(vname) for vname in h5dset["name"]]
ids = [get_str(vid) for vid in h5dset_data["id"]]
names = [get_str(vname) for vname in h5dset_data["name"]]

if 'fuel' in h5dset.attrs:
return SingleFuelEndUseEnumeration(name, ids, names,
Expand Down Expand Up @@ -628,11 +630,12 @@ def persist(self, h5group):
@classmethod
def load(cls, h5group):
    """Load this enumeration (ids, names, and units) from an HDF5 group.

    Parameters
    ----------
    h5group : h5py.Group or h5pyd.Group
        Group containing a dataset named ``cls.dimension`` with ``id``,
        ``name``, and ``units`` fields and a ``name`` attribute.

    Returns
    -------
    cls
        A new enumeration populated from the stored ids, names, and units.
    """
    h5dset = h5group[cls.dimension]
    # Materialize the dataset: h5pyd does not support per-field indexing on
    # the dataset object itself, so pull everything into an ndarray first.
    h5dset_data = h5group[cls.dimension][...]  # workaround for h5pyd
    return cls(
        get_str(h5dset.attrs["name"]),
        [get_str(vid) for vid in h5dset_data["id"]],
        [get_str(vname) for vname in h5dset_data["name"]],
        [get_str(vunits) for vunits in h5dset_data["units"]]
    )

@classmethod
Expand Down Expand Up @@ -768,12 +771,13 @@ def load(cls, h5group):
fuel_enum = FuelEnumeration.load(h5group)

h5dset = h5group[cls.dimension]
h5dset_data = h5group[cls.dimension][...] # workaround for h5pyd
return cls(
get_str(h5dset.attrs["name"]),
[get_str(vid) for vid in h5dset["id"]],
[get_str(vname) for vname in h5dset["name"]],
[get_str(vid) for vid in h5dset_data["id"]],
[get_str(vname) for vname in h5dset_data["name"]],
fuel_enum,
[get_str(vfuel_id) for vfuel_id in h5dset["fuel_id"]]
[get_str(vfuel_id) for vfuel_id in h5dset_data["fuel_id"]]
)

@classmethod
Expand Down
39 changes: 26 additions & 13 deletions dsgrid/dataformat/sectordataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from itertools import repeat
import h5py
import logging
from h5pyd._hl.files import is_hdf5
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -92,7 +93,7 @@ def create(cls, enum, enum_ids, enum_scales=None):
return cls(datamap)

@classmethod
def load(cls,dataset):
def load(cls,dataset,hsds=False):
"""
Parameters
----------
Expand All @@ -103,9 +104,14 @@ def load(cls,dataset):
-------
Datamap
"""
assert isinstance(dataset,h5py.Dataset)
idx = dataset[:,"idx"]
scale = dataset[:,"scale"]
if hsds: # workaround for h5pyd
dset = dataset[...] # workaround for h5pyd
idx = dset["idx"]
scale = dset["scale"]
else:
assert isinstance(dataset,h5py.Dataset)
idx = dataset[:,"idx"]
scale = dataset[:,"scale"]
datamap = np.empty(len(idx), enum_datamap_dtype)
datamap["idx"] = idx
datamap["scale"] = scale
Expand Down Expand Up @@ -296,7 +302,7 @@ def append_element_to_dataset_dimension(dataset,new_elem_idx,enum_ids,enum,scali

class SectorDataset(object):

def __init__(self,datafile,sector_id,enduses,times):
def __init__(self,datafile,sector_id,enduses,times,hsds=False):
"""
Creates a SectorDataset object. Note that this does not read from
or write to datafile in any way, and should generally not be called directly.
Expand All @@ -321,6 +327,8 @@ def __init__(self,datafile,sector_id,enduses,times):
self.datafile = datafile
self.enduses = enduses
self.times = times

self.is_hsds = hsds

self.n_geos = 0 # data is inserted by geography

Expand Down Expand Up @@ -362,16 +370,17 @@ def new(cls,datafile,sector_id,enduses=None,times=None):
@classmethod
def load(cls, datafile, f, sector_id):
    """Load one sector's dataset metadata from an open datafile handle.

    Parameters
    ----------
    datafile : Datafile
        Owning datafile; supplies the full enduse/time/geo enumerations.
    f : h5py.File or h5pyd.File
        Open handle to the underlying HDF5 store.
    sector_id : str
        Key of the sector group under ``data/``.

    Returns
    -------
    SectorDataset
    """
    dgroup = f["data/" + sector_id]
    # h5pyd handles need the slower read-everything workaround in
    # Datamap.load; detect them by the handle's type.
    is_hsds = ("h5pyd" in str(type(f)))

    datamap = Datamap.load(dgroup["enduses"], hsds=is_hsds)
    enduses = datamap.get_subenum(datafile.enduse_enum)

    datamap = Datamap.load(dgroup["times"], hsds=is_hsds)
    times = datamap.get_subenum(datafile.time_enum)

    result = cls(datafile, sector_id, enduses, times, hsds=is_hsds)

    datamap = Datamap.load(dgroup["geographies"], hsds=is_hsds)
    result.n_geos = datamap.num_entries

    return result
Expand All @@ -384,7 +393,7 @@ def loadall(cls,datafile,f,_upgrade_class=None):
if _upgrade_class is not None:
yield sector_id, _upgrade_class.load_sectordataset(datafile,f,sector_id)
continue
assert isinstance(sector_group, h5py.Group)
# assert isinstance(sector_group, h5py.Group) # fails for h5pyd
yield sector_id, SectorDataset.load(datafile,f,sector_id)


Expand Down Expand Up @@ -543,7 +552,11 @@ def has_data(self,geo_id):
with H5Reader(self.datafile.h5path) as f:
dgroup = f["data/" + self.sector_id]

geo_idx, geo_scale = dgroup["geographies"][id_idx]
if self.is_hsds: # workaround for h5pyd
dset = dgroup["geographies"][...]
geo_idx, geo_scale = dset[id_idx]
else:
geo_idx, geo_scale = dgroup["geographies"][id_idx]

if geo_idx == NULL_IDX:
return False
Expand All @@ -552,7 +565,7 @@ def has_data(self,geo_id):
def get_datamap(self, dim_key):
    """Load and return the Datamap for one dimension of this sector.

    Parameters
    ----------
    dim_key : str
        Dimension group name under ``data/<sector_id>`` (e.g. ``"enduses"``,
        ``"times"``, ``"geographies"``).

    Returns
    -------
    Datamap
    """
    with H5Reader(self.datafile.h5path) as f:
        dgroup = f["data"][self.sector_id]
        # Pass the hsds flag so Datamap.load uses the h5pyd-safe read path
        # when this dataset was opened through HSDS.
        result = Datamap.load(dgroup[dim_key], hsds=self.is_hsds)
    return result

def get_data(self, dataset_geo_index):
Expand Down Expand Up @@ -589,7 +602,7 @@ def get_data(self, dataset_geo_index):
columns=self.enduses,
dtype="float32")

geo_datamap = Datamap.load(dgroup["geographies"])
geo_datamap = Datamap.load(dgroup["geographies"], hsds=self.is_hsds)
geo_ids = geo_datamap.ids(dataset_geo_index,self.datafile.geo_enum)
scalings = geo_datamap.scales(dataset_geo_index)

Expand Down
10 changes: 1 addition & 9 deletions notebooks/Browse one file.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@
"outputs": [],
"source": [
"# Publicly available\n",
"dsgrid_oedi_base_path = \"s3://oedi-data-lake/dsgrid-2018-efs\" # Uses h5py\n",
"dsgrid_hsds_base_path = \"/nrel/dsgrid-2018-efs\" # Uses h5pyd\n",
"# Internal to NREL\n",
"dsgrid_nrel_base_path_windows = Path(\"//nrelnas01/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")\n",
"dsgrid_nrel_base_path_mac = Path(\"/Volumes/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")"
Expand Down Expand Up @@ -407,14 +407,6 @@
"total_energy = Datatable(new_datafile).data.sum()\n",
"print(f\"{p.name} describes {total_energy} {energy_units} of electricity\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2664d807",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
16 changes: 8 additions & 8 deletions notebooks/Visualize dsgrid model.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@
"outputs": [],
"source": [
"# Publicly available\n",
"dsgrid_oedi_base_path = \"s3://oedi-data-lake/dsgrid-2018-efs\" # Uses h5py\n",
"dsgrid_hsds_base_path = \"/nrel/dsgrid-2018-efs\" # Uses h5pyd\n",
"# Internal to NREL\n",
"dsgrid_nrel_base_path_windows = Path(\"//nrelnas01/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")\n",
"dsgrid_nrel_base_path_mac = Path(\"/Volumes/PLEXOS/Projects/Load/dsgrid_v0.2.0/\")"
Expand Down Expand Up @@ -167,7 +167,7 @@
" files = list(d)\n",
" is_s3 = str(datadir).startswith(\"s3://\")\n",
" if is_s3:\n",
"        files = [p.split(\"/\")[-1] for p in s3.glob(f\"{dsgrid_dataset_path}*.dsg\")]\n",
" \n",
" result = []\n",
" for name, color, filepath in tuple_list:\n",
Expand All @@ -180,7 +180,7 @@
" continue\n",
" result.append(LoadModelComponent(name,component_type=component_type,color=color))\n",
" result[-1].load_datafile(\n",
"            f\"{datadir}{filepath}\" if (is_hsds or is_s3) else datadir / filepath)\n",
" return result\n",
"\n",
"# Bottom-Up\n",
Expand Down Expand Up @@ -882,8 +882,8 @@
" Modeled = auto() # only dsgrid-modeled \"bottom-up\" and \"gap\" components\n",
"\n",
"if is_hsds or is_s3:\n",
"    all_components_model_path = f\"{dsgrid_base_path}/state_hourly_residuals/\"\n",
"    bottom_up_components_model_path = f\"{dsgrid_base_path}/dsgrid_site_energy_state_hourly/\"\n",
"else:\n",
" all_components_model_path = dsgrid_base_path / \"products\" / \"state_hourly_residuals\"\n",
" bottom_up_components_model_path = dsgrid_base_path / \"products\" / \"dsgrid_site_energy_state_hourly\""
Expand Down Expand Up @@ -948,7 +948,7 @@
"id": "ce6d9e0f",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a long time to run the first time, especially over HSDS or s3."
]
},
{
Expand Down Expand Up @@ -1145,7 +1145,7 @@
"id": "58ed7b5a",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a long time to run the first time, especially over HSDS or s3."
]
},
{
Expand Down Expand Up @@ -1357,7 +1357,7 @@
"id": "ea55380f",
"metadata": {},
"source": [
"⚠️ **WARNING** ⚠️ This cell can take a while to run over HSDS, s3 and/or slow network connections."
]
},
{
Expand Down

0 comments on commit 025020d

Please sign in to comment.