Dataclass fixes #317

Merged
merged 19 commits into from Aug 7, 2024

Changes from 17 commits
237 changes: 153 additions & 84 deletions lib/adf_dataset.py
@@ -67,6 +67,7 @@ def set_reference(self):
self.ref_var_loc = {v: self.adf.var_obs_dict[v]['obs_file'] for v in self.adf.var_obs_dict}
self.ref_labels = {v: self.adf.var_obs_dict[v]['obs_name'] for v in self.adf.var_obs_dict}
self.ref_var_nam = {v: self.adf.var_obs_dict[v]['obs_var'] for v in self.adf.var_obs_dict}
self.ref_case_label = "Obs"
if not self.adf.var_obs_dict:
warnings.warn("\t WARNING: reference is observations, but no observations found to plot against.")
else:
@@ -76,97 +77,34 @@ def set_reference(self):
# when using a reference simulation, allow a "special" attribute with the case name:
self.ref_case_label = self.adf.get_baseline_info("cam_case_name", required=True)
for v in self.adf.diag_var_list:
self.ref_var_nam[v] = v
self.ref_labels[v] = self.adf.get_baseline_info("cam_case_name", required=True)
f = self.get_reference_climo_file(v)
if f is None:
warnings.warn(f"\t WARNING: ADFData found no reference climo file for {v}")
continue
else:
if f:
self.ref_var_loc[v] = f
self.ref_var_nam[v] = v
self.ref_labels[v] = self.adf.get_baseline_info("cam_case_name", required=True)

def get_reference_climo_file(self, var):
"""Return a list of files to be used as reference (aka baseline) for variable var."""
if self.adf.compare_obs:
fils = self.ref_var_loc.get(var, None)
return [fils] if fils is not None else None
ref_loc = self.adf.get_baseline_info("cam_climo_loc")
# NOTE: originally had this looking for *_baseline.nc
fils = sorted(Path(ref_loc).glob(f"{self.ref_case_label}_{var}_climo.nc"))
if fils:
return fils
return None

def load_reference_dataset(self, var):
fils = self.get_reference_climo_file(var)
if not fils:
warnings.warn(f"ERROR: Did not find any reference files for variable: {var}. Will try to skip.")
return None
return self.load_dataset(fils)

def load_reference_da(self, variablename):
da = self.load_reference_dataset(variablename)[self.ref_var_nam[variablename]]
if variablename in self.adf.variable_defaults:
vres = self.adf.variable_defaults[variablename]
if self.adf.compare_obs:
scale_factor = vres.get("obs_scale_factor",1)
add_offset = vres.get("obs_add_offset", 0)
else:
scale_factor = vres.get("scale_factor",1)
add_offset = vres.get("add_offset", 0)
da = da * scale_factor + add_offset
da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
return da


def load_reference_regrid_dataset(self, case, field):
fils = self.get_ref_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_dataset(fils)

def set_ref_var_loc(self):
"""Set reference climo file locations"""
for v in self.adf.diag_var_list:
f = self.get_reference_climo_file(v)
self.ref_var_loc[v] = f

def load_reference_regrid_da(self, case, field):
fils = self.get_ref_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_da(fils, field)


def load_climo_da(self, case, variablename):
"""Return DataArray from climo file"""
fils = self.get_climo_file(case, variablename)
return self.load_da(fils, variablename)


def load_climo_file(self, case, variablename):
"""Return Dataset for climo of variablename"""
fils = self.get_climo_file(case, variablename)
if not fils:
warnings.warn(f"ERROR: Did not find climo file for variable: {variablename}. Will try to skip.")
return None
return self.load_dataset(fils)


def get_climo_file(self, case, variablename):
"""Retrieve the climo file path(s) for variablename for a specific case."""
a = self.adf.get_cam_info("cam_climo_loc", required=True) # list of paths (could be multiple cases)
caseindex = (self.case_names).index(case) # the entry for specified case
model_cl_loc = Path(a[caseindex])
return sorted(model_cl_loc.glob(f"{case}_{variablename}_climo.nc"))

# Time series files
#------------------
# Test case(s)
def get_timeseries_file(self, case, field):
"""Return list of test time series files"""
ts_locs = self.adf.get_cam_info("cam_ts_loc", required=True) # list of paths (could be multiple cases)
caseindex = (self.case_names).index(case)
ts_loc = Path(ts_locs[caseindex])
ts_filenames = f'{case}.*.{field}.*nc'
ts_files = sorted(ts_loc.glob(ts_filenames))
return ts_files


# Reference case (baseline/obs)
def get_ref_timeseries_file(self, field):
"""Return list of reference time series files"""
if self.adf.compare_obs:
return None
else:
@@ -177,6 +115,7 @@ def get_ref_timeseries_file(self, field):


def load_timeseries_dataset(self, fils):
"""Return DataSet from time series file(s) and assign time to midpoint of interval"""
if (len(fils) == 0):
warnings.warn("Input file list is empty.")
return None
@@ -203,32 +142,127 @@ def load_timeseries_dataset(self, fils):
warnings.warn("Timeseries file does not have time bounds info.")
return xr.decode_cf(ds)
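The rest of this function's body is not shown in the hunk above. As a rough illustration only, not necessarily the ADF implementation and with the bounds dimension name assumed to be 'nbnd', assigning time to the midpoint of each time_bnds interval generally looks like:

def assign_time_midpoints(ds):
    """Illustrative sketch only: set 'time' to the midpoint of each time_bnds interval."""
    if "time_bnds" in ds:
        # average over the bounds dimension (name assumed here to be 'nbnd')
        tmid = ds["time_bnds"].mean(dim="nbnd")
        tmid.attrs = ds["time"].attrs  # keep units/calendar so xr.decode_cf still works
        ds = ds.assign_coords(time=tmid)
    return ds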

def get_ref_regrid_file(self, case, field):
model_rg_loc = Path(self.adf.get_basic_info("cam_regrid_loc", required=True))
return sorted(model_rg_loc.glob(f"{case}_{field}_*.nc"))

#------------------


# Climatology files
#------------------

# Test case(s)
def load_climo_da(self, case, variablename):
"""Return DataArray from climo file"""
add_offset, scale_factor = self.get_defaults(case, variablename)
fils = self.get_climo_file(case, variablename)
return self.load_da(fils, variablename, add_offset=add_offset, scale_factor=scale_factor)


def load_climo_file(self, case, variablename):
"""Return Dataset for climo of variablename"""
fils = self.get_climo_file(case, variablename)
if not fils:
warnings.warn(f"ERROR: Did not find climo file for variable: {variablename}. Will try to skip.")
Collaborator:

I would change ERROR to WARNING here, so that users realize that the ADF will keep going.

Suggested change
warnings.warn(f"ERROR: Did not find climo file for variable: {variablename}. Will try to skip.")
warnings.warn(f"WARNING: Did not find climo file for variable: {variablename}. Will try to skip.")

The same request holds for the other warnings in this file as well.

return None
return self.load_dataset(fils)


def get_climo_file(self, case, variablename):
"""Retrieve the climo file path(s) for variablename for a specific case."""
a = self.adf.get_cam_info("cam_climo_loc", required=True) # list of paths (could be multiple cases)
caseindex = (self.case_names).index(case) # the entry for specified case
model_cl_loc = Path(a[caseindex])
return sorted(model_cl_loc.glob(f"{case}_{variablename}_climo.nc"))


# Reference case (baseline/obs)
def get_reference_climo_file(self, var):
"""Return a list of files to be used as reference (aka baseline) for variable var."""
if self.adf.compare_obs:
fils = self.ref_var_loc.get(var, None)
return [fils] if fils is not None else None
ref_loc = self.adf.get_baseline_info("cam_climo_loc")
# NOTE: originally had this looking for *_baseline.nc
fils = sorted(Path(ref_loc).glob(f"{self.ref_case_label}_{var}_climo.nc"))
if fils:
return fils
return None

#------------------


# Regridded files
#------------------

# Test case(s)
def get_regrid_file(self, case, field):
"""Return list of test regridded files"""
model_rg_loc = Path(self.adf.get_basic_info("cam_regrid_loc", required=True))
rlbl = self.ref_labels[field] # rlbl = "reference label" = the name of the reference data that defines target grid
return sorted(model_rg_loc.glob(f"{rlbl}_{case}_{field}_*.nc"))


def load_regrid_dataset(self, case, field):
"""Return a data set to be used as reference (aka baseline) for variable field."""
fils = self.get_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_dataset(fils)


def load_regrid_da(self, case, field):
"""Return a data array to be used as reference (aka baseline) for variable field."""
add_offset, scale_factor = self.get_defaults(case, field)
fils = self.get_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_da(fils, field)
return self.load_da(fils, field, add_offset=add_offset, scale_factor=scale_factor)


# Reference case (baseline/obs)
def get_ref_regrid_file(self, case, field):
"""Return list of reference regridded files"""
if self.adf.compare_obs:
obs_loc = self.ref_var_loc.get(field, None)
fils = [str(obs_loc)]
else:
model_rg_loc = Path(self.adf.get_basic_info("cam_regrid_loc", required=True))
fils = sorted(model_rg_loc.glob(f"{case}_{field}_*.nc"))
return fils


def load_reference_regrid_dataset(self, case, field):
"""Return a data set to be used as reference (aka baseline) for variable field."""
fils = self.get_ref_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_dataset(fils)


def load_reference_regrid_da(self, case, field):
"""Return a data array to be used as reference (aka baseline) for variable field."""
add_offset, scale_factor = self.get_defaults(case, field)
fils = self.get_ref_regrid_file(case, field)
if not fils:
warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
#Change the variable name from CAM standard to what is
# listed in variable defaults for this observation field
if self.adf.compare_obs:
field = self.ref_var_nam[field]
return self.load_da(fils, field, add_offset=add_offset, scale_factor=scale_factor)

#------------------


# DataSet and DataArray load
#---------------------------

# Load DataSet
def load_dataset(self, fils):
"""Return xarray DataSet from file(s)"""
if (len(fils) == 0):
warnings.warn("Input file list is empty.")
return None
@@ -244,15 +278,50 @@ def load_dataset(self, fils):
warnings.warn(f"invalid data on load_dataset")
return ds


def load_da(self, fils, variablename):
# Load DataArray
def load_da(self, fils, variablename, **kwargs):
"""Return xarray DataArray from files(s) w/ optional scale factor, offset, and/or new units"""
ds = self.load_dataset(fils)
if ds is None:
warnings.warn(f"ERROR: Load failed for {variablename}")
return None
da = (ds[variablename]).squeeze()

da = da * kwargs["scale_factor"] + kwargs["add_offset"]
Collaborator:
I think we probably just need to make this a bit more robust with something like
scale_factor = kwargs.get('scale_factor', 1)
add_offset = kwargs.get('add_offset', 0)
Just in case it gets used without the kwargs being passed.

Collaborator:
Actually, maybe use get_defaults to do this?

Collaborator (author):
That's what I was attempting to do with get_defaults; it should set those to the defaults before they become the kwargs.

I have the offset set to 0 and the scale factor set to 1 initially; then the "if variablename in res" check grabs the values, and if they are missing it falls back to 0 and 1 respectively via vres.get.

And if the variable is not in self.adf.variable_defaults, then new_unit, add_offset, and scale_factor keep their initial values of none, 0, and 1.
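
In other words, the fallback chain being described, restated here as a condensed sketch rather than the PR's actual code, is roughly:

res = self.adf.variable_defaults
vres = res.get(variablename, {})            # empty dict when the variable has no defaults entry
scale_factor = vres.get("scale_factor", 1)  # 1 when the key (or the whole entry) is missing
add_offset = vres.get("add_offset", 0)      # 0 when the key (or the whole entry) is missing
# (when comparing against obs, the obs_scale_factor / obs_add_offset keys are used instead)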

Collaborator:
I agree with @brianpm's original suggestion here. Right now if someone tried to use this function without passing in the scale_factor and add_offset keywords I believe the function would crash with a dictionary key error.
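
For illustration, a minimal sketch of the more defensive version being suggested (sketch only, not the code in this PR), using kwargs.get so the call still works when neither keyword is passed:

def load_da(self, fils, variablename, **kwargs):
    """Return xarray DataArray from file(s) w/ optional scale factor and offset."""
    ds = self.load_dataset(fils)
    if ds is None:
        warnings.warn(f"ERROR: Load failed for {variablename}")
        return None
    da = ds[variablename].squeeze()
    scale_factor = kwargs.get("scale_factor", 1)  # neutral default: no rescaling
    add_offset = kwargs.get("add_offset", 0)      # neutral default: no offset
    return da * scale_factor + add_offset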

if variablename in self.adf.variable_defaults:
vres = self.adf.variable_defaults[variablename]
da = da * vres.get("scale_factor",1) + vres.get("add_offset", 0)
da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
else:
da.attrs['units'] = 'none'
return da

# Get variable defaults, if applicable
def get_defaults(self, case, variablename):
"""
Get variable defaults if applicable

- This is to get any scale factors or offsets

Returns
-------
add_offset - int/float
scale_factor - int/float
"""
add_offset = 0
scale_factor = 1
res = self.adf.variable_defaults
if variablename in res:
vres = res[variablename]
if (case == self.ref_labels[variablename]) and (self.adf.compare_obs):
scale_factor = vres.get("obs_scale_factor",1)
add_offset = vres.get("obs_add_offset", 0)
else:
scale_factor = vres.get("scale_factor",1)
add_offset = vres.get("add_offset", 0)
return add_offset, scale_factor

#------------------




20 changes: 12 additions & 8 deletions lib/adf_info.py
@@ -163,6 +163,9 @@ def __init__(self, config_file, debug=False):
#that check this variable won't crash:
self.__cam_bl_climo_info = None

# Set baseline hist string object to None
self.__base_hist_str = None

#Also set data name for use below:
data_name = "Obs"
base_nickname = "Obs"
@@ -778,16 +781,17 @@ def get_climo_yrs_from_ts(self, input_ts_loc, case_name):
errmsg = f"Time series directory '{input_ts_loc}' not found. Script is exiting."
raise AdfError(errmsg)

# Search for first variable in var_list to get a time series file to read
# Search for first available variable in var_list to get a time series file to read
# NOTE: it is assumed all the variables have the same dates!
# Also, it is assumed that only h0 files should be climo-ed.
ts_files = sorted(input_location.glob(f"{case_name}*h0*.{var_list[0]}.*nc"))

#Read hist_str (component.hist_num) from the yaml file, or set to default
hist_str = self.get_basic_info('hist_str')
#If hist_str is not present, then default to 'cam.h0':
if not hist_str:
hist_str = 'cam.h0'
for var in var_list:
ts_files = sorted(input_location.glob(f"{case_name}*h0*.{var}.*nc"))
if ts_files:
break
else:
logmsg = "get years for time series:"
logmsg += f"\tVar '{var}' not in dataset, skip to next to try and find climo years..."
self.debug_log(logmsg)

#Read in file(s)
if len(ts_files) == 1: