
AdfData class for streamlined I/O in plotting scripts #269

Merged: 52 commits, Jul 11, 2024
Commits
b28b6ac
sketching out a path for AdfData; adf_dataset.py and zonal_Mean_B.py
brianpm Oct 27, 2023
796a8c0
working on making it run
brianpm Oct 27, 2023
18706b3
fix getting reference case path
brianpm Oct 27, 2023
efdfba5
allow reference case to have a label
brianpm Oct 27, 2023
6456fdc
adfdata applied to global maps script
brianpm Dec 1, 2023
aababb2
adfdata applied to global maps script -- rename function
brianpm Dec 1, 2023
988011b
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
91bce2a
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
bf02a1f
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
71578ef
debugging
brianpm Dec 1, 2023
06d3ea1
debugging
brianpm Dec 1, 2023
11f1477
debugging
brianpm Dec 1, 2023
c7286b7
debugging
brianpm Dec 1, 2023
97e1cbd
debugging
brianpm Dec 1, 2023
9068ec9
debugging -- fix load_da; actually return the DataArray
brianpm Dec 1, 2023
ceecec9
debugging
brianpm Dec 1, 2023
b48d01a
debugging -- fix load_reference_da; actually return the DataArray
brianpm Dec 1, 2023
dd015ef
debugging
brianpm Dec 1, 2023
84150a4
debugging
brianpm Dec 1, 2023
42614fb
debugging
brianpm Dec 1, 2023
b7765ab
debugging
brianpm Dec 1, 2023
857a441
debugging
brianpm Dec 1, 2023
577c9e7
debugging
brianpm Dec 1, 2023
87aebf2
debugging
brianpm Dec 1, 2023
c751d02
debugging
brianpm Dec 1, 2023
f9f296d
completed updates for AdfData and implement in zonal mean and global …
brianpm Dec 1, 2023
9ec949b
remove extra debugging print statements
brianpm Dec 1, 2023
3adbfe0
refactor plot_file_op for easier logic
brianpm Dec 1, 2023
51394a6
seasonal averaging moved to inside season loop
brianpm Dec 1, 2023
cf2128c
seasonal averaging moved to inside season loop
brianpm Dec 1, 2023
3be03db
correct zonal mean error message
brianpm Dec 1, 2023
536d2f2
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Apr 12, 2024
6b9f671
updated adf_dataset.py
brianpm Apr 19, 2024
7d2c946
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Apr 19, 2024
5d46906
starting to address PR comments. First round done on adf_dataset.py
brianpm Jun 12, 2024
ce50839
merged my conflicted adf_dataset.py
brianpm Jun 12, 2024
142d4e5
addressing PR comments. Instantiate AdfData from AdfDiag
brianpm Jun 12, 2024
935b6f4
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Jun 12, 2024
cae606f
testing AdfData. Fixes for history file naming. Fix to force loading …
brianpm Jun 12, 2024
0183799
removed commented lines
brianpm Jun 12, 2024
26a059c
addressing Jesse's comments on PR
brianpm Jun 18, 2024
7d6cdc9
Replace and with versions using new class
brianpm Jun 21, 2024
df14b1b
try to merge amwg_table from ADF/main
brianpm Jun 21, 2024
b966637
resolve upstream conflicts
brianpm Jun 21, 2024
39da27e
add back useful changes to adf_info
brianpm Jun 21, 2024
2d5a7dc
Merge branch 'main' into adf_case_dataclass
justin-richling Jun 21, 2024
de213fc
Merge branch 'main' into adf_case_dataclass
justin-richling Jun 26, 2024
231aaaa
bug fixes.
brianpm Jun 27, 2024
6e20dae
correct arguments for load_reference_regrid_da
brianpm Jun 27, 2024
496d5c7
Merge branch 'main' into adf_case_dataclass
justin-richling Jul 11, 2024
e36585f
trying to fix linting errors
brianpm Jul 11, 2024
febddf5
added load method for timeseries files
brianpm Jul 11, 2024
209 changes: 209 additions & 0 deletions lib/adf_dataset.py
@@ -0,0 +1,209 @@
from pathlib import Path
import xarray as xr

import warnings # use to warn user about missing files

def my_formatwarning(msg, *args, **kwargs):
    # ignore everything except the message
    return str(msg) + '\n'
warnings.formatwarning = my_formatwarning

# "reference data"
# It is often just a "baseline case",
# but could be some totally external data (reanalysis or observation or other model)
# When it is another simulation, it gets treated like another "case"
# When it is external data expect:
# - "climo" files (12 monthly climos in the file)
# - one variable per "climo"
# - source can differ for each variable, requires label
# - resolution can differ for each variable, requires regridded file(s)
# - the variable name and units in the file may differ from CAM; use defaults.yaml to set conversion
# - there could be multiple instances of a variable from different sources (e.g. different observations)

# NOTE: the last item (multiple instances of a variable) is not allowed in AdfObs.var_obs_dict
# Since ADF is not able to handle this case, for now it is excluded from the AdfData class.

# NOTE: To make the "baseline case" vs "external data" cases as similar as possible,
# the "baseline case" version below is constructed to be similar to "external data".
# - provide a dictionary of (variable: file-path)
#   + For external data, that dictionary is from AdfObs.var_obs_dict,
#     which provides a dict of all the available variables.
#   + For reference simulation, look for files that match the diag_var_list

# NOTE: There is currently a "base_nickname" allowed from AdfInfo.
# Set AdfData.ref_nickname to that.
# Could be altered from "Obs" to be the data source label.

class AdfData:
    """A class instantiated with an AdfDiag object.
    Methods provide means to load data.
    This class does not interact with plotting,
    just provides access to data locations and loading data.

    A future need is to add some kind of frequency/sampling
    parameters to allow for non-h0 files.

    """
    def __init__(self, adfobj):
        self.adf = adfobj  # provides quick access to the AdfDiag object
        # paths
        self.model_rgrid_loc = adfobj.get_basic_info("cam_regrid_loc", required=True)

        # variables (and info for unit transform)
        # use self.adf.diag_var_list and self.adf.variable_defaults

        # case names and nicknames
        self.case_names = adfobj.get_cam_info("cam_case_name", required=True)
        self.test_nicknames = adfobj.case_nicknames["test_nicknames"]
        self.base_nickname = adfobj.case_nicknames["base_nickname"]
        self.ref_nickname = self.base_nickname

        # define reference data
        self.set_reference()  # specify "ref_labels" -> called "data_list" in zonal_mean (name of data source)

    def set_reference(self):
        """Set attributes for reference (aka baseline) data location, names, and variables."""
        if self.adf.compare_obs:
            self.ref_var_loc = {v: self.adf.var_obs_dict[v]['obs_file'] for v in self.adf.var_obs_dict}
            self.ref_labels = {v: self.adf.var_obs_dict[v]['obs_name'] for v in self.adf.var_obs_dict}
            self.ref_var_nam = {v: self.adf.var_obs_dict[v]['obs_var'] for v in self.adf.var_obs_dict}
            if not self.adf.var_obs_dict:
                warnings.warn("\t WARNING: reference is observations, but no observations found to plot against.")
        else:
            self.ref_var_loc = {}
            self.ref_var_nam = {}
            self.ref_labels = {}
            # when using a reference simulation, allow a "special" attribute with the case name:
            self.ref_case_label = self.adf.get_baseline_info("cam_case_name", required=True)
            for v in self.adf.diag_var_list:
                f = self.get_reference_climo_file(v)
                if f is None:
                    warnings.warn(f"\t WARNING: AdfData found no reference climo file for {v}")
                    continue
                self.ref_var_loc[v] = f
                self.ref_var_nam[v] = v
                self.ref_labels[v] = self.ref_case_label  # reuse the case label resolved above

    def get_reference_climo_file(self, var):
        """Return a list of files to be used as reference (aka baseline) for variable var."""
        if self.adf.compare_obs:
            fils = self.ref_var_loc.get(var, None)
            return [fils] if fils is not None else None
        ref_loc = self.adf.get_baseline_info("cam_climo_loc")
        # NOTE: originally had this looking for *_baseline.nc
        fils = sorted(Path(ref_loc).glob(f"{self.ref_case_label}_{var}_climo.nc"))
        if fils:
            return fils
        return None

    def load_reference_dataset(self, var):
        fils = self.get_reference_climo_file(var)
        if not fils:
            warnings.warn(f"ERROR: Did not find any reference files for variable: {var}. Will try to skip.")
            return None
        return self.load_dataset(fils)

    def load_reference_da(self, variablename):
        ds = self.load_reference_dataset(variablename)
        if ds is None:
            # guard against a missing reference dataset before indexing into it
            return None
        da = ds[self.ref_var_nam[variablename]]
        if variablename in self.adf.variable_defaults:
            vres = self.adf.variable_defaults[variablename]
            if self.adf.compare_obs:
                scale_factor = vres.get("obs_scale_factor", 1)
                add_offset = vres.get("obs_add_offset", 0)
            else:
                scale_factor = vres.get("scale_factor", 1)
                add_offset = vres.get("add_offset", 0)
            da = da * scale_factor + add_offset
            da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
        return da


    def load_climo_da(self, case, variablename):
        """Return DataArray from climo file"""
        fils = self.get_climo_file(case, variablename)
        return self.load_da(fils, variablename)


    def load_climo_file(self, case, variablename):
        """Return Dataset for climo of variablename"""
        fils = self.get_climo_file(case, variablename)
        if not fils:
            warnings.warn(f"ERROR: Did not find climo file for variable: {variablename}. Will try to skip.")
            return None
        return self.load_dataset(fils)


    def get_climo_file(self, case, variablename):
        """Retrieve the climo file path(s) for variablename for a specific case."""
        a = self.adf.get_cam_info("cam_climo_loc", required=True)  # list of paths (could be multiple cases)
        caseindex = self.case_names.index(case)  # the entry for specified case
        model_cl_loc = Path(a[caseindex])
        return sorted(model_cl_loc.glob(f"{case}_{variablename}_climo.nc"))

    def get_timeseries_file(self, case, field):
        ts_locs = self.adf.get_cam_info("cam_ts_loc", required=True)  # list of paths (could be multiple cases)
        caseindex = self.case_names.index(case)
        ts_loc = Path(ts_locs[caseindex])
        ts_filenames = f'{case}.*.{field}.*nc'
        ts_files = sorted(ts_loc.glob(ts_filenames))
        return ts_files

    def get_ref_timeseries_file(self, field):
        if self.adf.compare_obs:
            return None
        ts_loc = Path(self.adf.get_baseline_info("cam_ts_loc", required=True))
        ts_filenames = f'{self.ref_case_label}.*.{field}.*nc'
        ts_files = sorted(ts_loc.glob(ts_filenames))
        return ts_files


    def get_regrid_file(self, case, field):
        model_rg_loc = Path(self.adf.get_basic_info("cam_regrid_loc", required=True))
        rlbl = self.ref_labels[field]  # rlbl = "reference label" = the name of the reference data that defines target grid
        return sorted(model_rg_loc.glob(f"{rlbl}_{case}_{field}_*.nc"))

    def load_regrid_dataset(self, case, field):
        fils = self.get_regrid_file(case, field)
        if not fils:
            warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
            return None
        return self.load_dataset(fils)

    def load_regrid_da(self, case, field):
        fils = self.get_regrid_file(case, field)
        if not fils:
            warnings.warn(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
            return None
        return self.load_da(fils, field)


    def load_dataset(self, fils):
        if len(fils) == 0:
            warnings.warn("Input file list is empty.")
            return None
        if len(fils) > 1:
            ds = xr.open_mfdataset(fils, combine='by_coords')
        else:
            sfil = str(fils[0])
            if not Path(sfil).is_file():
                warnings.warn(f"Expecting to find file: {sfil}")
                return None
            ds = xr.open_dataset(sfil)
        if ds is None:
            warnings.warn("invalid data on load_dataset")
        return ds


    def load_da(self, fils, variablename):
        ds = self.load_dataset(fils)
        if ds is None:
            warnings.warn(f"ERROR: Load failed for {variablename}")
            return None
        da = ds[variablename].squeeze()
        if variablename in self.adf.variable_defaults:
            vres = self.adf.variable_defaults[variablename]
            da = da * vres.get("scale_factor", 1) + vres.get("add_offset", 0)
            da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
        return da
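The unit handling in `load_da` (and `load_reference_da`) is a plain linear transform driven by the variable's entry in `defaults.yaml`: scale, then offset, then relabel the units attribute. A standalone sketch of that transform, using a hypothetical Pa-to-hPa entry (the `vres` values here are illustrative, not taken from the actual ADF defaults file):

```python
import numpy as np
import xarray as xr

# hypothetical defaults.yaml entry for a variable stored in Pa but plotted in hPa
vres = {"scale_factor": 0.01, "add_offset": 0, "new_unit": "hPa"}

da = xr.DataArray(np.array([100000.0, 85000.0]), dims="lev", attrs={"units": "Pa"})

# same transform load_da applies: scale, offset, then relabel units
da = da * vres.get("scale_factor", 1) + vres.get("add_offset", 0)
da.attrs["units"] = vres.get("new_unit", da.attrs.get("units", "none"))
```

Because xarray arithmetic drops attributes by default, setting `units` after the arithmetic (as `load_da` does) is what keeps the label consistent with the converted values.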
18 changes: 12 additions & 6 deletions lib/adf_diag.py
@@ -92,9 +92,9 @@

# +++++++++++++++++++++++++++++

# Finally, import needed ADF module:
# Finally, import needed ADF modules:
from adf_web import AdfWeb

from adf_dataset import AdfData

#################
# Helper functions
@@ -182,6 +182,9 @@ def __init__(self, config_file, debug=False):
# Add plotting script names:
self.__plotting_scripts = self.read_config_var("plotting_scripts")

# Provide convenience functions for data handling:
self.data = AdfData(self)

# Create property needed to return "plotting_scripts" variable to user:
@property
def plotting_scripts(self):
@@ -538,11 +541,13 @@ def call_ncrcat(cmd):
# Aerosol Calcs
#--------------
#Always make sure PMID is made if aerosols are desired in config file
# Since there's no requirement for `aerosol_zonal_list` to be included, allow it to be absent:
azl = res.get("aerosol_zonal_list", [])
if "PMID" not in diag_var_list:
if any(item in res["aerosol_zonal_list"] for item in diag_var_list):
if any(item in azl for item in diag_var_list):
diag_var_list += ["PMID"]
if "T" not in diag_var_list:
if any(item in res["aerosol_zonal_list"] for item in diag_var_list):
if any(item in azl for item in diag_var_list):
diag_var_list += ["T"]
#End aerosol calcs

@@ -1056,7 +1061,7 @@ def derive_variables(self, res=None, vars_to_derive=None, ts_dir=None, overwrite
print(ermsg)
else:
#Open a new dataset with all the constituent files/variables
ds = xr.open_mfdataset(constit_files)
ds = xr.open_mfdataset(constit_files).compute()

# create new file name for derived variable
derived_file = constit_files[0].replace(constit_list[0], var)
@@ -1088,7 +1093,8 @@ def derive_variables(self, res=None, vars_to_derive=None, ts_dir=None, overwrite
#These will be multiplied by rho (density of dry air)
ds_pmid_done = False
ds_t_done = False
if var in res["aerosol_zonal_list"]:
azl = res.get("aerosol_zonal_list", []) # User-defined defaults might not include aerosol zonal list
if var in azl:

#Only calculate once for all aerosol vars
if not ds_pmid_done:
18 changes: 16 additions & 2 deletions lib/adf_info.py
@@ -673,7 +673,15 @@ def get_climo_yrs_from_ts(self, input_ts_loc, case_name):

#Search for first variable in var_list to get a time series file to read
#NOTE: it is assumed all the variables have the same dates!
ts_files = sorted(input_location.glob(f"{case_name}*.{var_list[0]}.*nc"))

#Read hist_str (component.hist_num) from the yaml file, or set to default
hist_str = self.get_basic_info('hist_str')
#If hist_str is not present, then default to 'cam.h0':
if not hist_str:
hist_str = 'cam.h0'
#End if

ts_files = sorted(input_location.glob(f"{case_name}.{hist_str}.{var_list[0]}.*nc"))

#Read in file(s)
if len(ts_files) == 1:
@@ -683,10 +691,16 @@ def get_climo_yrs_from_ts(self, input_ts_loc, case_name):

#Average time dimension over time bounds, if bounds exist:
if 'time_bnds' in cam_ts_data:
timeBoundsName = 'time_bnds'
elif 'time_bounds' in cam_ts_data:
timeBoundsName = 'time_bounds'
else:
timeBoundsName = None
if timeBoundsName:
time = cam_ts_data['time']
#NOTE: force `load` here b/c if dask & time is cftime,
#throws a NotImplementedError:
time = xr.DataArray(cam_ts_data['time_bnds'].load().mean(dim='nbnd').values, dims=time.dims, attrs=time.attrs)
time = xr.DataArray(cam_ts_data[timeBoundsName].load().mean(dim='nbnd').values, dims=time.dims, attrs=time.attrs)
cam_ts_data['time'] = time
cam_ts_data.assign_coords(time=time)
cam_ts_data = xr.decode_cf(cam_ts_data)
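The `time_bnds`/`time_bounds` handling added above replaces each timestamp with the midpoint of its bounds interval, forcing `.load()` so the mean works under dask with cftime. A minimal sketch of that averaging on synthetic monthly bounds (using datetime64 rather than cftime for simplicity):

```python
import numpy as np
import xarray as xr

# synthetic monthly bounds: each time value should become the bounds midpoint
bnds = np.array([["2000-01-01", "2000-02-01"],
                 ["2000-02-01", "2000-03-01"]], dtype="datetime64[ns]")
ds = xr.Dataset(
    {"time_bnds": (("time", "nbnd"), bnds)},
    coords={"time": bnds[:, 0]},
)

# pick whichever bounds variable is present, as get_climo_yrs_from_ts now does
name = "time_bnds" if "time_bnds" in ds else "time_bounds"
time = ds["time"]
mid = xr.DataArray(ds[name].load().mean(dim="nbnd").values,
                   dims=time.dims, attrs=time.attrs)
ds["time"] = mid  # each timestamp is now the center of its averaging period
```

January's midpoint lands at 2000-01-16 12:00, which is why climatology code prefers bounds midpoints over the raw (often interval-end) timestamps CAM writes.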
2 changes: 0 additions & 2 deletions lib/plotting_functions.py
@@ -2317,9 +2317,7 @@ def square_contour_difference(fld1, fld2, **kwargs):
mnorm = mpl.colors.Normalize(mn, mx)

coord1, coord2 = fld1.coords # ASSUMES xarray WITH coords AND 2-dimensions
print(f"{coord1}, {coord2}")
xx, yy = np.meshgrid(fld1[coord2], fld1[coord1])
print(f"shape of meshgrid: {xx.shape}")

img1 = ax1.contourf(xx, yy, fld1.transpose())
if (coord1 == 'month') and (fld1.shape[0] ==12):
14 changes: 12 additions & 2 deletions scripts/averaging/create_climo_files.py
@@ -8,11 +8,20 @@ def my_formatwarning(msg, *args, **kwargs):
return str(msg) + '\n'
warnings.formatwarning = my_formatwarning


import numpy as np
import xarray as xr # module-level import so all functions can get to it.

import multiprocessing as mp

def get_time_slice_by_year(time, startyear, endyear):
    if not hasattr(time, 'dt'):
        print("Warning: get_time_slice_by_year requires the `time` parameter to be an xarray time coordinate with a dt accessor. Returning generic slice (which will probably fail).")
        return slice(startyear, endyear)
    start_time_index = np.argwhere((time.dt.year >= startyear).values).flatten().min()
    end_time_index = np.argwhere((time.dt.year <= endyear).values).flatten().max()
    return slice(start_time_index, end_time_index + 1)



##############
#Main function
@@ -205,7 +214,8 @@ def process_variable(ts_files, syr, eyr, output_file):
cam_ts_data.assign_coords(time=time)
cam_ts_data = xr.decode_cf(cam_ts_data)
#Extract data subset using provided year bounds:
cam_ts_data = cam_ts_data.sel(time=slice(syr, eyr))
tslice = get_time_slice_by_year(cam_ts_data.time, int(syr), int(eyr))
cam_ts_data = cam_ts_data.isel(time=tslice)
#Group time series values by month, and average those months together:
cam_climo_data = cam_ts_data.groupby('time.month').mean(dim='time')
#Rename "months" to "time":
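The new `get_time_slice_by_year` helper converts year bounds into positional indices, so `isel` works regardless of whether the time axis decodes to datetime64 or cftime (label-based `sel` with bare year strings is what the old code relied on). A quick check against a synthetic monthly axis, copying the helper from the diff above:

```python
import numpy as np
import pandas as pd
import xarray as xr

def get_time_slice_by_year(time, startyear, endyear):
    # mirror of the helper added in create_climo_files.py
    if not hasattr(time, 'dt'):
        return slice(startyear, endyear)
    start_time_index = np.argwhere((time.dt.year >= startyear).values).flatten().min()
    end_time_index = np.argwhere((time.dt.year <= endyear).values).flatten().max()
    return slice(start_time_index, end_time_index + 1)

# three years of monthly timestamps: indices 12..23 cover calendar year 2001
time = xr.DataArray(pd.date_range("2000-01-01", periods=36, freq="MS"), dims="time")
tslice = get_time_slice_by_year(time, 2001, 2001)  # -> slice(12, 24)
```

Note the `end_time_index + 1`: Python slices exclude the stop index, so the `+ 1` keeps the last December of the requested range in the climatology.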