AdfData class for streamlined I/O in plotting scripts #269

Merged Jul 11, 2024 · 52 commits (changes shown from 34 commits)
b28b6ac
sketching out a path for AdfData; adf_dataset.py and zonal_Mean_B.py
brianpm Oct 27, 2023
796a8c0
working on making it run
brianpm Oct 27, 2023
18706b3
fix getting reference case path
brianpm Oct 27, 2023
efdfba5
allow reference case to have a label
brianpm Oct 27, 2023
6456fdc
adfdata applied to global maps script
brianpm Dec 1, 2023
aababb2
adfdata applied to global maps script -- rename function
brianpm Dec 1, 2023
988011b
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
91bce2a
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
bf02a1f
adfdata applied to global maps script -- debugging
brianpm Dec 1, 2023
71578ef
debugging
brianpm Dec 1, 2023
06d3ea1
debugging
brianpm Dec 1, 2023
11f1477
debugging
brianpm Dec 1, 2023
c7286b7
debugging
brianpm Dec 1, 2023
97e1cbd
debugging
brianpm Dec 1, 2023
9068ec9
debugging -- fix load_da; actually return the DataArray
brianpm Dec 1, 2023
ceecec9
debugging
brianpm Dec 1, 2023
b48d01a
debugging -- fix load_reference_da; actually return the DataArray
brianpm Dec 1, 2023
dd015ef
debugging
brianpm Dec 1, 2023
84150a4
debugging
brianpm Dec 1, 2023
42614fb
debugging
brianpm Dec 1, 2023
b7765ab
debugging
brianpm Dec 1, 2023
857a441
debugging
brianpm Dec 1, 2023
577c9e7
debugging
brianpm Dec 1, 2023
87aebf2
debugging
brianpm Dec 1, 2023
c751d02
debugging
brianpm Dec 1, 2023
f9f296d
completed updates for AdfData and implement in zonal mean and global …
brianpm Dec 1, 2023
9ec949b
remove extra debugging print statements
brianpm Dec 1, 2023
3adbfe0
refactor plot_file_op for easier logic
brianpm Dec 1, 2023
51394a6
seasonal averaging moved to inside season loop
brianpm Dec 1, 2023
cf2128c
seasonal averaging moved to inside season loop
brianpm Dec 1, 2023
3be03db
correct zonal mean error message
brianpm Dec 1, 2023
536d2f2
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Apr 12, 2024
6b9f671
updated adf_dataset.py
brianpm Apr 19, 2024
7d2c946
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Apr 19, 2024
5d46906
starting to address PR comments. First round done on adf_dataset.py
brianpm Jun 12, 2024
ce50839
merged my conflicted adf_dataset.py
brianpm Jun 12, 2024
142d4e5
addressing PR comments. Instantiate AdfData from AdfDiag
brianpm Jun 12, 2024
935b6f4
Merge branch 'NCAR:main' into adf_case_dataclass
brianpm Jun 12, 2024
cae606f
testing AdfData. Fixes for history file naming. Fix to force loading …
brianpm Jun 12, 2024
0183799
removed commented lines
brianpm Jun 12, 2024
26a059c
addressing Jesse's comments on PR
brianpm Jun 18, 2024
7d6cdc9
Replace and with versions using new class
brianpm Jun 21, 2024
df14b1b
try to merge amwg_table from ADF/main
brianpm Jun 21, 2024
b966637
resolve upstream conflicts
brianpm Jun 21, 2024
39da27e
add back useful changes to adf_info
brianpm Jun 21, 2024
2d5a7dc
Merge branch 'main' into adf_case_dataclass
justin-richling Jun 21, 2024
de213fc
Merge branch 'main' into adf_case_dataclass
justin-richling Jun 26, 2024
231aaaa
bug fixes.
brianpm Jun 27, 2024
6e20dae
correct arguments for load_reference_regrid_da
brianpm Jun 27, 2024
496d5c7
Merge branch 'main' into adf_case_dataclass
justin-richling Jul 11, 2024
e36585f
trying to fix linting errors
brianpm Jul 11, 2024
febddf5
added load method for timeseries files
brianpm Jul 11, 2024
210 changes: 210 additions & 0 deletions lib/adf_dataset.py
@@ -0,0 +1,210 @@
from pathlib import Path
import xarray as xr

import warnings # use to warn user about missing files

def my_formatwarning(msg, *args, **kwargs):
# ignore everything except the message
return str(msg) + '\n'
warnings.formatwarning = my_formatwarning

# "reference data"
# It is often just a "baseline case",
# but could be some totally external data (reanalysis or observation or other model)
# When it is another simulation, it gets treated like another "case"
# When it is external data expect:
# - "climo" files (12 monthly climos in the file)
# - one variable per "climo"
# - source can differ for each variable, requires label
# - resolution can differ for each variable, requires regridded file(s)
# - the variable name and units in the file may differ from CAM; use defaults.yaml to set conversion
# - there could be multiple instances of a variable from different sources (e.g. different observations)

# NOTE: the last item (multiple instances of a variable) is not allowed in AdfObs.var_obs_dict.
# Since ADF is not able to handle this case, for now it is excluded from the AdfData class.

# NOTE: To make the "baseline case" vs "external data" cases as similar as possible,
# below construct the "baseline case" version to be similar to "external data".
# - provide a dictionary of (variable: file-path)
+ For external data, that dictionary is from AdfObs.var_obs_dict,
# which provides a dict of all the available variables.
# + For reference simulation, look for files that match the diag_var_list

# NOTE: There is currently a "base_nickname" allowed from AdfInfo.
# Set AdfData.ref_nickname to that.
# Could be altered from "Obs" to be the data source label.

class AdfData:
"""A class instantiated with an AdfDiag object.
Methods provide means to load data.
This class does not interact with plotting,
just provides access to data locations and loading data.

A future need is to add some kind of frequency/sampling
parameters to allow for non-h0 files.

"""
def __init__(self, adfobj):
self.adf = adfobj # provides quick access to the AdfDiag object
# paths
self.model_rgrid_loc = adfobj.get_basic_info("cam_regrid_loc", required=True)

# variables (and info for unit transform)
self.var_list = adfobj.diag_var_list
self.res = adfobj.variable_defaults

# case names and nicknames
self.case_names = adfobj.get_cam_info("cam_case_name", required=True)
self.test_nicknames = adfobj.case_nicknames["test_nicknames"]
self.base_nickname = adfobj.case_nicknames["base_nickname"]
self.ref_nickname = self.base_nickname

# define reference data
self.reference_is_obs = adfobj.get_basic_info("compare_obs")
self.set_reference() # specify "ref_labels" -> called "data_list" in zonal_mean (name of data source)

def set_reference(self):
"""Set attributes for reference (aka baseline) data location, names, and variables."""
if self.reference_is_obs:
self.ref_var_loc = {v: self.adf.var_obs_dict[v]['obs_file'] for v in self.adf.var_obs_dict}
self.ref_labels = {v: self.adf.var_obs_dict[v]['obs_name'] for v in self.adf.var_obs_dict}
self.ref_var_nam = {v: self.adf.var_obs_dict[v]['obs_var'] for v in self.adf.var_obs_dict}
if not self.adf.var_obs_dict:
print("\t WARNING: reference is observations, but no observations found to plot against.")
else:
self.ref_var_loc = {}
self.ref_var_nam = {}
self.ref_labels = {}
# when using a reference simulation, allow a "special" attribute with the case name:
self.ref_case_label = self.adf.get_baseline_info("cam_case_name", required=True)
for v in self.var_list:
f = self.get_reference_climo_file(v)
if f is None:
print(f"\t WARNING: ADFData found no reference climo file for {v}")
continue
else:
self.ref_var_loc[v] = f
self.ref_var_nam[v] = v
self.ref_labels[v] = self.adf.get_baseline_info("cam_case_name", required=True)

def get_reference_climo_file(self, var):
"""Return a list of files to be used as reference (aka baseline) for variable var."""
if self.reference_is_obs:
fils = self.ref_var_loc.get(var, None)
return [fils] if fils is not None else None
self.ref_loc = self.adf.get_baseline_info("cam_climo_loc")
# NOTE: originally had this looking for *_baseline.nc
fils = sorted(Path(self.ref_loc).glob(f"{self.ref_case_label}_{var}_climo.nc"))
if fils:
return fils
return None

def load_reference_dataset(self, var):
fils = self.get_reference_climo_file(var)
if not fils:
print(f"ERROR: Did not find any reference files for variable: {var}. Will try to skip.")
return None
return self.load_dataset(fils)

def load_reference_da(self, variablename):
ds = self.load_reference_dataset(variablename)
if ds is None:
return None
da = ds[self.ref_var_nam[variablename]]
if variablename in self.res:
vres = self.res[variablename]
if self.reference_is_obs:
scale_factor = vres.get("obs_scale_factor",1)
add_offset = vres.get("obs_add_offset", 0)
else:
scale_factor = vres.get("scale_factor",1)
add_offset = vres.get("add_offset", 0)
da = da * scale_factor + add_offset
da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
return da


def load_climo_da(self, case, variablename):
"""Return DataArray from climo file"""
fils = self.get_climo_file(case, variablename)
return self.load_da(fils, variablename)


def load_climo_file(self, case, variablename):
"""Return Dataset for climo of variablename"""
fils = self.get_climo_file(case, variablename)
if not fils:
print(f"ERROR: Did not find climo file for variable: {variablename}. Will try to skip.")
return None
return self.load_dataset(fils)


def get_climo_file(self, case, variablename):
"""Retrieve the climo file path(s) for variablename for a specific case."""
a = self.adf.get_cam_info("cam_climo_loc", required=True) # list of paths (could be multiple cases)
caseindex = (self.case_names).index(case) # the entry for specified case
# print(f"Checking if case name is in the climo loc entry: {case in a[caseindex]}")
model_cl_loc = Path(a[caseindex])
return sorted(model_cl_loc.glob(f"{case}_{variablename}_climo.nc"))

def get_timeseries_file(self, case, field):
ts_locs = self.adf.get_cam_info("cam_ts_loc", required=True)
caseindex = self.case_names.index(case) # cam_ts_loc is a list; look up entry for this case
ts_loc = Path(ts_locs[caseindex])
ts_filenames = f'{case}.*.{field}.*nc'
ts_files = sorted(ts_loc.glob(ts_filenames))
return ts_files

def get_ref_timeseries_file(self, field):
if self.reference_is_obs:
return None
else:
ts_loc = Path(self.adf.get_baseline_info("cam_ts_loc", required=True))
ts_filenames = f'{self.ref_case_label}.*.{field}.*nc'
ts_files = sorted(ts_loc.glob(ts_filenames))
return ts_files


def get_regrid_file(self, case, field):
model_rg_loc = Path(self.adf.get_basic_info("cam_regrid_loc", required=True))
rlbl = self.ref_labels[field] # rlbl = "reference label" = the name of the reference data that defines target grid
return sorted(model_rg_loc.glob(f"{rlbl}_{case}_{field}_*.nc"))

def load_regrid_dataset(self, case, field):
fils = self.get_regrid_file(case, field)
if not fils:
print(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_dataset(fils)

def load_regrid_da(self, case, field):
fils = self.get_regrid_file(case, field)
if not fils:
print(f"ERROR: Did not find regrid file(s) for case: {case}, variable: {field}")
return None
return self.load_da(fils, field)

def get_file_list(self):
pass

def load_dataset(self, fils):
if (len(fils) == 0):
warnings.warn("Input file list is empty.")
return None
elif (len(fils) > 1):
ds = xr.open_mfdataset(fils, combine='by_coords')
else:
sfil = str(fils[0])
assert Path(sfil).is_file(), f"Needs to be a file: {sfil}"
ds = xr.open_dataset(sfil)
if ds is None:
warnings.warn("invalid data on load_dataset")
return ds

def load_da(self, fils, variablename):
ds = self.load_dataset(fils)
if ds is None:
print(f"ERROR: Load failed for {variablename}")
return None
da = (ds[variablename]).squeeze()
if variablename in self.res:
vres = self.res[variablename]
da = da * vres.get("scale_factor",1) + vres.get("add_offset", 0)
da.attrs['units'] = vres.get("new_unit", da.attrs.get('units', 'none'))
return da
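The scale_factor/add_offset convention that load_da and load_reference_da apply (driven by the variable defaults) can be sketched with plain floats. Here `convert_units` is a hypothetical helper, not part of the class, and the precipitation factor is an illustrative assumption:

```python
# Sketch of the defaults.yaml-style unit transform used by load_da:
#   new_value = value * scale_factor + add_offset
def convert_units(value, vres):
    """Apply the scale/offset transform from a variable-defaults dict."""
    return value * vres.get("scale_factor", 1) + vres.get("add_offset", 0)

# e.g. converting a precipitation rate from m/s to mm/day (factor 8.64e7):
vres = {"scale_factor": 8.64e7, "new_unit": "mm/day"}
print(convert_units(1.0e-8, vres))  # ≈ 0.864 mm/day

# With no entry in the defaults, the value passes through unchanged:
print(convert_units(5.0, {}))
```

The same expression is applied elementwise to the xarray DataArray in the class, followed by overwriting the `units` attribute with `new_unit`.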
4 changes: 2 additions & 2 deletions lib/plotting_functions.py
@@ -2317,9 +2317,9 @@ def square_contour_difference(fld1, fld2, **kwargs):
mnorm = mpl.colors.Normalize(mn, mx)

coord1, coord2 = fld1.coords # ASSUMES xarray WITH coords AND 2-dimensions
print(f"{coord1}, {coord2}")
# print(f"{coord1}, {coord2}")
xx, yy = np.meshgrid(fld1[coord2], fld1[coord1])
print(f"shape of meshgrid: {xx.shape}")
# print(f"shape of meshgrid: {xx.shape}")

img1 = ax1.contourf(xx, yy, fld1.transpose())
if (coord1 == 'month') and (fld1.shape[0] ==12):
15 changes: 13 additions & 2 deletions scripts/averaging/create_climo_files.py
@@ -8,11 +8,20 @@ def my_formatwarning(msg, *args, **kwargs):
return str(msg) + '\n'
warnings.formatwarning = my_formatwarning


import numpy as np
import xarray as xr # module-level import so all functions can get to it.

import multiprocessing as mp

def get_time_slice_by_year(time, startyear, endyear):
if not hasattr(time, 'dt'):
print("Warning: get_time_slice_by_year requires the `time` parameter to be an xarray time coordinate with a dt accessor. Returning generic slice (which will probably fail).")
return slice(startyear, endyear)
start_time_index = np.argwhere((time.dt.year >= startyear).values).flatten().min()
end_time_index = np.argwhere((time.dt.year <= endyear).values).flatten().max()
return slice(start_time_index, end_time_index+1)



##############
#Main function
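The index-slice logic of `get_time_slice_by_year` above can be sketched with a plain array of calendar years standing in for the time coordinate's `.dt.year` (`slice_by_year` is a hypothetical stand-in, not part of the script):

```python
import numpy as np

# Sketch of get_time_slice_by_year: find the first index whose year is
# >= startyear and the last index whose year is <= endyear, then return
# a half-open positional slice covering that range.
def slice_by_year(years, startyear, endyear):
    start = np.argwhere(years >= startyear).flatten().min()
    end = np.argwhere(years <= endyear).flatten().max()
    return slice(int(start), int(end) + 1)

years = np.repeat(np.arange(1995, 2001), 12)  # six years of monthly samples
print(slice_by_year(years, 1996, 1998))       # slice(12, 48) -> 36 months
```

Because the slice is positional, it is used with `.isel(time=...)` rather than label-based `.sel`, which avoids calendar/decoding pitfalls when the year bounds are plain integers.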
@@ -205,7 +214,9 @@ def process_variable(ts_files, syr, eyr, output_file):
cam_ts_data.assign_coords(time=time)
cam_ts_data = xr.decode_cf(cam_ts_data)
#Extract data subset using provided year bounds:
cam_ts_data = cam_ts_data.sel(time=slice(syr, eyr))
#cam_ts_data = cam_ts_data.sel(time=slice(syr, eyr))
tslice = get_time_slice_by_year(cam_ts_data.time, int(syr), int(eyr))
cam_ts_data = cam_ts_data.isel(time=tslice)
#Group time series values by month, and average those months together:
cam_climo_data = cam_ts_data.groupby('time.month').mean(dim='time')
#Rename "months" to "time":
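The `groupby('time.month').mean(dim='time')` climatology step can be sketched in plain numpy (`monthly_climo` is a hypothetical stand-in for the xarray groupby):

```python
import numpy as np

# Sketch of the monthly-climatology step: average all samples sharing the
# same calendar month (1..12) across all years in the time series.
def monthly_climo(values, months):
    return np.array([values[months == m].mean() for m in range(1, 13)])

months = np.tile(np.arange(1, 13), 3)  # three years of monthly data
values = np.arange(36, dtype=float)
climo = monthly_climo(values, months)  # each month averages its 3 yearly samples
```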