From dde12c8be4b3172c95943ea621f00d3c67a902fd Mon Sep 17 00:00:00 2001 From: Benjamin Adams Date: Mon, 16 Sep 2024 11:50:12 -0400 Subject: [PATCH] WIP commit for domain ragged array representation --- compliance_checker/cf/cf_1_6.py | 5 +- compliance_checker/cf/cf_1_8.py | 6 +- compliance_checker/cf/cf_1_9.py | 137 +++++++++++++----- compliance_checker/cf/util.py | 37 +++-- compliance_checker/cfutil.py | 53 +++---- compliance_checker/tests/test_cf.py | 7 +- .../tests/test_feature_detection.py | 2 +- 7 files changed, 161 insertions(+), 86 deletions(-) diff --git a/compliance_checker/cf/cf_1_6.py b/compliance_checker/cf/cf_1_6.py index c77d5ea4..2bdb5f59 100644 --- a/compliance_checker/cf/cf_1_6.py +++ b/compliance_checker/cf/cf_1_6.py @@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds): # IS THIS EVEN NEEDED ANYMORE? # *************** def check_grid_coordinates(self, ds): - # def _check_grid_coordinates(self, ds): """ 5.6 When the coordinate variables for a horizontal grid are not longitude and latitude, it is required that the true latitude and @@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds): latitudes = cfutil.get_true_latitude_variables(ds) longitudes = cfutil.get_true_longitude_variables(ds) - check_featues = [ + check_features = [ "2d-regular-grid", "2d-static-grid", "3d-regular-grid", @@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds): # dimensions dimensions = set(ds.variables[variable].dimensions) # If it's not a grid, skip it - if cfutil.guess_feature_type(ds, variable) not in check_featues: + if cfutil.guess_feature_type(ds, variable) not in check_features: continue has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"]) diff --git a/compliance_checker/cf/cf_1_8.py b/compliance_checker/cf/cf_1_8.py index 6d51ba66..8f262db7 100644 --- a/compliance_checker/cf/cf_1_8.py +++ b/compliance_checker/cf/cf_1_8.py @@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset): results.append(geom_valid.to_result()) continue - node_count = reference_attr_variables( + node_count, node_count_errors = reference_attr_variables( ds, getattr(geometry_var, "node_count", None), ) # multipart lines and polygons only - part_node_count = reference_attr_variables( + part_node_count, part_node_count_errors = reference_attr_variables( ds, getattr(geometry_var, "part_node_count", None), ) # polygons with interior geometry only - interior_ring = reference_attr_variables( + interior_ring, interior_ring_errors = reference_attr_variables( ds, getattr(geometry_var, "interior_ring", None), ) diff --git a/compliance_checker/cf/cf_1_9.py b/compliance_checker/cf/cf_1_9.py index 3fcf5dfe..2b09531e 100644 --- a/compliance_checker/cf/cf_1_9.py +++ b/compliance_checker/cf/cf_1_9.py @@ -112,25 +112,46 @@ def check_domain_variables(self, ds: Dataset): var for var in ds.get_variables_by_attributes( coordinates=lambda c: c is not None, - ) - # IMPLICIT CONFORMANCE REQUIRED 1/4 + )): # all variables have dimensions attribute, but should be scalar - if var.dimensions == () - ): + if not ds.dimensions == (): + continue + + # IMPLICIT CONFORMANCE REQUIRED 1/4 + # Has a dimensions *NetCDF* attribute + try: + dim_nc_attr = var.getncattr(dimensions) + # most variables are unlikely to be domain variables, so don't treat this + # as a failure + except AttributeError: + continue + # IMPLICIT CONFORMANCE REQUIRED 2/4 + # Aforementioned dimensions attribute is comprised of space separated + # dimension names which must exist in the file domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"]) - domain_valid.out_of += 1 - domain_coord_vars = reference_attr_variables( + domain_valid.out_of += 2 + domain_dims, dim_errors = reference_attr_variables( ds, - domain_var.coordinates, + domain_var.getncattr("dimensions"), " ", + "dimensions" + ) + if dim_errors: + errors_str = ", ".join(dim_errors) + domain_valid.messages.append( + "Could not find the following " + "dimensions referenced in " + "dimensions attribute from " + "domain variable " + f"{domain_var.name}: {errors_str}", + ) + else: + domain_valid.score += 1 + domain_coord_vars, domain_coord_var_errors = reference_attr_variables( + ds, domain_var.coordinates, " " ) - errors = [ - maybe_error.name - for maybe_error in domain_coord_vars - if isinstance(maybe_error, VariableReferenceError) - ] - if errors: - errors_str = ", ".join(errors) + if domain_coord_var_errors: + errors_str = ", ".join(var_errors) domain_valid.messages.append( "Could not find the following " "variables referenced in " @@ -138,35 +159,75 @@ def check_domain_variables(self, ds: Dataset): "domain variable " f"{domain_var.name}: {errors_str}", ) + else: + domain_valid.score += 1 + coord_var_dim_failures = [] + is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", "")) + if is_ragged_array_repr: + for var in domain_coord_vars: + domain_valid.out_of += 1 + ragged_array_dim_variable, ragged_attr_name = resolve_ragged_array_dimension(ds) + dim_name = getattr(ragged_array_dim_variable, ragged_attr_name) + referenced_dim = reference_attr_variables(ds, dim_name, reference_type="dimension") + if isinstance(referenced_dim, VariableReferenceError): + domain_valid.messages.append( + f"Found ragged array variable {ragged_array_dim_variable.name}, " + f"but dimension {dim_name} referenced from {attr_name} does not exist in file" + ) + + coord_var_reference_failures = [] + for coord_var in reference_attr_variables(ds, dim_name, " "): + if isinstance(coord_var, VariableReferenceError): + coord_var_reference_failures.append(coord_var) + domain_valid.messages.append( + f"Referenced coordinate variable {coord_var} does not exist in file") + continue + # TODO: check for label variables + if not util.get_possible_label_variable_dimensions(coord_var).issubset({referenced_dim}): + domain_valid.messages.append( + f"Found ragged array variable {ragged_array_dim_variable.name}, " + f"but dimension {dim_name} referenced from {attr_name} does not exist in file" + ) + else: + domain_valid.scored += 1 + pass else: - long_name = getattr(domain_var, "long_name", None) - if long_name is None or not isinstance(long_name, str): - domain_valid.messages.append( - f"For domain variable {domain_var.name} " - f"it is recommended that attribute long_name be present and a string", - ) - results.append(domain_valid.to_result()) - continue - appendix_a_not_recommended_attrs = [] - for attr_name in domain_var.ncattrs(): - if ( - attr_name in self.appendix_a - and "D" not in self.appendix_a[attr_name]["attr_loc"] - ): - appendix_a_not_recommended_attrs.append(attr_name) - - if appendix_a_not_recommended_attrs: - domain_valid.messages.append( - f"The following attributes appear in variable {domain_var.name} " - "and CF Appendix A, but are not for use in domain variables: " - f"{appendix_a_not_recommended_attrs}", - ) + for var in domain_coord_vars: + domain_valid.out_of += 1 + if not util.get_possible_label_variable_dimensions(coord_var).issubset(domain_dims): + domain_valid.messages.append("Domain dimension failure") + else: + domain_valid.scored += 1 + + # not in conformance docs, but mentioned as recommended anyways + long_name = getattr(domain_var, "long_name", None) + if long_name is None or not isinstance(long_name, str): + domain_valid.messages.append( + f"For domain variable {domain_var.name} " + f"it is recommended that attribute long_name be present and a string", + ) + results.append(domain_valid.to_result()) + continue + appendix_a_not_recommended_attrs = [] + for attr_name in domain_var.ncattrs(): + if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]: + appendix_a_not_recommended_attrs.append(attr_name) + + if appendix_a_not_recommended_attrs: + domain_valid.messages.append( + f"The following attributes appear in variable {domain_var.name} " + "and CF Appendix A, but are not for use in domain variables: " + f"{appendix_a_not_recommended_attrs}", + ) + + # no errors occurred + domain_valid.score += 1 - # no errors occurred - domain_valid.score += 1 # IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4 + # variables named by domain variable's cell_measures attributes must themselves be a subset + # of dimensions named by domain variable's dimensions NetCDF attribute if hasattr(domain_var, "cell_measures"): cell_measures_var_names = regex.findall( r"\b(?:area|volume):\s+(\w+)", diff --git a/compliance_checker/cf/util.py b/compliance_checker/cf/util.py index 67cba708..227ce851 100644 --- a/compliance_checker/cf/util.py +++ b/compliance_checker/cf/util.py @@ -405,25 +405,40 @@ def string_from_var_type(variable): ) +def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...]: + """ + Return dimensions if non-char variable, or return variable dimensions + without trailing dimension if char variable, treating it as a label variable. + """ + if variable.kind == "C" and len(variable.dimensions) > 0: + return variable.dimensions[:-1] + return variable.dimensions + def reference_attr_variables( dataset: Dataset, attributes_string: str, split_by: str = None, + reference_type: str = "variable", + group: Union[Group, Dataset] = None ): """ Attempts to reference variables in the string, optionally splitting by a string """ + references, errors = [], [] if attributes_string is None: return None - elif split_by is None: - return dataset.variables.get( - attributes_string, - VariableReferenceError(attributes_string), - ) - else: - string_proc = attributes_string.split(split_by) - return [ - dataset.variables.get(var_name, VariableReferenceError(var_name)) - for var_name in string_proc - ] + elif reference_type == "variable": + if split_by is None: + return dataset.variables.get( + attributes_string, + VariableReferenceError(attributes_string), + ) + else: + string_proc = attributes_string.split(split_by) + for var_name in string_proc: + if var_name in dataset.variables: + references.append(dataset.variables[var_name]) + else: + errors.append(VariableReferenceError(var_name)) + return references, errors diff --git a/compliance_checker/cfutil.py b/compliance_checker/cfutil.py index 202970c2..e422bfdc 100644 --- a/compliance_checker/cfutil.py +++ b/compliance_checker/cfutil.py @@ -9,6 +9,7 @@ from functools import lru_cache, partial from importlib.resources import files +from netCDF4 import Dataset from cf_units import Unit _UNITLESS_DB = None @@ -241,6 +242,7 @@ def is_geophysical(nc, variable): return True +@lru_cache() def get_coordinate_variables(nc): """ Returns a list of variable names that identify as coordinate variables. @@ -260,6 +262,7 @@ def get_coordinate_variables(nc): coord_vars = [] for dimension in nc.dimensions: if dimension in nc.variables: + # TODO: Handle string coordinate variables if nc.variables[dimension].dimensions == (dimension,): coord_vars.append(dimension) return coord_vars @@ -491,11 +494,6 @@ def get_latitude_variables(nc): for variable in nc.get_variables_by_attributes(standard_name="latitude"): latitude_variables.append(variable.name) - # Then axis - for variable in nc.get_variables_by_attributes(axis="Y"): - if variable.name not in latitude_variables: - latitude_variables.append(variable.name) - check_fn = partial( attr_membership, value_set=VALID_LAT_UNITS, @@ -557,11 +555,6 @@ def get_longitude_variables(nc): for variable in nc.get_variables_by_attributes(standard_name="longitude"): longitude_variables.append(variable.name) - # Then axis - for variable in nc.get_variables_by_attributes(axis="X"): - if variable.name not in longitude_variables: - longitude_variables.append(variable.name) - check_fn = partial( attr_membership, value_set=VALID_LON_UNITS, @@ -905,29 +898,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str): array structure. See inline comments. """ + # regardless of if compound type or not, must have a cf_role + # variable; if compound, this will be the first part of the + # feature_type as we'll have to search for one with profile_id + # regardless; if single feature type, cf_role must match that + # featureType + cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None) is_compound = False if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}: is_compound = True ftype = feature_type.lower().split("profile")[0] + if len(cf_role_vars) > 2: + return False else: ftype = feature_type.lower() + if len(cf_role_vars) > 1: + return False - # regardless of if compound type or not, must have a cf_role - # variable; if compound, this will be the first part of the - # feature_type as we'll have to search for one with profile_id - # regardless; if single feature type, cf_role must match that - # featureType - cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None) - if ( - not cf_role_vars - or (len(cf_role_vars) > 1 and not is_compound) - or (len(cf_role_vars) > 2 and is_compound) - ): - return False cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0] - if ( - cf_role_var.cf_role.split("_id")[0].lower() != ftype - ): # if cf_role_var returns None, this should raise an error? + # if cf_role_var returns None, this should raise an error? + if cf_role_var.cf_role.split("_id")[0].lower() != ftype: return False # now we'll check dimensions for singular feature types and/or @@ -936,7 +926,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str): if len(instance_dim) != 1: return False - # Wow we check for the presence of an index variable or count variable; + # Now we check for the presence of an index variable or count variable; # NOTE that if no index or count variables exist, we can't determine with # certainty that this is invalid, because single-instance data sets # are valid representations of the ragged array structures. Instead, @@ -1022,6 +1012,17 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str): return True +def resolve_ragged_array_dimension(ds: Dataset): + # TODO: put in loop? + ragged_variable = ds.get_variables_by_attributes(sample_dimension=lambda s: isinstance(s, str)) + if ragged_variable: + ragged_type = 'sample_dimension' + else: + ragged_variable = ds.get_variables_by_attributes(instance_dimension=lambda s: isinstance(s, str)) + ragged_type = "instance_dimension" + if ragged_variable is None: + raise ValueError("Could not find a ragged array related variable") + def is_variable_valid_ragged_array_repr_featureType(nc, variable: str) -> bool: """ diff --git a/compliance_checker/tests/test_cf.py b/compliance_checker/tests/test_cf.py index 65088f84..e5d2a8b8 100644 --- a/compliance_checker/tests/test_cf.py +++ b/compliance_checker/tests/test_cf.py @@ -948,7 +948,6 @@ def test_latitude(self): dataset = self.load_dataset(STATIC_FILES["bad"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 12 assert scored < out_of assert len([r for r in results if r.value[0] < r.value[1]]) == 3 assert (r.name == "§4.1 Latitude Coordinate" for r in results) @@ -957,7 +956,7 @@ def test_latitude(self): dataset = self.load_dataset(STATIC_FILES["rotated_pole_grid"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 6 + assert len(results) == 3 assert scored == out_of assert (r.name == "§4.1 Latitude Coordinate" for r in results) @@ -990,7 +989,6 @@ def test_longitude(self): dataset = self.load_dataset(STATIC_FILES["bad"]) results = self.cf.check_longitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 12 assert scored < out_of assert len([r for r in results if r.value[0] < r.value[1]]) == 3 assert all(r.name == "§4.2 Longitude Coordinate" for r in results) @@ -999,7 +997,7 @@ def test_longitude(self): dataset = self.load_dataset(STATIC_FILES["rotated_pole_grid"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert (scored, out_of) == (6, 6) + assert (scored, out_of) == (3, 3) # hack to avoid writing to read-only file dataset.variables["rlon"] = MockVariable(dataset.variables["rlon"]) rlon = dataset.variables["rlon"] @@ -3215,6 +3213,7 @@ def test_domain(self): domain_var = dataset.createVariable("domain", "c", ()) domain_var.long_name = "Domain variable" domain_var.coordinates = "lon lat depth" + domain_var.setncattr("dimensions", "time") results = self.cf.check_domain_variables(dataset) self.assertEqual(results[0].value[0], results[0].value[1]) self.assertFalse(results[0].msgs) diff --git a/compliance_checker/tests/test_feature_detection.py b/compliance_checker/tests/test_feature_detection.py index 11f88fdb..8f3d8769 100644 --- a/compliance_checker/tests/test_feature_detection.py +++ b/compliance_checker/tests/test_feature_detection.py @@ -300,7 +300,7 @@ def test_forecast_reference_metadata(self): def test_rotated_pole_grid(self): with Dataset(resources.STATIC_FILES["rotated_pole_grid"]) as nc: latitudes = util.get_latitude_variables(nc) - assert latitudes == ["lat", "rlat"] + assert latitudes == ["lat"] assert util.is_mapped_grid(nc, "temperature") is True def test_vertical_coords(self):