WIP commit for domain ragged array representation

benjwadams committed Dec 6, 2024
1 parent 91aea06 commit dde12c8
Showing 7 changed files with 161 additions and 86 deletions.
5 changes: 2 additions & 3 deletions compliance_checker/cf/cf_1_6.py
@@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds):
# IS THIS EVEN NEEDED ANYMORE?
# ***************
def check_grid_coordinates(self, ds):
# def _check_grid_coordinates(self, ds):
"""
5.6 When the coordinate variables for a horizontal grid are not
longitude and latitude, it is required that the true latitude and
@@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds):
latitudes = cfutil.get_true_latitude_variables(ds)
longitudes = cfutil.get_true_longitude_variables(ds)

check_featues = [
check_features = [
"2d-regular-grid",
"2d-static-grid",
"3d-regular-grid",
@@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds):
# dimensions
dimensions = set(ds.variables[variable].dimensions)
# If it's not a grid, skip it
if cfutil.guess_feature_type(ds, variable) not in check_featues:
if cfutil.guess_feature_type(ds, variable) not in check_features:
continue
has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])

6 changes: 3 additions & 3 deletions compliance_checker/cf/cf_1_8.py
@@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset):
results.append(geom_valid.to_result())
continue

node_count = reference_attr_variables(
node_count, node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "node_count", None),
)
# multipart lines and polygons only
part_node_count = reference_attr_variables(
part_node_count, part_node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "part_node_count", None),
)
# polygons with interior geometry only
interior_ring = reference_attr_variables(
interior_ring, interior_ring_errors = reference_attr_variables(
ds,
getattr(geometry_var, "interior_ring", None),
)
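
For context on what these attribute references resolve to: a CF 1.8 geometry container variable names a node_count variable and, for multipart or holed geometries, part_node_count and interior_ring variables. The sketch below is illustrative only (not part of this commit) and uses made-up names; it builds such a file with netCDF4.

from netCDF4 import Dataset

# Illustrative only: a polygon whose geometry container references
# node_count, part_node_count, and interior_ring variables.
with Dataset("geometry_example.nc", "w") as nc:
    nc.createDimension("instance", 1)
    nc.createDimension("node", 8)
    nc.createDimension("part", 2)
    geom = nc.createVariable("geometry_container", "i1")
    geom.geometry_type = "polygon"
    geom.node_coordinates = "x y"
    geom.node_count = "node_count"            # resolved by reference_attr_variables
    geom.part_node_count = "part_node_count"  # multipart lines and polygons only
    geom.interior_ring = "interior_ring"      # polygons with interior geometry only
    nc.createVariable("node_count", "i4", ("instance",))
    nc.createVariable("part_node_count", "i4", ("part",))
    nc.createVariable("interior_ring", "i1", ("part",))
    nc.createVariable("x", "f8", ("node",))
    nc.createVariable("y", "f8", ("node",))
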
137 changes: 99 additions & 38 deletions compliance_checker/cf/cf_1_9.py
@@ -112,61 +112,122 @@ def check_domain_variables(self, ds: Dataset):
var
for var in ds.get_variables_by_attributes(
coordinates=lambda c: c is not None,
)
# IMPLICIT CONFORMANCE REQUIRED 1/4
)):
# all variables have dimensions attribute, but should be scalar
if var.dimensions == ()
):
if not var.dimensions == ():
continue

# IMPLICIT CONFORMANCE REQUIRED 1/4
# Has a dimensions *NetCDF* attribute
try:
dim_nc_attr = var.getncattr("dimensions")
# most variables are unlikely to be domain variables, so don't treat this
# as a failure
except AttributeError:
continue
# IMPLICIT CONFORMANCE REQUIRED 2/4
# Aforementioned dimensions attribute is comprised of space separated
# dimension names which must exist in the file
domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"])
domain_valid.out_of += 1
domain_coord_vars = reference_attr_variables(
domain_valid.out_of += 2
domain_dims, dim_errors = reference_attr_variables(
ds,
domain_var.coordinates,
domain_var.getncattr("dimensions"),
" ",
"dimensions"
)
if dim_errors:
errors_str = ", ".join(dim_errors)
domain_valid.messages.append(
"Could not find the following "
"dimensions referenced in "
"dimensions attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1
domain_coord_vars, domain_coord_var_errors = reference_attr_variables(
ds, domain_var.coordinates, " "
)
errors = [
maybe_error.name
for maybe_error in domain_coord_vars
if isinstance(maybe_error, VariableReferenceError)
]
if errors:
errors_str = ", ".join(errors)
if domain_coord_var_errors:
errors_str = ", ".join(domain_coord_var_errors)
domain_valid.messages.append(
"Could not find the following "
"variables referenced in "
"coordinates attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1

coord_var_dim_failures = []
is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", ""))
if is_ragged_array_repr:
for var in domain_coord_vars:
domain_valid.out_of += 1
ragged_array_dim_variable, ragged_attr_name = resolve_ragged_array_dimension(ds)
dim_name = getattr(ragged_array_dim_variable, ragged_attr_name)
referenced_dim = reference_attr_variables(ds, dim_name, reference_type="dimension")
if isinstance(referenced_dim, VariableReferenceError):
domain_valid.messages.append(
f"Found ragged array variable {ragged_array_dim_variable.name}, "
f"but dimension {dim_name} referenced from {attr_name} does not exist in file"
)

coord_var_reference_failures = []
for coord_var in reference_attr_variables(ds, dim_name, " "):
if isinstance(coord_var, VariableReferenceError):
coord_var_reference_failures.append(coord_var)
domain_valid.messages.append(
f"Referenced coordinate variable {coord_var} does not exist in file")
continue
# TODO: check for label variables
if not util.get_possible_label_variable_dimensions(coord_var).issubset({referenced_dim}):
domain_valid.messages.append(
f"Found ragged array variable {ragged_array_dim_variable.name}, "
f"but dimension {dim_name} referenced from {attr_name} does not exist in file"
)
else:
domain_valid.score += 1
pass
else:
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if (
attr_name in self.appendix_a
and "D" not in self.appendix_a[attr_name]["attr_loc"]
):
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)
for coord_var in domain_coord_vars:
domain_valid.out_of += 1
if not util.get_possible_label_variable_dimensions(coord_var).issubset(domain_dims):
domain_valid.messages.append("Domain dimension failure")
else:
domain_valid.score += 1

# not in conformance docs, but mentioned as recommended anyways
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)

# no errors occurred
domain_valid.score += 1

# no errors occurred
domain_valid.score += 1

# IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4
# variables named by domain variable's cell_measures attributes must themselves be a subset
# of dimensions named by domain variable's dimensions NetCDF attribute
if hasattr(domain_var, "cell_measures"):
cell_measures_var_names = regex.findall(
r"\b(?:area|volume):\s+(\w+)",
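
A note for readers unfamiliar with CF 1.9 domain variables, which this check targets: a domain variable is a scalar container whose dimensions netCDF attribute lists dimensions present in the file and whose coordinates attribute lists variables dimensioned by (a subset of) those dimensions; long_name is recommended. The sketch below is an illustration with made-up names, not code from this commit.

from netCDF4 import Dataset

# Illustrative only: a minimal file containing a scalar domain variable.
with Dataset("domain_example.nc", "w") as nc:
    nc.createDimension("lat", 180)
    nc.createDimension("lon", 360)
    nc.createVariable("lat", "f8", ("lat",))
    nc.createVariable("lon", "f8", ("lon",))
    domain = nc.createVariable("domain", "i1")  # scalar: no dimensions of its own
    # setncattr/getncattr avoid colliding with the Python-level "dimensions"
    # property that netCDF4 puts on Variable objects.
    domain.setncattr("dimensions", "lat lon")
    domain.coordinates = "lat lon"
    domain.long_name = "horizontal domain"  # recommended, must be a string
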
37 changes: 26 additions & 11 deletions compliance_checker/cf/util.py
@@ -405,25 +405,40 @@ def string_from_var_type(variable):
)


def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[str, ...]:
"""
Return dimensions if non-char variable, or return variable dimensions
without trailing dimension if char variable, treating it as a label variable.
"""
if variable.kind == "C" and len(variable.dimensions) > 0:
return variable.dimensions[:-1]
return variable.dimensions

def reference_attr_variables(
dataset: Dataset,
attributes_string: str,
split_by: str = None,
reference_type: str = "variable",
group: Union[Group, Dataset] = None
):
"""
Attempts to reference variables in the string, optionally splitting by
a string
"""
references, errors = [], []
if attributes_string is None:
return None
elif split_by is None:
return dataset.variables.get(
attributes_string,
VariableReferenceError(attributes_string),
)
else:
string_proc = attributes_string.split(split_by)
return [
dataset.variables.get(var_name, VariableReferenceError(var_name))
for var_name in string_proc
]
elif reference_type == "variable":
if split_by is None:
return dataset.variables.get(
attributes_string,
VariableReferenceError(attributes_string),
)
else:
string_proc = attributes_string.split(split_by)
for var_name in string_proc:
if var_name in dataset.variables:
references.append(dataset.variables[var_name])
else:
errors.append(VariableReferenceError(var_name))
return references, errors
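
A quick illustration of the reworked return contract (not part of this commit; ds, x, y, and z are placeholder names): with a separator, callers now receive resolvable variables and unresolved references as two separate lists instead of one mixed list, while the no-separator path shown above still returns a single value.

# Hypothetical dataset containing variables "x" and "y" but not "z".
found, missing = reference_attr_variables(ds, "x y z", " ")
# found   -> [<Variable "x">, <Variable "y">]
# missing -> [VariableReferenceError("z")]

# No separator: the result is the variable itself or a VariableReferenceError,
# not a (references, errors) pair.
node_count = reference_attr_variables(ds, "node_count")
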
53 changes: 27 additions & 26 deletions compliance_checker/cfutil.py
@@ -9,6 +9,7 @@
from functools import lru_cache, partial
from importlib.resources import files

from netCDF4 import Dataset
from cf_units import Unit

_UNITLESS_DB = None
@@ -241,6 +242,7 @@ def is_geophysical(nc, variable):
return True


@lru_cache()
def get_coordinate_variables(nc):
"""
Returns a list of variable names that identify as coordinate variables.
Expand All @@ -260,6 +262,7 @@ def get_coordinate_variables(nc):
coord_vars = []
for dimension in nc.dimensions:
if dimension in nc.variables:
# TODO: Handle string coordinate variables
if nc.variables[dimension].dimensions == (dimension,):
coord_vars.append(dimension)
return coord_vars
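
Aside on the decorator added above (not from this commit): functools.lru_cache memoizes on the hashable arguments of a call, so repeated checks against the same open Dataset object reuse the first result instead of rescanning the file. A generic sketch with a made-up function name:

from functools import lru_cache

@lru_cache()
def expensive_lookup(nc):
    # Computed once per distinct nc object; later calls with the same object
    # return the cached list.
    return [dim for dim in nc.dimensions if dim in nc.variables]
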
@@ -491,11 +494,6 @@ def get_latitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="latitude"):
latitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="Y"):
if variable.name not in latitude_variables:
latitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LAT_UNITS,
@@ -557,11 +555,6 @@ def get_longitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="longitude"):
longitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="X"):
if variable.name not in longitude_variables:
longitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LON_UNITS,
@@ -905,29 +898,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
array structure. See inline comments.
"""

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
is_compound = False
if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}:
is_compound = True
ftype = feature_type.lower().split("profile")[0]
if len(cf_role_vars) > 2:
return False
else:
ftype = feature_type.lower()
if len(cf_role_vars) > 1:
return False

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
if (
not cf_role_vars
or (len(cf_role_vars) > 1 and not is_compound)
or (len(cf_role_vars) > 2 and is_compound)
):
return False
cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0]
if (
cf_role_var.cf_role.split("_id")[0].lower() != ftype
): # if cf_role_var returns None, this should raise an error?
# if cf_role_var returns None, this should raise an error?
if cf_role_var.cf_role.split("_id")[0].lower() != ftype:
return False

# now we'll check dimensions for singular feature types and/or
Expand All @@ -936,7 +926,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
if len(instance_dim) != 1:
return False

# Now we check for the presence of an index variable or count variable;
# Now we check for the presence of an index variable or count variable;
# NOTE that if no index or count variables exist, we can't determine with
# certainty that this is invalid, because single-instance data sets
# are valid representations of the ragged array structures. Instead,
@@ -1022,6 +1012,17 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):

return True

def resolve_ragged_array_dimension(ds: Dataset):
# TODO: put in loop?
ragged_variables = ds.get_variables_by_attributes(sample_dimension=lambda s: isinstance(s, str))
if ragged_variables:
ragged_type = "sample_dimension"
else:
ragged_variables = ds.get_variables_by_attributes(instance_dimension=lambda s: isinstance(s, str))
ragged_type = "instance_dimension"
if not ragged_variables:
raise ValueError("Could not find a ragged array related variable")
# return the variable and the attribute name so callers can unpack both
return ragged_variables[0], ragged_type


def is_variable_valid_ragged_array_repr_featureType(nc, variable: str) -> bool:
"""
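
Background for the new resolve_ragged_array_dimension helper (illustrative, not from this commit): in CF ragged array representations, a count variable carries a sample_dimension attribute (contiguous form) or an index variable carries an instance_dimension attribute (indexed form), and that attribute names the dimension the helper resolves. A made-up contiguous timeSeries example:

from netCDF4 import Dataset

# Illustrative only: contiguous ragged array for a timeSeries featureType.
with Dataset("ragged_example.nc", "w") as nc:
    nc.featureType = "timeSeries"
    nc.createDimension("station", 3)
    nc.createDimension("obs", 100)
    station = nc.createVariable("station_id", "i4", ("station",))
    station.cf_role = "timeseries_id"
    row_size = nc.createVariable("row_size", "i4", ("station",))
    row_size.sample_dimension = "obs"  # what resolve_ragged_array_dimension looks up
    nc.createVariable("temperature", "f8", ("obs",))
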

0 comments on commit dde12c8
