Skip to content

Commit

Permalink
Refactor BaseBuilder.parse_access_ncfile (#222)
Browse files Browse the repository at this point in the history
* Updated `parse_access_ncfile` to return an instance of the `_AccessNCFileInfo` dataclass rather than a tuple.
* Added a _VarInfo dataclass to handle populating attributes of `_AccessNCFileInfo`.
* Updated tests to reflect change of return type from tuple to `_AccessNCFileInfo`.
  • Loading branch information
charles-turner-1 authored Oct 17, 2024
1 parent 761e7f4 commit 017cd2d
Show file tree
Hide file tree
Showing 3 changed files with 551 additions and 367 deletions.
193 changes: 54 additions & 139 deletions src/access_nri_intake/source/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
from .utils import EmptyFileError, get_timeinfo
from .utils import (
EmptyFileError,
_AccessNCFileInfo,
_VarInfo,
get_timeinfo,
)

# Frequency translations
FREQUENCIES: dict[str, tuple[int, str]] = {
Expand Down Expand Up @@ -273,7 +278,9 @@ def parse_access_filename(
return file_id, timestamp, frequency

@classmethod
def parse_access_ncfile(cls, fname: str, time_dim: str = "time") -> tuple:
def parse_access_ncfile(
cls, file: str, time_dim: str = "time"
) -> _AccessNCFileInfo:
"""
Get Intake-ESM datastore entry info from an ACCESS netcdf file
Expand All @@ -286,19 +293,18 @@ def parse_access_ncfile(cls, fname: str, time_dim: str = "time") -> tuple:
Returns
-------
tuple
A tuple containing the information parsed from the file
output_nc_info: _AccessNCFileInfo
A dataclass containing the information parsed from the file
Raises
------
EmptyFileError: If the file contains no variables
"""

file = Path(fname)
filename = file.name
file_path = Path(file)

file_id, filename_timestamp, filename_frequency = cls.parse_access_filename(
file.stem
file_path.stem
)

with xr.open_dataset(
Expand All @@ -308,51 +314,31 @@ def parse_access_ncfile(cls, fname: str, time_dim: str = "time") -> tuple:
decode_times=False,
decode_coords=False,
) as ds:
variable_list = []
variable_long_name_list = []
variable_standard_name_list = []
variable_cell_methods_list = []
variable_units_list = []
dvars = _VarInfo()

for var in ds.data_vars:
attrs = ds[var].attrs
if "long_name" in attrs:
variable_list.append(var)
variable_long_name_list.append(attrs["long_name"])
if "standard_name" in attrs:
variable_standard_name_list.append(attrs["standard_name"])
else:
variable_standard_name_list.append("")
if "cell_methods" in attrs:
variable_cell_methods_list.append(attrs["cell_methods"])
else:
variable_cell_methods_list.append("")
if "units" in attrs:
variable_units_list.append(attrs["units"])
else:
variable_units_list.append("")
dvars.append_attrs(var, attrs) # type: ignore

start_date, end_date, frequency = get_timeinfo(
ds, filename_frequency, time_dim
)

if not variable_list:
if not dvars.variable_list:
raise EmptyFileError("This file contains no variables")

outputs = (
filename,
file_id,
filename_timestamp,
frequency,
start_date,
end_date,
variable_list,
variable_long_name_list,
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
output_ncfile = _AccessNCFileInfo(
filename=file_path.name,
path=file,
file_id=file_id,
filename_timestamp=filename_timestamp,
frequency=frequency,
start_date=start_date,
end_date=end_date,
**dvars.to_var_info_dict(),
)

return outputs
return output_ncfile


class AccessOm2Builder(BaseBuilder):
Expand Down Expand Up @@ -406,36 +392,12 @@ def parser(cls, file) -> dict:
if realm == "ice":
realm = "seaIce"

(
filename,
file_id,
_,
frequency,
start_date,
end_date,
variable_list,
variable_long_name_list,
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = cls.parse_access_ncfile(file)

info = {
"path": str(file),
"realm": realm,
"variable": variable_list,
"frequency": frequency,
"start_date": start_date,
"end_date": end_date,
"variable_long_name": variable_long_name_list,
"variable_standard_name": variable_standard_name_list,
"variable_cell_methods": variable_cell_methods_list,
"variable_units": variable_units_list,
"filename": filename,
"file_id": file_id,
}

return info
nc_info = cls.parse_access_ncfile(file)
ncinfo_dict = nc_info.to_dict()

ncinfo_dict["realm"] = realm

return ncinfo_dict

except Exception:
return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}
Expand Down Expand Up @@ -486,47 +448,22 @@ def __init__(self, path):
super().__init__(**kwargs)

@classmethod
def parser(cls, file):
def parser(cls, file) -> dict:
try:
(
filename,
file_id,
_,
frequency,
start_date,
end_date,
variable_list,
variable_long_name_list,
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = cls.parse_access_ncfile(file)

if "mom6" in filename:
output_nc_info = cls.parse_access_ncfile(file)
ncinfo_dict = output_nc_info.to_dict()

if "mom6" in ncinfo_dict["filename"]:
realm = "ocean"
elif "ww3" in filename:
elif "ww3" in ncinfo_dict["filename"]:
realm = "wave"
elif "cice" in filename:
elif "cice" in ncinfo_dict["filename"]:
realm = "seaIce"
else:
raise ParserError(f"Cannot determine realm for file {file}")
ncinfo_dict["realm"] = realm

info = {
"path": str(file),
"realm": realm,
"variable": variable_list,
"frequency": frequency,
"start_date": start_date,
"end_date": end_date,
"variable_long_name": variable_long_name_list,
"variable_standard_name": variable_standard_name_list,
"variable_cell_methods": variable_cell_methods_list,
"variable_units": variable_units_list,
"filename": filename,
"file_id": file_id,
}

return info
return ncinfo_dict

except Exception:
return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}
Expand Down Expand Up @@ -590,42 +527,20 @@ def parser(cls, file):
realm = match_groups[1]

realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"}
realm = realm_mapping[realm]

(
filename,
file_id,
_,
frequency,
start_date,
end_date,
variable_list,
variable_long_name_list,
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = cls.parse_access_ncfile(file)

nc_info = cls.parse_access_ncfile(file)
ncinfo_dict = nc_info.to_dict()

# Remove exp_id from file id so that members can be part of the same dataset
file_id = re.sub(exp_id, "", file_id).strip("_")

info = {
"path": str(file),
"realm": realm,
"variable": variable_list,
"frequency": frequency,
"start_date": start_date,
"end_date": end_date,
"member": exp_id,
"variable_long_name": variable_long_name_list,
"variable_standard_name": variable_standard_name_list,
"variable_cell_methods": variable_cell_methods_list,
"variable_units": variable_units_list,
"filename": filename,
"file_id": file_id,
}

return info
ncinfo_dict["file_id"] = re.sub(
exp_id,
"",
ncinfo_dict["file_id"],
).strip("_")
ncinfo_dict["realm"] = realm_mapping[realm]
ncinfo_dict["member"] = exp_id

return ncinfo_dict

except Exception:
return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}
Expand Down
76 changes: 76 additions & 0 deletions src/access_nri_intake/source/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
"""Shared utilities for writing Intake-ESM builders and their parsers"""

import warnings
from dataclasses import asdict, dataclass, field
from datetime import timedelta
from pathlib import Path
from typing import Union

import cftime
Expand All @@ -15,6 +17,80 @@ class EmptyFileError(Exception):
pass


@dataclass
class _AccessNCFileInfo:
"""
Holds information about a NetCDF file that is used to create an intake-esm
catalog entry.
Notes
-----
Use of both path and filename seems redundant, but constructing filename from
the path using a __post_init__ method makes testing more difficult. On balance,
more explicit tests are probably more important than the slight redundancy.
"""

filename: Union[str, Path]
file_id: str
path: str
filename_timestamp: Union[str, None]
frequency: str
start_date: str
end_date: str
variable: list[str]
variable_long_name: list[str]
variable_standard_name: list[str]
variable_cell_methods: list[str]
variable_units: list[str]

def to_dict(self) -> dict[str, Union[str, list[str]]]:
"""
Return a dictionary representation of the NcFileInfo object
"""
return asdict(self)


@dataclass
class _VarInfo:
"""
Holds information about the variables in a NetCDF file that is used to
create an intake-esm catalog entry.
"""

variable_list: list[str] = field(default_factory=list)
long_name_list: list[str] = field(default_factory=list)
standard_name_list: list[str] = field(default_factory=list)
cell_methods_list: list[str] = field(default_factory=list)
units_list: list[str] = field(default_factory=list)

def append_attrs(self, var: str, attrs: dict) -> None:
"""
Append attributes to the _VarInfo object, if the attribute has a
'long_name' key.
"""
if "long_name" not in attrs:
return None

self.variable_list.append(var)
self.long_name_list.append(attrs["long_name"])
self.standard_name_list.append(attrs.get("standard_name", ""))
self.cell_methods_list.append(attrs.get("cell_methods", ""))
self.units_list.append(attrs.get("units", ""))

def to_var_info_dict(self) -> dict[str, list[str]]:
"""
Return a dictionary representation of the _VarInfo object. Fields are
defined explicitly for use in the _AccessNCFileInfo constructor.
"""
return {
"variable": self.variable_list,
"variable_long_name": self.long_name_list,
"variable_standard_name": self.standard_name_list,
"variable_cell_methods": self.cell_methods_list,
"variable_units": self.units_list,
}


def _add_month_start(time, n: int):
"""Add months to cftime datetime and truncate to start"""
year = time.year + ((time.month + n - 1) // 12)
Expand Down
Loading

0 comments on commit 017cd2d

Please sign in to comment.