diff --git a/src/access_nri_intake/data/metadata_schema_experiment.json b/src/access_nri_intake/data/metadata_schema_experiment.json index dd12b67..3e54b52 100644 --- a/src/access_nri_intake/data/metadata_schema_experiment.json +++ b/src/access_nri_intake/data/metadata_schema_experiment.json @@ -75,6 +75,9 @@ { "pattern": "^subhr$" }, + { + "pattern": "^subhrPt$" + }, { "pattern": "^\\d+hr$" }, diff --git a/src/access_nri_intake/data/metadata_schema_file.json b/src/access_nri_intake/data/metadata_schema_file.json index 5e02b4b..86ccb53 100644 --- a/src/access_nri_intake/data/metadata_schema_file.json +++ b/src/access_nri_intake/data/metadata_schema_file.json @@ -74,6 +74,9 @@ { "pattern": "^subhr$" }, + { + "pattern": "^subhrPt$" + }, { "pattern": "^\\d+hr$" }, diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 4d61466..9d9dfcb 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -246,7 +246,7 @@ def parse_filename( patterns: list[str] | None = None, frequencies: dict = FREQUENCIES, redaction_fill: str = "X", - ) -> tuple[str, str | None, str | None, dict | None]: + ) -> tuple[str, str | None, str | None, dict]: """ Parse an ACCESS model filename and return a file id and any time information @@ -290,7 +290,7 @@ def parse_filename( # Parse file id file_id = filename timestamp = None - exargs = None + exargs = {} for pattern in patterns: match = re.match(pattern, file_id) if match: @@ -764,7 +764,7 @@ def parser(cls, fpath, to_select=None): # TODO work out if more appropriate to override parse_ncfile # FIXME self --> cls @classmethod - def parse_ncfile(self, fpath, exargs={}): + def parse_ncfile(cls, file: str, time_dim: str = "time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file CMOR has its own base date format, length depends on frequency @@ -784,12 +784,19 @@ def parse_ncfile(self, fpath, exargs={}): Stores extra arguments as frequency, date_range, variable etc, derived from fpattern """ + + file_path = Path(file) + + file_id, filename_timestamp, filename_frequency, exargs = cls.parse_filename( + cls._get_relevant_filepath(file_path) + ) + time_format = "%Y-%m-%d, %H:%M:%S" # get format for dates based on dates lenght # dformat is the longest possible datetime format for cmor dformat = "%Y%m%d%H%M%S" - date_range = exargs.get("date_range", "") - if date_range == "": + date_range = exargs.get("date_range", None) + if date_range is None: start_date = "none" end_date = "none" else: @@ -800,9 +807,11 @@ def parse_ncfile(self, fpath, exargs={}): te = datetime.strptime(te, cmor_format) end_date = te.strftime(time_format) - variable = exargs.get("variable", "") + variable = exargs.get("variable", None) + if variable is None: + raise RuntimeError(f"Unable to parse variable name from {file}") with xr.open_dataset( - fpath, + file, chunks={}, decode_cf=False, decode_times=False, @@ -813,12 +822,12 @@ def parse_ncfile(self, fpath, exargs={}): variable_standard_name = attrs.get("standard_name", "unknown") variable_cell_methods = attrs.get("cell_methods", "unknown") variable_units = attrs.get("units", "unknown") - tracking_id = ds.attrs.get("tracking_id", "unknown") + # tracking_id = ds.attrs.get("tracking_id", "unknown") output_nc_info = _NCFileInfo( - filename=Path(fpath).name, - path=fpath, - file_id=tracking_id, + filename=Path(file).name, + path=file, + file_id=file_id, filename_timestamp=date_range, frequency=exargs.get("frequency", ""), start_date=start_date, @@ -830,4 +839,4 @@ def parse_ncfile(self, fpath, exargs={}): variable_cell_methods=[variable_cell_methods], ) - return output_nc_info, exargs + return output_nc_info