Skip to content

Commit

Permalink
Changed behaviour of source._open_dataset to: (#681)
Browse files Browse the repository at this point in the history
* Changed behaviour of `source._open_dataset` to:
 - Search for data & coordinate variables from just data variables.
 - Don't check to remove unnecessary coordinates & variables from dataset as
this automatically removes all requested coordinate variables.
 - If no data variables are found, load the first dataset returned: this
avoids concatenation issues resulting from trying to concatenate along
nonexistent dimensions.

Added a 'test_request_coord_vars' test to test/test_source to
ensure the following behaviour:
- Only data variables requested & coordinates they depend on are
returned if only data variables are requested (no change from
previous behaviour).
- Entire dataset (all data & coordinate variables) are returned if
no variables are requested (no change from previous behaviour).
- Only requested coordinate variables are returned if only
coordinate variables are requested (updated behaviour).
- Data variables requested, coordinates they depend on, and
additional requested coordinate variables are returned if both data
and coordinate variables are requested (updated behaviour).

* restored
```
datasets = [
    ds.set_coords(set(ds.variables) - set(ds.attrs[OPTIONS['vars_key']]))
    for ds in datasets
]
```
to _open_dataset - ought to fix failing test. Removal was based on wrong
assumption that this check always removes specified coordinate
variables, which is only true if they are not passed in the __init__ of
the ESMDataSource class.

* Changed behaviour to catch 'no valid dimension coordinates' error and return first dataset if so - necessary for indexing static files

* Changed behaviour to catch 'no valid dimension coordinates' error and return first dataset if so - necessary for indexing static files
Appears to be potential intermittent issue with Read the Docs build -
unable to reproduce reliably locally.
  • Loading branch information
charles-turner-1 authored Oct 28, 2024
1 parent 991fe32 commit 090cb85
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 3 deletions.
34 changes: 31 additions & 3 deletions intake_esm/source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import typing
import warnings

import dask
import fsspec
Expand All @@ -11,6 +12,10 @@
from .utils import OPTIONS


class ConcatenationWarning(UserWarning):
    """Warning raised when datasets cannot be ordered for concatenation.

    Emitted when `xarray.combine_by_coords` finds no valid dimension
    coordinates and only the first dataset is retained.
    """


class ESMDataSourceError(Exception):
    """Base exception raised by :class:`ESMDataSource` when loading datasets fails."""

Expand Down Expand Up @@ -84,9 +89,16 @@ def _open_dataset(
if requested_variables:
if isinstance(requested_variables, str):
requested_variables = [requested_variables]

variable_intersection = set(requested_variables).intersection(set(varname))
variables = [variable for variable in variable_intersection if variable in ds.data_vars]

data_vars = variable_intersection & set(ds.data_vars)
coord_vars = variable_intersection & set(ds.coords)

variables = list(data_vars | coord_vars)

scalar_variables = [v for v in ds.data_vars if len(ds[v].dims) == 0]

ds = ds.set_coords(scalar_variables)
ds = ds[variables]
ds.attrs[OPTIONS['vars_key']] = variables
Expand Down Expand Up @@ -242,7 +254,7 @@ def _open_dataset(self):
]

datasets = dask.compute(*datasets)
if len(datasets) == 1:
if len(datasets) == 1 or not datasets[0].data_vars:
self._ds = datasets[0]
else:
datasets = sorted(
Expand All @@ -256,7 +268,23 @@ def _open_dataset(self):
ds.set_coords(set(ds.variables) - set(ds.attrs[OPTIONS['vars_key']]))
for ds in datasets
]
self._ds = xr.combine_by_coords(datasets, **self.xarray_combine_by_coords_kwargs)
try:
self._ds = xr.combine_by_coords(
datasets, **self.xarray_combine_by_coords_kwargs
)
except ValueError as exc:
if (
str(exc)
== 'Could not find any dimension coordinates to use to order the datasets for concatenation'
):
warnings.warn(
'Attempting to concatenate datasets without valid dimension coordinates: retaining only first dataset.'
' Request valid dimension coordinate to silence this warning.',
category=ConcatenationWarning,
)
self._ds = datasets[0]
else:
raise exc

self._ds.attrs[OPTIONS['dataset_key']] = self.key

Expand Down
59 changes: 59 additions & 0 deletions tests/test_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,62 @@ def test_update_attrs(tmp_path, data_format, attrs):
_xarray_open_kwargs = _get_xarray_open_kwargs(data_format=data_format)
ds_new = _open_dataset(fpath, 'tasmax', xarray_open_kwargs=_xarray_open_kwargs).compute()
assert ds_new.attrs == ds.attrs


@pytest.mark.parametrize(
    'fpath,dvars,cvars,expected',
    [
        (
            f1,
            ['time_bnds'],
            [''],
            ['time_bnds', 'height', 'time'],
        ),
        (f1, ['tasmax'], [''], ['tasmax', 'height', 'time', 'lat', 'lon']),
        (
            f1,
            [],
            ['height'],
            ['height'],
        ),
        (
            f1,
            [],
            [],
            ['height', 'time_bnds', 'lon_bnds', 'lat_bnds', 'tasmax', 'time', 'lat', 'lon'],
        ),
        (multi_path, ['time_bnds'], [''], ['time_bnds', 'height', 'time']),
        (
            multi_path,
            ['tasmax'],
            [''],
            ['tasmax', 'time', 'height', 'lat', 'lon'],
        ),
        (multi_path, [], ['height'], ['height']),
        (
            multi_path,
            [],
            [],
            ['time_bnds', 'lon_bnds', 'lat_bnds', 'tasmax', 'time', 'height', 'lat', 'lon'],
        ),
    ],
)
def test_request_coord_vars(fpath, dvars, cvars, expected):
    """
    Test requesting a combination of data & coordinate variables.
    """
    # Open the file(s) with the full variable list, but request only the
    # parametrized subset of data/coordinate variables.
    ds = _open_dataset(
        urlpath=fpath,
        varname=['height', 'lat', 'lat_bnds', 'lon', 'lon_bnds', 'tasmax', 'time', 'time_bnds'],
        xarray_open_kwargs=_get_xarray_open_kwargs('netcdf'),
        requested_variables=dvars + cvars,
    ).compute()

    # Collect every variable present in the result, whether it ended up as a
    # data variable or a coordinate; empty mappings fall back to an empty set.
    present = set(ds.data_vars or set()) | set(ds.coords or set())

    assert present == set(expected)

0 comments on commit 090cb85

Please sign in to comment.