Skip to content

Commit

Permalink
Merge pull request #68 from CMIP-REF/update-sample-data
Browse files Browse the repository at this point in the history
  • Loading branch information
lewisjared authored Jan 23, 2025
2 parents 37ad1bb + 6ea4833 commit 59a3565
Show file tree
Hide file tree
Showing 17 changed files with 1,040 additions and 217 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,4 @@ fetch-test-data: ## Download any data needed by the test suite

.PHONY: update-test-data-registry
update-test-data-registry: ## Update the test data registry
curl --output packages/ref/src/ref/datasets/sample_data.txt https://raw.githubusercontent.com/CMIP-REF/ref-sample-data/refs/heads/main/registry.txt
curl --output packages/ref/src/cmip_ref/datasets/sample_data.txt https://raw.githubusercontent.com/CMIP-REF/ref-sample-data/refs/heads/main/registry.txt
1 change: 1 addition & 0 deletions changelog/68.trivial.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Update the target version of the sample data to v0.3.0
30 changes: 30 additions & 0 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,36 @@ make mypy
make test
```

### Sample data

We use sample data from [ref-sample-data](https://github.com/CMIP-REF/ref-sample-data)
to provide a consistent set of data for testing.
These data are fetched automatically by the test suite.

As we support more metrics,
we should expand the sample data to include additional datasets to be able to adequately test the REF.
If you wish to use a particular dataset for testing,
please open a pull request to add it to the sample data repository.

The sample data is versioned and periodically we need to update the targeted version in the REF.
Updating the sample data can be done by following these steps:

```bash
# Fetch the latest registry from the sample data repository
make update-test-data-registry

# Manually edit the `SAMPLE_DATA_VERSION` in `packages/ref/src/cmip_ref/testing.py`

# Regenerate any failing regression tests that depend on the sample data catalog
export PYTEST_ADDOPTS="--force-regen"
make test
```

Some other manual tweaks may be required to get the test suite to pass,
but we should try to write tests that don't change when new data becomes available,
or use [pytest-regressions](https://pytest-regressions.readthedocs.io/en/latest/api.html) so that
the expected output files can be regenerated.
## Documentation
Our documentation is written in Markdown and built using
Expand Down
21 changes: 11 additions & 10 deletions packages/ref-core/tests/unit/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
@pytest.fixture
def dataset_collection(cmip6_data_catalog) -> DatasetCollection:
return DatasetCollection(
cmip6_data_catalog,
cmip6_data_catalog[cmip6_data_catalog.variable_id == "tas"],
"instance_id",
)

Expand All @@ -32,18 +32,19 @@ def test_python_hash(self, metric_dataset, cmip6_data_catalog, data_regression):
assert hash(metric_dataset.hash) == dataset_hash
assert isinstance(dataset_hash, int)

assert dataset_hash == hash(
MetricDataset({SourceDatasetType.CMIP6: DatasetCollection(cmip6_data_catalog, "instance_id")})
)
# Check that the hash changes if the dataset changes
assert dataset_hash != hash(
MetricDataset(
{
SourceDatasetType.CMIP6: DatasetCollection(
cmip6_data_catalog[cmip6_data_catalog.variable_id == "tas"], "instance_id"
cmip6_data_catalog[cmip6_data_catalog.variable_id != "tas"], "instance_id"
)
}
)
)

# This will change if the data catalog changes
# Specifically if more tas datasets are provided
data_regression.check(metric_dataset.hash, basename="metric_dataset_hash")


Expand All @@ -57,12 +58,12 @@ def test_get_attr(self, dataset_collection):
assert dataset_collection.instance_id.equals(expected)

def test_hash(self, dataset_collection, cmip6_data_catalog, data_regression):
dataset_hash = hash(dataset_collection)
tas_datasets = cmip6_data_catalog[cmip6_data_catalog.variable_id == "tas"]
dataset_hash = hash(DatasetCollection(tas_datasets, "instance_id"))
assert isinstance(dataset_hash, int)

assert dataset_hash == hash(DatasetCollection(cmip6_data_catalog, "instance_id"))
assert dataset_hash != hash(
DatasetCollection(cmip6_data_catalog[cmip6_data_catalog.variable_id == "tas"], "instance_id")
)
assert dataset_hash != hash(DatasetCollection(tas_datasets.iloc[[0, 1]], "instance_id"))

# This hash will change if the data catalog changes
# Specifically if more tas datasets are provided
data_regression.check(dataset_hash, basename="dataset_collection_hash")
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
2142660390363301609
303970149429875064
...
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
3d5eefc9508ff9c417e9ff314afadf7e568b9588
17842649a474366b8d39d11b9867065508d295f0
...
2 changes: 1 addition & 1 deletion packages/ref-metrics-example/tests/unit/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def metric_dataset(cmip6_data_catalog) -> MetricDataset:
def test_annual_mean(sample_data_dir, metric_dataset):
annual_mean = calculate_annual_mean_timeseries(metric_dataset["cmip6"].path.to_list())

assert annual_mean.time.size == 286
assert annual_mean.time.size == 11


def test_example_metric(tmp_path, metric_dataset, cmip6_data_catalog, mocker):
Expand Down
2 changes: 1 addition & 1 deletion packages/ref-metrics-ilamb/tests/unit/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def metric_dataset(cmip6_data_catalog) -> MetricDataset:

def test_annual_mean(metric_dataset):
annual_mean = calculate_global_mean_timeseries(metric_dataset["cmip6"].path.to_list())
assert annual_mean.time.size == 572
assert annual_mean.time.size == 132


def test_example_metric(tmp_path, cmip6_data_catalog, mocker):
Expand Down
36 changes: 23 additions & 13 deletions packages/ref/src/cmip_ref/datasets/sample_data.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 95341df80de95ddb0b45da11aed67db771414fff94508687fb30fce63b82c104
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 25e9e817a05ffab4a2b073078f6be0e52096fa9da8eb55f009d079842c708614
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 139c4c59d98c737ce2d7ca777e52e35e38d49fcb8b08dd98175ed0f1354f8e75
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc be4a191c75b3643aad34238970c0587128a3852694f2c61425b4bbda42e5ff08
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc d9d07cacc65c196b9ec47d60cabcf86fd397b1e22063a32c3798a98ee3dfb16e
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rlut/gn/v20210318/rlut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 1083d92079e9c40d3797ecc4235df1c86c99af7ca3b9458b21f1d34054351041
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 9f9ae50efc55f4e18dc174d7c3af10f4e67a391c84d81cdb6ba574fa8b61b276
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 2299e10eb6ccf190fe07f7b60aa40b8700f7f964ca68c989f3572abe39eb22c7
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 5141bb64d6f457550d8bf429a4233af1bd706ed8b2131fc2ef329bcb6db7a236
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc 49fbd6c0d7b8c0d10a270e8d88191764c02ba651f80b464605dfa5b0221d622b
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc bff52adef26d48d4b747368816aff3712c606cafa92f6b78f4974f23efcba510
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_210101-230012.nc b61faa48540472be5b208a4ecf40873860c1d4cfb7f50a4dff4ac17ee2ba4f73
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc e8e3b873d9ba115974329c0f7785c9e30dcca66007fa973c22cd734efc46dcfd
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-018012.nc ab365f0663c4e33e36fc24c826e2017ddf9ef3da281205fcc2d3ab9f86085338
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn.nc 3a6c3a6bb56da0ef6b0d89d4e32812f2942b00641f1fe647cf8237efd52afe0b
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 989260da3a40e9938187aa764b325ca6f680817771d2af7114e817e21e54d3d4
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc d814794a1d5e0e1fdea6e480d3a0a4af10cf3cfea7a8c35ed8be2725810c2cf5
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc f5b18ac52742a467bf8c9f7c8000349bc535b9967d672c05f8b01ecb84a3216b
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc d7813e452ebc320d724829a651705c2351ad6e271422573a77ccd1e358b6b6c3
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn.nc d5ccb8cc146d8682b602f7d531be8b3edfdad6d36be284d6ad24528ebecad24a
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc abd885fd088b1b2db74d7f1244427486f8d2c47ae0fa13ba5b7ddb0a3ddc429b
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 53096cef9aacdbefd4e32dabe3705843d21979ba7cc06db9efd30528105effc9
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 0d9373c0824018caa838ce98b3a65d81c411113c60b78194eebc876bd183adaa
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 8aa1d145b1218634b7d5fb2ae1cc96a7218ee2ff05fa57ac96fa2e9d02eb282e
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Omon/tos/gn/v20191115/tos_Omon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 2dd6a3828136dd5ef427d68e6115f8cc9699a2034adeab17da5bd83c59d2c48c
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc b4ed6bfb22c15541f4d66ca57bee4e2e9c06c6c50676db703b6712565a5c7abf
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rlut/gn/v20210316/rlut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 4bcdb5108c884a13299cc855a3a78ff1af1a28c45b00dc9a388d510035328886
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsdt/gn/v20210316/rsdt_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc bc1ba4b5b91dc40f80318b73dedffd75daec25e317167119794366ea7d48779d
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsut/gn/v20210316/rsut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 9c236e425fc94a067888f5ae48377eb75cc29f9b4a697621ea68a6c46c1aa0f4
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-018012.nc cacab3637a4cc94e5cc6d7267ec4c6f96cd5d618b0f8a097005dbfe89b2c5eb9
CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/fx/areacella/gn/v20210316/areacella_fx_ACCESS-ESM1-5_piControl_r1i1p1f1_gn.nc fbdf118bd3677eef2a3a63993cf6492a74c34b2a6650bdb6018711ad66e36594
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 187d291702e4a969792fedca9db832657d96e385ea67747cf6432e7eab1af779
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 725c164d0ff33f6443c58eb24a354715525decf051d88df04ec70ee1a358cc56
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3124671936cb2554af0a1f48b814fa8bb186a0ee2af6bcc86b5cb126b107d7a2
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 10d13b1250f5483e5d6105b0dd811658849324c03f27539b83642062a1151b93
CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc 064b48e5b2971cb4e8edad95b27fbbfc2f6dcdc2de99e2df2944d9c2b0db4910
2 changes: 1 addition & 1 deletion packages/ref/src/cmip_ref/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def _build_sample_data_registry(sample_data_version: str) -> pooch.Pooch:


TEST_DATA_DIR = _determine_test_directory()
SAMPLE_DATA_VERSION = "v0.2.1"
SAMPLE_DATA_VERSION = "v0.3.0"


def fetch_sample_data(version: str = SAMPLE_DATA_VERSION) -> None:
Expand Down
4 changes: 2 additions & 2 deletions packages/ref/tests/unit/cli/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_ingest(self, sample_data_dir, db, invoke_cli):

assert db.session.query(Dataset).count() == 5
assert db.session.query(CMIP6Dataset).count() == 5
assert db.session.query(CMIP6File).count() == 8
assert db.session.query(CMIP6File).count() == 5

def test_ingest_and_solve(self, sample_data_dir, db, invoke_cli):
result = invoke_cli(
Expand Down Expand Up @@ -82,7 +82,7 @@ def test_ingest_multiple_times(self, sample_data_dir, db, invoke_cli):
)

assert db.session.query(Dataset).count() == 1
assert db.session.query(CMIP6File).count() == 2
assert db.session.query(CMIP6File).count() == 1

invoke_cli(
[
Expand Down
3 changes: 0 additions & 3 deletions packages/ref/tests/unit/datasets/test_cmip6.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ def test_load_catalog(self, db_seeded, catalog_regression, sample_data_dir):
for k in adapter.dataset_specific_metadata + adapter.file_specific_metadata:
assert k in df.columns

assert len(df) == 13 # unique files
assert df.groupby("instance_id").ngroups == 10 # unique datasets

# The order of the rows may be flakey due to sqlite ordering and the created time resolution
catalog_regression(df.sort_values(["instance_id", "start_time"]), basename="cmip6_catalog_db")

Expand Down
Loading

0 comments on commit 59a3565

Please sign in to comment.