Commit

Merge pull request #30 from CMIP-REF/dataset-cli
lewisjared authored Dec 5, 2024
2 parents 5873ac9 + 0b1a3df commit 83bc6f0
Showing 24 changed files with 596 additions and 287 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -51,7 +51,7 @@ jobs:
- name: docs
run: |
mkdir -p ${{ github.workspace }}/.ref/db
uv run ref ingest --source-type cmip6 .esgpull/data
uv run ref datasets ingest --source-type cmip6 .esgpull/data
uv run mkdocs build --strict
tests:
2 changes: 1 addition & 1 deletion .readthedocs.yaml
@@ -21,6 +21,6 @@ build:
# Fetch test data from ESGF (needed by notebooks)
- uv run esgpull self install $READTHEDOCS_REPOSITORY_PATH/.esgpull
- uv run python scripts/fetch_test_data.py
- uv run ref ingest --source-type cmip6 $READTHEDOCS_REPOSITORY_PATH/.esgpull/data
- uv run ref datasets ingest --source-type cmip6 $READTHEDOCS_REPOSITORY_PATH/.esgpull/data
# Run a strict build
- NO_COLOR=1 REF_DATA_ROOT=$READTHEDOCS_REPOSITORY_PATH/.esgpull/data uv run mkdocs build --strict --site-dir $READTHEDOCS_OUTPUT/html
4 changes: 2 additions & 2 deletions README.md
@@ -47,7 +47,7 @@ but the REF can still be used to evaluate it.
TODO: Docs for that workflow

```bash
ref ingest {data_path} --solve
ref datasets ingest {data_path} --solve
```

### As a devops engineer
@@ -105,7 +105,7 @@ uv run esgpull self install $PWD/.esgpull
uv run ref config list > $PWD/.ref/ref.toml
export REF_CONFIGURATION=$PWD/.ref
make fetch-test-data
uv run ref ingest --source-type cmip6 $PWD/.esgpull/data
uv run ref datasets ingest --source-type cmip6 $PWD/.esgpull/data
```

The local `ref.toml` configuration file will make it easier to play around with settings.
1 change: 1 addition & 0 deletions changelog/30.breaking.md
@@ -0,0 +1 @@
Renames `ref ingest` to `ref datasets ingest`
1 change: 1 addition & 0 deletions changelog/30.feature.md
@@ -0,0 +1 @@
Adds `ref datasets list` command to list ingested datasets
4 changes: 2 additions & 2 deletions docs/how-to-guides/dataset-selection.py
@@ -26,13 +26,13 @@
from ref_core.datasets import FacetFilter, SourceDatasetType
from ref_core.metrics import DataRequirement

from ref.cli.config import load_config
from ref.config import Config
from ref.database import Database

logger.remove()

# %%
config = load_config()
config = Config.default()
db = Database.from_config(config)

# %% [markdown]
81 changes: 81 additions & 0 deletions docs/how-to-guides/ingest-datasets.md
@@ -0,0 +1,81 @@
# Ingest Datasets

This guide will walk you through the process of ingesting local datasets into the REF.
Ingesting datasets is the first step in the REF workflow.

The REF supports the following dataset formats:

* CMIP6

Downloading the input data is out of scope for this guide,
but we recommend using [esgpull](https://esgf.github.io/esgf-download/) to download CMIP6 data.
If you have access to a high-performance computing (HPC) system,
you may have a local archive of CMIP6 data already available.


## What is Ingestion?

When processing metrics, the REF needs to know the location of the datasets and various metadata.
Ingestion is the process of extracting metadata from datasets and storing it in a local database.
This makes it easier to query and filter datasets for further processing.

The REF extracts metadata for each dataset (and for each file, where a dataset contains multiple files).
The collection of metadata, also known as a data catalog, is stored in a local SQLite database.
This database is used to query and filter datasets for further processing.
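
The catalog idea above can be sketched with the standard library alone. This is a minimal illustration of storing facet metadata in SQLite, assuming a hypothetical `dataset` table; the REF's actual schema (managed via alembic migrations) will differ.

```python
# Minimal sketch of a facet catalog in SQLite. Table and column names
# here are illustrative, not the REF's actual schema.
import sqlite3


def build_catalog(conn: sqlite3.Connection, datasets: list[dict]) -> None:
    """Store one row of facet metadata per dataset."""
    conn.execute(
        """CREATE TABLE IF NOT EXISTS dataset (
               instance_id TEXT PRIMARY KEY,
               source_id TEXT,
               experiment_id TEXT,
               variable_id TEXT
           )"""
    )
    conn.executemany(
        "INSERT OR REPLACE INTO dataset "
        "VALUES (:instance_id, :source_id, :experiment_id, :variable_id)",
        datasets,
    )
    conn.commit()


conn = sqlite3.connect(":memory:")
build_catalog(
    conn,
    [
        {
            "instance_id": "CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.tas.gn",
            "source_id": "ACCESS-ESM1-5",
            "experiment_id": "ssp126",
            "variable_id": "tas",
        },
    ],
)
# Once the metadata is in the database, filtering is a plain SQL query.
rows = conn.execute(
    "SELECT variable_id FROM dataset WHERE experiment_id = 'ssp126'"
).fetchall()
print(rows)  # [('tas',)]
```

The point of ingesting up front is exactly this: later steps query the catalog instead of re-opening every netCDF file.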

## Ingesting Datasets

To ingest datasets, use the `ref datasets ingest` command.
This command takes the path to a directory containing datasets as an argument,
along with the type of dataset being ingested via the `--source-type` option
(only `cmip6` is currently supported).

The command walks the provided directory looking for `*.nc` files to ingest.
Each file is loaded and its metadata is extracted.

```bash
>>> ref --log-level INFO datasets ingest --source-type cmip6 /path/to/cmip6
2024-12-05 12:00:05.979 | INFO | ref.database:__init__:77 - Connecting to database at sqlite:///.ref/db/ref.db
2024-12-05 12:00:05.987 | INFO | alembic.runtime.migration:__init__:215 - Context impl SQLiteImpl.
2024-12-05 12:00:05.987 | INFO | alembic.runtime.migration:__init__:218 - Will assume non-transactional DDL.
2024-12-05 12:00:05.989 | INFO | alembic.runtime.migration:run_migrations:623 - Running upgrade -> ea2aa1134cb3, dataset-rework
2024-12-05 12:00:05.995 | INFO | ref.cli.datasets:ingest:115 - ingesting /path/to/cmip6
2024-12-05 12:00:06.401 | INFO | ref.cli.datasets:ingest:127 - Found 9 files for 5 datasets

activity_id institution_id source_id experiment_id member_id table_id variable_id grid_label version
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rlut gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rlut gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rsdt gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rsdt gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rsut gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon rsut gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon tas gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 Amon tas gn v20210318
ScenarioMIP CSIRO ACCESS-ESM1-5 ssp126 r1i1p1f1 fx areacella gn v20210318

2024-12-05 12:00:06.409 | INFO | ref.cli.datasets:ingest:131 - Processing dataset CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rlut.gn
2024-12-05 12:00:06.431 | INFO | ref.cli.datasets:ingest:131 - Processing dataset CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rsdt.gn
2024-12-05 12:00:06.441 | INFO | ref.cli.datasets:ingest:131 - Processing dataset CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rsut.gn
2024-12-05 12:00:06.449 | INFO | ref.cli.datasets:ingest:131 - Processing dataset CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.tas.gn
2024-12-05 12:00:06.459 | INFO | ref.cli.datasets:ingest:131 - Processing dataset CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.fx.areacella.gn
```
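
The directory walk described above can be sketched as follows. Grouping files by their parent directory is a stand-in used here for brevity; the REF derives the dataset identity from each file's metadata, not from its path.

```python
# Illustrative sketch: find every *.nc file under a root directory and
# group files into candidate datasets. The grouping key (parent directory)
# is a simplification of how the REF actually identifies datasets.
import tempfile
from collections import defaultdict
from pathlib import Path


def find_datasets(root: Path) -> dict[str, list[Path]]:
    groups: dict[str, list[Path]] = defaultdict(list)
    for nc_file in sorted(root.rglob("*.nc")):
        groups[nc_file.parent.relative_to(root).as_posix()].append(nc_file)
    return dict(groups)


# Build a toy directory tree with two files belonging to one dataset.
root = Path(tempfile.mkdtemp())
(root / "tas/gn/v20210318").mkdir(parents=True)
(root / "tas/gn/v20210318/tas_2015.nc").touch()
(root / "tas/gn/v20210318/tas_2020.nc").touch()

groups = find_datasets(root)
print({key: len(files) for key, files in groups.items()})  # {'tas/gn/v20210318': 2}
```

This mirrors the log output above, where 9 files resolved to 5 datasets: several files can belong to a single dataset.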


### Querying ingested datasets

You can query the ingested datasets using the `ref datasets list` command.
This will display a list of datasets and their associated metadata.
The `--column` flag allows you to specify which columns to display (defaults to all columns).
See `ref datasets list-columns` for a list of available columns.

```bash
>>> ref datasets list --column instance_id --column variable_id

instance_id variable_id
─────────────────────────────────────────────────────────────────────────────────────
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rlut.gn rlut
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rsdt.gn rsdt
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.rsut.gn rsut
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.tas.gn tas
CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.fx.areacella.gn areacella
```
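
Conceptually, the `--column` flag is a projection over the catalog. A sketch against a hypothetical `dataset` table (the real schema and query layer differ):

```python
# Sketch of the column selection that `--column` performs: project only the
# requested facets out of the catalog. Table and column names are illustrative.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE dataset (instance_id TEXT, variable_id TEXT, grid_label TEXT)")
conn.execute(
    "INSERT INTO dataset VALUES "
    "('CMIP6.ScenarioMIP.CSIRO.ACCESS-ESM1-5.ssp126.r1i1p1f1.Amon.tas.gn', 'tas', 'gn')"
)


def list_datasets(conn: sqlite3.Connection, columns: list[str]) -> list[tuple]:
    # Validate column names before interpolating them into SQL.
    allowed = {"instance_id", "variable_id", "grid_label"}
    if not set(columns) <= allowed:
        raise ValueError(f"unknown columns: {set(columns) - allowed}")
    return conn.execute(f"SELECT {', '.join(columns)} FROM dataset").fetchall()


print(list_datasets(conn, ["variable_id"]))  # [('tas',)]
```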
4 changes: 2 additions & 2 deletions docs/how-to-guides/running-metrics-locally.py
@@ -38,7 +38,7 @@
from ref_core.datasets import SourceDatasetType
from ref_core.executor import run_metric

from ref.cli.config import load_config
from ref.config import Config
from ref.database import Database
from ref.datasets import get_dataset_adapter
from ref.provider_registry import ProviderRegistry
Expand All @@ -64,7 +64,7 @@
prettyprinter.pprint(metric.data_requirements[0])

# %% tags=["hide_code"]
config = load_config()
config = Config.default()
db = Database.from_config(config)

# %% [markdown]
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -12,6 +12,7 @@ nav:
- Configuration: configuration.md
- How-to guides:
- how-to-guides/index.md
- how-to-guides/ingest-datasets.md
- how-to-guides/dataset-selection.py
- how-to-guides/running-metrics-locally.py
- Tutorials: tutorials.md
2 changes: 1 addition & 1 deletion packages/ref/alembic/env.py
@@ -8,7 +8,7 @@

# Setup logging
capture_logging()
logger.info("Running alembic env")
logger.debug("Running alembic env")

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
1 change: 1 addition & 0 deletions packages/ref/conftest.py
@@ -53,4 +53,5 @@ def db_seeded(config, cmip6_data_catalog) -> Database:
for instance_id, data_catalog_dataset in cmip6_data_catalog.groupby(adapter.slug_column):
adapter.register_dataset(config, database, data_catalog_dataset)

database.session.commit()
return database
103 changes: 69 additions & 34 deletions packages/ref/src/ref/cli/__init__.py
@@ -1,50 +1,40 @@
"""Entrypoint for the CLI"""

import inspect
import logging
import sys
from enum import Enum
from pathlib import Path
from typing import Annotated, Optional

import typer
from attrs import define
from loguru import logger

from ref import __core_version__, __version__
from ref.cli import config, ingest, solve
from ref.cli import config, datasets, solve
from ref.cli._logging import capture_logging
from ref.config import Config
from ref.constants import config_filename
from ref.database import Database


class _InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
# Get corresponding Loguru level if it exists.
level: str | int
try:
level = logger.level(record.levelname).name
except ValueError: # pragma: no cover
level = record.levelno

# Find caller from where originated the logged message.
frame, depth = inspect.currentframe(), 0
while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
frame = frame.f_back
depth += 1
class LogLevel(str, Enum):
"""
Log levels for the CLI
"""

logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())
Normal = "WARNING"
Debug = "DEBUG"
Info = "INFO"


def capture_logging() -> None:
@define
class CLIContext:
"""
Capture logging from the standard library and redirect it to Loguru
Note that this replaces the root logger, so any other handlers attached to it will be removed.
Context object that can be passed to commands
"""
# logger.debug("Capturing logging from the standard library")
logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True)


app = typer.Typer(name="ref", no_args_is_help=True)

app.command(name="ingest")(ingest.ingest)
app.command(name="solve")(solve.solve)
app.add_typer(config.app, name="config")
config: Config
database: Database


def _version_callback(value: bool) -> None:
@@ -54,9 +44,48 @@ def _version_callback(value: bool) -> None:
raise typer.Exit()


def load_config(configuration_directory: Path | None = None) -> Config:
"""
Load the configuration from the specified directory
Parameters
----------
configuration_directory
The directory to load the configuration from
If the specified directory is not found, the process will exit with an exit code of 1
If None, the default configuration will be loaded
Returns
-------
:
The configuration loaded from the specified directory
"""
try:
if configuration_directory:
config = Config.load(configuration_directory / config_filename, allow_missing=False)
else:
config = Config.default()
except FileNotFoundError:
typer.secho("Configuration file not found", fg=typer.colors.RED)
raise typer.Exit(1)
return config


app = typer.Typer(name="ref", no_args_is_help=True)

app.command(name="solve")(solve.solve)
app.add_typer(config.app, name="config")
app.add_typer(datasets.app, name="datasets")


@app.callback()
def main(
verbose: bool = typer.Option(False, "--verbose", "-v"),
ctx: typer.Context,
configuration_directory: Annotated[Path | None, typer.Option(help="Configuration directory")] = None,
verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
log_level: Annotated[LogLevel, typer.Option(case_sensitive=False)] = LogLevel.Normal,
version: Annotated[
Optional[bool],
typer.Option("--version", callback=_version_callback, is_eager=True),
Expand All @@ -67,9 +96,15 @@ def main(
"""
capture_logging()

lvl = logging.INFO
if verbose:
lvl = logging.DEBUG
log_level = LogLevel.Debug

logger.remove()
logger.add(sys.stderr, level=lvl)
logger.add(sys.stderr, level=log_level.value)

config = load_config(configuration_directory)
ctx.obj = CLIContext(config=config, database=Database.from_config(config))


if __name__ == "__main__":
app()
35 changes: 35 additions & 0 deletions packages/ref/src/ref/cli/_logging.py
@@ -0,0 +1,35 @@
import inspect
import logging

from loguru import logger


class _InterceptHandler(logging.Handler):
def emit(self, record: logging.LogRecord) -> None:
# Get corresponding Loguru level if it exists.
level: str | int
try:
level = logger.level(record.levelname).name
except ValueError: # pragma: no cover
level = record.levelno

# Find caller from where originated the logged message.
frame, depth = inspect.currentframe(), 0
while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__):
frame = frame.f_back
depth += 1

logger.opt(depth=depth, exception=record.exc_info).log(level, record.getMessage())


def capture_logging() -> None:
"""
Capture logging from the standard library and redirect it to Loguru
Note that this replaces the root logger, so any other handlers attached to it will be removed.
"""
# logger.debug("Capturing logging from the standard library")
logging.basicConfig(handlers=[_InterceptHandler()], level=0, force=True)


__all__ = ["capture_logging", "logger"]
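
The `_InterceptHandler` pattern added above can be demonstrated without loguru. In this stdlib-only sketch a plain list stands in for the loguru sink, but the redirection mechanics are the same: `basicConfig(..., force=True)` replaces the root logger's handlers with one that forwards every record.

```python
# Stdlib-only sketch of the intercept-handler pattern from _logging.py.
# A list stands in for the loguru sink; loguru is not required to run this.
import logging

captured: list[str] = []


class InterceptHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # Forward every stdlib record to our sink instead of the default handlers.
        captured.append(f"{record.levelname}: {record.getMessage()}")


# force=True removes any handlers already attached to the root logger,
# just as capture_logging() does; level=0 lets every record through.
logging.basicConfig(handlers=[InterceptHandler()], level=0, force=True)

logging.getLogger("alembic").info("Running upgrade")
print(captured)  # ['INFO: Running upgrade']
```

This is why the REF sees alembic's log lines in its own output: third-party libraries log via the stdlib, and the handler reroutes those records to a single sink.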