Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster Info #455

Open
wants to merge 10 commits into
base: dev
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/workflows/gempyor-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ jobs:
shell: bash
- name: Run gempyor tests
run: |
export FLEPI_PATH=$(pwd)
cd flepimop/gempyor_pkg
pytest --exitfirst
shell: bash
Expand Down
37 changes: 37 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ on:
push:
paths:
- 'flepimop/gempyor_pkg/**/*.py'
- 'info/**/*'
pull_request:
types:
- edited
Expand All @@ -14,8 +15,10 @@ on:
- synchronize
paths:
- '**/*.py'
- 'info/**/*'
branches:
- main
- dev

jobs:
black-for-python:
Expand Down Expand Up @@ -45,3 +48,37 @@ jobs:
with:
src: ${{ env.BLACK_SRC }}
options: "--line-length ${{ env.BLACK_LINE_LENGTH }} --extend-exclude '${{ env.BLACK_EXTEND_EXCLUDE }}' --check --verbose"
check-info-json-schema:
runs-on: ubuntu-latest
if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
steps:
- name: Checkout
uses: actions/checkout@v4
with:
lfs: true
sparse-checkout: |
info
sparse-checkout-cone-mode: false
- name: Install yq and check-jsonschema
run: |
sudo apt update
sudo apt install snapd
sudo snap install yq
sudo apt install pipx
pipx install check-jsonschema
- name: Convert YAML to JSON
run: |
cd info
for d in $( ls ); do
cd $d
for y in *.yml; do
yq --output-format json $y > ${y%.yml}.json
done
for j in *.json; do
if [[ "$j" != "schema.json" ]]; then
check-jsonschema --verbose --schemafile schema.json $j
fi
done
cd ..
done
cd ..
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,6 @@ flepimop/gempyor_pkg/.coverage

# Environment variables
.env

# info/ directory
info/**/*.json
1 change: 1 addition & 0 deletions flepimop/gempyor_pkg/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"numpy",
"pandas",
"pyarrow",
"pydantic",
"scipy",
"seaborn",
"sympy",
Expand Down
223 changes: 223 additions & 0 deletions flepimop/gempyor_pkg/src/gempyor/info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""
Retrieving static information from developer managed yaml files.

Currently, it includes utilities for handling cluster-specific information, but it can
be extended to other categories as needed.

Classes:
Module: Represents a software module with a name and optional version.
PathExport: Represents a path export with a path, prepend flag, and error handling.
Cluster: Represents a cluster with a name, list of modules, and list of path
exports.

Functions:
get_cluster_info: Retrieves cluster-specific information.

Notes:
By default the order for search paths is:

1) The current working directory, then
2) The directory specified by the `$FLEPI_INFO_PATH` environment variable if set,
and finally
3) The directory specified by the `$FLEPI_PATH` environment variable if set.

The functions in this module will search for an `info/` directory under the search
paths with a structure of `info/<category>/<name>.yml` where `<category>` is the
category of the information and `<name>` is the name of the information. The first
yaml file found will be used to populate the model.

The default search paths can be overridden by passing a list of paths to the
function being used via the `search_paths` argument.

Examples:
>>> from pprint import pprint
>>> from gempyor.info import get_cluster_info
>>> cluster_info = get_cluster_info("longleaf")
>>> cluster_info.name
'longleaf'
>>> pprint(cluster_info.modules)
[Module(name='gcc', version='9.1.0'),
Module(name='anaconda', version='2023.03'),
Module(name='git', version=None),
Module(name='aws', version=None)]
"""

__all__ = ["Cluster", "Module", "PathExport", "get_cluster_info"]


from collections.abc import Iterable
import os
from pathlib import Path
import re
from socket import getfqdn
from typing import Pattern, TypeVar

from pydantic import BaseModel
import yaml


class Module(BaseModel):
    """
    Configuration for a single environment module to load.

    Attributes:
        name: The module's name as known to the module system.
        version: An explicit version of the module to request, or `None` to accept
            whatever the module system provides by default.

    See Also:
        [Lmod](https://lmod.readthedocs.io/en/latest/)
    """

    # e.g. name="gcc", version="9.1.0"; version is omitted for unversioned loads.
    name: str
    version: str | None = None


class PathExport(BaseModel):
    """
    Configuration describing a directory to expose via `$PATH`.

    Attributes:
        path: The directory to add to the `$PATH` environment variable.
        prepend: Whether the directory should be placed in front of the existing
            `$PATH` entries rather than after them.
        error_if_missing: Whether a nonexistent `path` should be treated as an error
            instead of being silently tolerated.
    """

    path: Path
    prepend: bool = True
    error_if_missing: bool = False


class Cluster(BaseModel):
    """
    Top-level description of an HPC cluster's environment.

    Attributes:
        name: A short human readable identifier for the cluster.
        modules: The environment modules to load on this cluster.
        path_exports: The directories to expose via `$PATH` on this cluster.
    """

    # Pydantic deep-copies mutable defaults per instance, so the bare lists are safe.
    name: str
    modules: list[Module] = []
    path_exports: list[PathExport] = []


_BASE_MODEL_TYPE = TypeVar("T", bound=BaseModel)


_CLUSTER_FQDN_REGEXES: tuple[tuple[str, Pattern], ...] = (
("longleaf", re.compile(r"^longleaf\-login[0-9]+\.its\.unc\.edu$")),
("rockfish", re.compile(r"^login[0-9]+\.cm\.cluster$")),
)


def _get_info(
    category: str,
    name: str,
    model: type[_BASE_MODEL_TYPE],
    search_paths: Iterable[os.PathLike | str] | os.PathLike | str | None,
) -> _BASE_MODEL_TYPE:
    """
    Get and parse an information yaml file.

    This function is a light wrapper around reading and parsing yaml files located in
    `$FLEPI_PATH/info`.

    Args:
        category: The category of info to get, corresponds to a subdirectory in
            `$FLEPI_PATH/info`.
        name: The name of the info to get, corresponds to the name of a yaml file and
            is usually a human readable short name.
        model: The pydantic class to parse the info file with, determines the return
            type.
        search_paths: Either a path(s) like determining the directory to look for the
            info directory in or `None` to use the default search paths.

    Notes:
        The default search paths are:
        1) The current working directory, then
        2) The directory specified by the `$FLEPI_INFO_PATH` environment variable if
           set, and finally
        3) The directory specified by the `$FLEPI_PATH` environment variable if set.

    Returns:
        An instance of `model` with the contained info found and parsed.

    Raises:
        ValueError: If no `info/<category>/<name>.yml` file is found under any of the
            search paths.
    """
    if search_paths is None:
        # Default search order; unset environment variables are skipped.
        search_paths = [
            p
            for p in (Path.cwd(), os.getenv("FLEPI_INFO_PATH"), os.getenv("FLEPI_PATH"))
            if p is not None
        ]
    elif isinstance(search_paths, (os.PathLike, str)):
        # Normalize a single path-like argument to a one-element list.
        search_paths = [search_paths]
    search_paths = [Path(p).absolute() for p in search_paths]
    # First match wins. `Path.is_file()` returns `False` for nonexistent paths, so the
    # original's separate `exists()` check was redundant.
    info = next(
        (
            info
            for p in search_paths
            if (info := p / "info" / category / f"{name}.yml").is_file()
        ),
        None,
    )
    if info is None:
        raise ValueError(
            f"An {category}/{name}.yml file was not found in any of the following "
            f"directories: {', '.join(map(lambda p: str(p / 'info'), search_paths))}."
        )
    return model.model_validate(yaml.safe_load(info.read_text()))


def get_cluster_info(
    name: str | None,
    search_paths: Iterable[os.PathLike | str] | os.PathLike | str | None = None,
) -> Cluster:
    """
    Get cluster specific info.

    Args:
        name: The name of the cluster to pull information for. Currently only 'longleaf'
            and 'rockfish' are supported or `None` to infer from the FQDN.
        search_paths: Either a path(s) like determining the directory to look for the
            info directory in or `None` to use the default search paths.

    Returns:
        An object containing the information about the `name` cluster.

    Raises:
        ValueError: If `name` is `None` and the FQDN does not match a known cluster,
            or if no `cluster/<name>.yml` file is found in the search paths.

    Examples:
        >>> from gempyor.info import get_cluster_info
        >>> cluster_info = get_cluster_info("longleaf")
        >>> cluster_info.name
        'longleaf'
    """
    name = _infer_cluster_from_fqdn() if name is None else name
    return _get_info("cluster", name, Cluster, search_paths)


def _infer_cluster_from_fqdn(raise_error: bool = True) -> str | None:
    """
    Infer the cluster name from the FQDN.

    Args:
        raise_error: A flag indicating whether to raise an error if the FQDN does not
            match any of the expected regexes.

    Returns:
        The name of the cluster inferred from the FQDN, or `None` when nothing matches
        and `raise_error` is `False`.

    Raises:
        ValueError: If the value of `socket.getfqdn()` does not match an expected regex
            and `raise_error` is `True`.
    """
    fqdn = getfqdn()
    # Take the first cluster whose login-node regex matches this host's FQDN.
    inferred = next(
        (cluster for cluster, regex in _CLUSTER_FQDN_REGEXES if regex.match(fqdn)),
        None,
    )
    if inferred is None and raise_error:
        raise ValueError(
            f"The fqdn, '{fqdn}', does not match any of the expected clusters."
        )
    return inferred
78 changes: 78 additions & 0 deletions flepimop/gempyor_pkg/tests/info/test__get_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pathlib import Path
from typing import Literal

from pydantic import BaseModel
import pytest
import yaml

from gempyor.info import _get_info


class NameOnly(BaseModel):
    """Minimal pydantic model with a single required `name` field, used as the
    `model` argument when exercising `_get_info` against the mock yaml files."""

    name: str


@pytest.fixture
def create_mock_info_directory(tmp_path: Path) -> Path:
    """Populate `tmp_path/info/abc/` with two small yaml files and return the
    absolute path of `tmp_path` (the directory `_get_info` should search from)."""
    mock_files = {
        "abc/def.yml": {"name": "Foobar"},
        "abc/ghi.yml": {"name": "Fizzbuzz"},
    }
    for relative_name, payload in mock_files.items():
        target = tmp_path / "info" / relative_name
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open(mode="w") as handle:
            yaml.dump(payload, handle)
    return tmp_path.absolute()


def test_file_does_not_exist_value_error(
    monkeypatch: pytest.MonkeyPatch, create_mock_info_directory: Path
) -> None:
    """Requesting a category/name with no yaml file raises `ValueError`."""
    # Point $FLEPI_PATH at the fixture's parent, which holds no `info/` tree, so the
    # lookup cannot succeed via the environment-variable search path.
    monkeypatch.setenv("FLEPI_PATH", str(create_mock_info_directory.parent))
    with pytest.raises(ValueError):
        _get_info("does_not", "exist", object, None)


@pytest.mark.parametrize(
    ("category", "name", "model"), (("abc", "def", NameOnly), ("abc", "ghi", NameOnly))
)
def test_output_validation_with_working_directory(
    monkeypatch: pytest.MonkeyPatch,
    create_mock_info_directory: Path,
    category: str,
    name: str,
    model: type[BaseModel],
) -> None:
    """`_get_info` resolves files via the current working directory search path."""
    # The cwd is the first default search path, so chdir into the mocked tree.
    monkeypatch.chdir(create_mock_info_directory)
    _output_validation_test(create_mock_info_directory, category, name, model)


@pytest.mark.parametrize(
    ("category", "name", "model"), (("abc", "def", NameOnly), ("abc", "ghi", NameOnly))
)
@pytest.mark.parametrize("envvar", ("FLEPI_INFO_PATH", "FLEPI_PATH"))
def test_output_validation_with_env_vars(
    monkeypatch: pytest.MonkeyPatch,
    create_mock_info_directory: Path,
    category: str,
    name: str,
    model: type[BaseModel],
    envvar: Literal["FLEPI_INFO_PATH", "FLEPI_PATH"],
) -> None:
    """`_get_info` resolves files via either supported environment variable."""
    monkeypatch.setenv(envvar, str(create_mock_info_directory))
    _output_validation_test(create_mock_info_directory, category, name, model)


def _output_validation_test(
    path: Path,
    category: str,
    name: str,
    model: type[BaseModel],
) -> None:
    """Assert `_get_info` yields equal, correctly-typed results whether given no
    search path, a `Path`, or a plain string path."""
    collected = []
    for candidate in (None, path, str(path)):
        parsed = _get_info(category, name, model, candidate)
        assert isinstance(parsed, model)
        collected.append(parsed)
    # Every variant of `search_paths` must produce an equivalent parsed model.
    for previous, current in zip(collected, collected[1:]):
        assert previous == current
Loading
Loading