Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Manifest JSON Writing Utility #486

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions flepimop/gempyor_pkg/src/gempyor/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
"""
Generate manifest files containing batch run metadata.

This module provides functionality for working with and executing batch jobs, either on
HPC or AWS environments.

Functions:
write_manifest: A utility to generate a manifest json file with metadata for a batch
job.
"""

__all__ = ["write_manifest"]


import json
from pathlib import Path
import sys
from typing import Any

from gempyor.utils import _git_head


def write_manifest(
    job_name: str,
    flepi_path: Path,
    project_path: Path,
    destination: Path | None = None,
    **additional_meta: Any,
) -> Path:
    """
    Write job metadata to a manifest file.

    This function produces a manifest metadata file for a batch run. By default the
    json generated by this function will contain:
    * 'cmd': The command line arguments provided to the CLI script invoked
        (`sys.argv` joined by single spaces).
    * 'job_name': A human readable unique job name.
    * 'data_sha': The git commit of the project git repository, called 'data' for
        legacy reasons.
    * 'flepimop_sha': The git commit of the flepiMoP git repository.
    Further data can be provided via `**additional_meta`, but these values are
    overridden by the defaults described above.

    The manifest is fully serialized before the destination file is opened, so a
    value in `additional_meta` that cannot be json encoded raises without truncating
    or partially overwriting an existing file at `destination`.

    Args:
        job_name: A user specified or generated from user specified values unique name
            for the job.
        flepi_path: The path to the flepiMoP git repository being used.
        project_path: The path to the project git repository being used.
        destination: Either a path to where the json file should be written or `None` to
            write the json file to 'manifest.json' in the current working directory.
        additional_meta: User specified additional fields added to the manifest json.
            Values with the name 'cmd', 'job_name', 'data_sha', or 'flepimop_sha' will
            be overridden by the default behavior. Must be a json encodable type.

    Returns:
        The path to the written json file. When `destination` is given it is returned
        as provided; otherwise the absolute path to 'manifest.json' in the current
        working directory is returned.

    Raises:
        TypeError: If a value in `additional_meta` is not json encodable.

    Examples:
        The 'cmd' and sha values below are illustrative and depend on how the
        process was invoked and on the state of the repositories.

        >>> import os
        >>> from pathlib import Path
        >>> flepi_path = Path(os.environ["FLEPI_PATH"])
        >>> project_path = flepi_path / "examples" / "tutorials"
        >>> manifest = write_manifest("Foobar", flepi_path, project_path)
        >>> manifest.name
        'manifest.json'
        >>> print(manifest.read_text())
        {
            "cmd": "",
            "job_name": "Foobar",
            "data_sha": "59fe36d13fe34b6c1fb5c92bf8c53b83bd3ba593",
            "flepimop_sha": "2bdfbc74e69bdd0243ef8340dda238f5504f1ad9"
        }
    """
    flepimop_sha = _git_head(flepi_path)
    data_sha = _git_head(project_path)

    # User supplied fields are unpacked first so the default fields listed after
    # them always win on a key collision.
    manifest = {
        **additional_meta,
        "cmd": " ".join(sys.argv),
        "job_name": job_name,
        "data_sha": data_sha,
        "flepimop_sha": flepimop_sha,
    }

    # Serialize before touching the filesystem: if encoding fails we must not leave
    # behind an empty or half-written manifest.
    payload = json.dumps(manifest, indent=4)

    destination = Path("manifest.json").absolute() if destination is None else destination
    destination.write_text(payload, encoding="utf-8")

    return destination
76 changes: 76 additions & 0 deletions flepimop/gempyor_pkg/src/gempyor/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections.abc import Iterable
import datetime
import functools
import logging
Expand Down Expand Up @@ -1111,3 +1112,78 @@ def _represent_str(self, dumper, data):
return yaml.dump(
yaml.safe_load(cfg.dump()), Dumper=CustomDumper, indent=4, sort_keys=False
)


def _shutil_which(
cmd: str,
mode: int = os.F_OK | os.X_OK,
path: str | bytes | os.PathLike | None = None,
check: bool = True,
) -> str | None:
"""
A thin wrapper around `shutil.which` with extra validation.

Args:
cmd: The name of the command to search for.
mode: The permission mask required of possible files.
path: A path describing the locations to search, or `None` to use
the PATH environment variable.
check: If `True` an `OSError` will be raised if a `cmd` is not found.
Similar in spirit to the `check` arg of `subprocess.run`.

Returns:
Either the full path to the `cmd` found, or `None` if `cmd` is not
found and `check` is `False`.

Raises:
OSError: If `cmd` is not found and `check` is `True`.

Examples:
>>> import os
>>> from shutil import which
>>> _shutil_which("python") == which("python")
True
>>> _shutil_which("does_not_exist", check=False)
>>> try:
... _shutil_which("does_not_exist", check=True)
... except Exception as e:
... print(type(e))
... print(str(e).replace(os.environ.get("PATH"), "..."))
...
<class 'OSError'>
Did not find 'does_not_exist' on path '...'.

See Also:
[`shutil.which`](https://docs.python.org/3/library/shutil.html#shutil.which)
"""
result = shutil.which(cmd, mode=mode, path=path)
if check and result is None:
path = os.environ.get("PATH") if path is None else path
raise OSError(f"Did not find '{cmd}' on path '{path}'.")
return result


def _git_head(repository: Path) -> str:
    """
    Look up the commit sha that HEAD points at in a git repository.

    Args:
        repository: A directory under version control with git whose HEAD commit
            should be reported. A leading '~' is expanded and relative paths are
            made absolute before git is invoked.

    Returns:
        The sha commit of head for `repository`, with surrounding whitespace
        stripped.

    Raises:
        OSError: If no `git` executable can be located.
        subprocess.CalledProcessError: If `git rev-parse HEAD` exits non-zero.

    Examples:
        >>> import os
        >>> from pathlib import Path
        >>> _git_head(Path(os.environ["FLEPI_PATH"]))
        'efe896b1a5e4f8e33667c170cd5319d6ef1e3db5'
    """
    # NOTE(review): reviewer feedback on this helper -- shelling out to git seems
    # fine for now, but consider a git library such as GitPython
    # (https://github.com/gitpython-developers/GitPython) depending on how
    # sophisticated flepimop's interaction with version control gets. It may also
    # be worth declaring in the dependencies that flepimop expects git to be
    # available (possibly as a separate issue).
    git_executable = _shutil_which("git")
    working_directory = repository.expanduser().absolute()
    completed = subprocess.run(
        [git_executable, "rev-parse", "HEAD"],
        cwd=working_directory,
        capture_output=True,
        check=True,
    )
    raw_sha = completed.stdout.decode()
    return raw_sha.strip()
74 changes: 74 additions & 0 deletions flepimop/gempyor_pkg/tests/batch/test_write_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import hashlib
import json
from pathlib import Path
from unittest.mock import patch
from typing import Any

import pytest

from gempyor.batch import write_manifest
from gempyor.utils import _git_head


@pytest.mark.parametrize("job_name", ("my job name", "flu scenario"))
@pytest.mark.parametrize(
    "flepi_path", (Path("/path/to/flepiMoP"), Path("flepiMoP"), Path("~/flepiMoP"))
)
@pytest.mark.parametrize(
    "project_path", (Path("/path/to/project"), Path("project"), Path("~/project"))
)
@pytest.mark.parametrize(
    "destination",
    (
        None,
        Path("manifest.json"),
        Path("/absolute/manifest.json"),
        Path("not_manifest.json"),
    ),
)
@pytest.mark.parametrize(
    "additional_meta",
    ({}, {"var1": 1, "var2": "abc", "other": 3.14, "bool": True, "null": None}),
)
def test_output_validation(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    job_name: str,
    flepi_path: Path,
    project_path: Path,
    destination: Path | None,
    additional_meta: dict[str, Any],
) -> None:
    """Check the path returned by `write_manifest` and the json content it writes."""
    # Work from a scratch directory so relative/default destinations land in
    # `tmp_path`; absolute destinations are re-rooted there as well.
    monkeypatch.chdir(tmp_path)
    if isinstance(destination, Path) and destination.is_absolute():
        destination = tmp_path / destination.name

    def git_head_wraps(repository: Path) -> str:
        # The parametrized repositories do not exist, so report a deterministic
        # fake sha derived from the path instead of invoking git on them.
        return (
            hashlib.sha1(str(repository).encode()).hexdigest()
            if repository in [flepi_path, project_path]
            else _git_head(repository)
        )

    with patch("gempyor.batch._git_head", wraps=git_head_wraps):
        manifest_file = write_manifest(
            job_name, flepi_path, project_path, destination=destination, **additional_meta
        )

    # The expected path must be computed before comparing: a bare
    # `a == b if cond else c` parses as `(a == b) if cond else c`, which would only
    # check the truthiness of `destination` whenever it is not `None`.
    expected_file = (
        Path("manifest.json").absolute() if destination is None else destination
    )
    assert manifest_file == expected_file

    with manifest_file.open(encoding="utf-8") as f:
        manifest = json.load(f)

    # 'cmd' depends on how the test runner was invoked, so only its type is checked.
    assert "cmd" in manifest and isinstance(manifest["cmd"], str)
    del manifest["cmd"]
    assert manifest == {
        **additional_meta,
        **{
            "job_name": job_name,
            "data_sha": hashlib.sha1(str(project_path).encode()).hexdigest(),
            "flepimop_sha": hashlib.sha1(str(flepi_path).encode()).hexdigest(),
        },
    }
52 changes: 52 additions & 0 deletions flepimop/gempyor_pkg/tests/utils/test__git_head.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
from pathlib import Path
from unittest.mock import patch
import subprocess

import pytest

from gempyor.utils import _git_head, _shutil_which


@pytest.mark.parametrize(
    "repository",
    (Path("/mock/repository"), Path("relative/repository/path"), Path("~/repo")),
)
@pytest.mark.parametrize(
    "sha",
    (
        "59fe36d13fe34b6c1fb5c92bf8c53b83bd3ba593",
        "bba583acf3c4b17ab3241288bff4bcad271a807c",
    ),
)
def test_output_validation(repository: Path, sha: str) -> None:
    """Check `_git_head` returns the sha git prints and invokes git as expected."""

    def fake_which(
        cmd: str,
        mode: int = os.F_OK | os.X_OK,
        path: str | bytes | os.PathLike | None = None,
        check: bool = True,
    ) -> str | None:
        # Pretend git is always resolvable; defer to the real helper otherwise.
        if cmd == "git":
            return "git"
        return _shutil_which(cmd, mode=mode, path=path, check=check)

    def fake_run(args, **kwargs):
        # Only git invocations are intercepted; they "print" the parametrized sha
        # followed by a newline.
        if os.path.basename(args[0]) != "git":
            return subprocess.run(args, **kwargs)
        return subprocess.CompletedProcess(
            args=args, returncode=0, stdout=f"{sha}\n".encode(), stderr=b""
        )

    which_patcher = patch("gempyor.utils._shutil_which", wraps=fake_which)
    run_patcher = patch("gempyor.utils.subprocess.run", wraps=fake_run)
    with which_patcher as which_mock, run_patcher as run_mock:
        assert _git_head(repository) == sha

        which_mock.assert_called_once_with("git")
        run_mock.assert_called_once()

        recorded_call = run_mock.call_args
        assert recorded_call.args[0] == ["git", "rev-parse", "HEAD"]
        assert recorded_call.kwargs["cwd"] == repository.expanduser().absolute()
41 changes: 41 additions & 0 deletions flepimop/gempyor_pkg/tests/utils/test__shutil_which.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
from pathlib import Path
from shutil import which

import pytest

from gempyor.utils import _shutil_which


@pytest.fixture
def custom_path_setup(tmp_path: Path) -> str:
    """
    Populate `tmp_path` with executable scripts and return a search path for them.

    Returns:
        The parent directories of the created scripts joined with `os.pathsep`,
        suitable for the `path` argument of `shutil.which`.
    """
    relative_scripts = (
        Path("abc"),
        Path("bin/def"),
        Path("bin/ghi"),
        Path("bin/jkl"),
        Path("dir1/dir2/xyz"),
    )
    # A set collapses the repeated 'bin' directory into a single entry.
    search_dirs = set()
    for relative_script in relative_scripts:
        script = tmp_path / relative_script
        script_dir = script.parent
        script_dir.mkdir(parents=True, exist_ok=True)
        script.write_text("#!/usr/bin/env bash\necho 'Hello!'")
        script.chmod(0o755)
        search_dirs.add(str(script_dir.absolute()))
    return os.pathsep.join(search_dirs)


@pytest.mark.parametrize("cmd", ("abc", "def", "ghi", "jkl", "xyz"))
def test_matches_python_stdlib_in_basic_case(custom_path_setup: str, cmd: str) -> None:
    """For commands that exist, `_shutil_which` must agree with `shutil.which`."""
    wrapped_result = _shutil_which(cmd, path=custom_path_setup, check=False)
    stdlib_result = which(cmd, path=custom_path_setup)
    assert wrapped_result == stdlib_result


def test_oserror_when_check_and_cmd_not_found(custom_path_setup: str) -> None:
    """A missing command with `check=True` raises `OSError` naming the search path."""
    import re

    cmd = "does_not_exist"
    # `match` is applied as a regular expression, so the expected message is escaped:
    # the temporary search path (and the trailing '.') may contain characters that
    # are special in a regex.
    expected_message = f"Did not find '{cmd}' on path '{custom_path_setup}'."
    with pytest.raises(OSError, match=f"^{re.escape(expected_message)}$"):
        _shutil_which(cmd, path=custom_path_setup, check=True)
Loading