Skip to content

Commit

Permalink
Merge pull request #878 from microbiomedata/866-runtime-use-nmdc-sche…
Browse files Browse the repository at this point in the history
…ma-provided-function-to-extract-typecode-from-id-regex-pattern
  • Loading branch information
eecavanna authored Jan 23, 2025
2 parents b3d7e0b + 196d123 commit b4db574
Showing 1 changed file with 18 additions and 50 deletions.
68 changes: 18 additions & 50 deletions nmdc_runtime/minter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from functools import lru_cache
from typing import List

from nmdc_runtime.util import get_nmdc_jsonschema_dict
from nmdc_schema.id_helpers import get_typecode_for_future_ids

from nmdc_runtime.util import get_nmdc_jsonschema_dict
from nmdc_runtime.api.db.mongo import get_mongo_db


Expand All @@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
return os.getenv("MINTING_SERVICE_ID")


def extract_typecode_from_pattern(pattern: str) -> str:
    r"""
    Return the typecode portion of the specified pattern fragment.

    The typecode sub-pattern is everything that precedes the first hyphen.
    It is either a single typecode written verbatim (e.g. ``foo``) or a
    pipe-delimited list of typecodes enclosed in parentheses (e.g.
    ``(foo|bar|baz)``), in which case the first typecode listed is returned.
    A leading ``?:`` (regex non-capturing group marker) inside the
    parentheses is ignored.

    :param pattern: The ``id`` pattern with its prefix already removed,
                    e.g. ``"(foo|bar)-123-456$"``.
    :return: The first (or only) typecode found in the pattern.

    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
    'foo'
    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
    'foo'
    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
    'foo'
    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
    'foo'
    >>> extract_typecode_from_pattern("(?:foo|bar)-123-456$")  # non-capturing group
    'foo'
    """

    # Get the portion of the pattern preceding the first hyphen.
    # e.g. "foo-bar-baz" → "foo"
    typecode_sub_pattern, _, _ = pattern.partition("-")

    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
        # Drop the non-capturing group marker, if any, so it does not leak into the typecode.
        # e.g. "?:apple|banana" → "apple|banana"
        inner_pattern = typecode_sub_pattern[1:-1].removeprefix("?:")

        # Finally, get everything before the first `|`, if any.
        # e.g. "apple|banana|carrot" → "apple"
        # e.g. "apple" → "apple"
        typecode, _, _ = inner_pattern.partition("|")
    else:
        # Note: This is the original behavior, before we added support for multi-typecode patterns.
        # e.g. "apple" → "apple"
        typecode = typecode_sub_pattern

    return typecode


@lru_cache()
def typecodes() -> List[dict]:
r"""
Returns a list of dictionaries containing typecodes and associated information derived from the schema.
Preconditions about the schema:
- The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
- The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
- The typecode portion of the pattern does not, itself, contain any hyphens.
TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
in a dedicated property of a class; for example, one named `typecode`).
Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
that class _today_; regardless of what it may have used in the past.
>>> typecode_descriptors = typecodes()
# Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
>>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
True
# Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
>>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
True
>>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
False
"""
id_pattern_prefix = r"^(nmdc):"

Expand All @@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
for cls_name, defn in schema_dict["$defs"].items():
match defn.get("properties"):
case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
# Get the portion of the pattern following the prefix.
# e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
index_of_first_character_following_prefix = len(id_pattern_prefix)
pattern_without_prefix = p[index_of_first_character_following_prefix:]
# Extract the typecode from the pattern.
typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)

rv.append(
{
"id": "nmdc:" + cls_name + "_" + "typecode",
"schema_class": "nmdc:" + cls_name,
"name": extract_typecode_from_pattern(pattern_without_prefix),
"name": typecode_for_future_ids,
}
)
case _:
Expand Down

0 comments on commit b4db574

Please sign in to comment.