Commit 27e3f0c

Merge pull request #861 from microbiomedata/860-real-time-referential-integrity-checking-is-broken

Fix bug where `validate_json`'s referential integrity check ignored documents already in the database
eecavanna authored Jan 17, 2025
2 parents: d3e146b + 5ab16c1
Showing 3 changed files with 113 additions and 69 deletions.
7 changes: 5 additions & 2 deletions nmdc_runtime/api/endpoints/metadata.py
@@ -168,10 +168,13 @@ def iter_grid_out():

 @router.post("/metadata/json:validate", name="Validate JSON")
 async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)):
-    """
+    r"""
     Validate a NMDC JSON Schema "nmdc:Database" object.
+
+    This API endpoint validates the JSON payload in two steps. The first step is to check the format of each document
+    (e.g., the presence, name, and value of each field). If it encounters any violations during that step, it will not
+    proceed to the second step. The second step is to check whether all documents referenced by the payload exist,
+    whether in the database or in the same JSON payload. We call the second step a "referential integrity check."
     """

     return validate_json(docs, mdb, check_inter_document_references=True)
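For readers who want to exercise the endpoint, a minimal sketch using Python's `requests` library follows. The base URL is an assumption (a locally running Runtime instance) and the IDs are illustrative; the error shape mirrors what the tests further below assert.

import requests

# Hypothetical "nmdc:Database" payload: one study referencing another study.
payload = {
    "study_set": [
        {
            "id": "nmdc:sty-00-000002",
            "type": "nmdc:Study",
            "study_category": "research_study",
            "part_of": ["nmdc:sty-00-000001"],  # may exist in the database or in this same payload
        },
    ]
}

response = requests.post(
    "http://localhost:8000/metadata/json:validate",  # assumed local Runtime base URL
    json=payload,
)
# On a referential-integrity failure, the body looks like:
#   {"result": "errors", "detail": {"study_set": ["Document 'nmdc:sty-00-000002' ..."]}}
print(response.json())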
128 changes: 62 additions & 66 deletions nmdc_runtime/util.py
@@ -542,6 +542,13 @@ class OverlayDB(AbstractContextManager):
     overlay collection, that id is marked as "seen" and will not also be returned when
     subsequently scanning the (unmodified) base-database collection.
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+          `overlay_db._top_db`, and perform arbitrary MongoDB queries on those individual databases that way. Access
+          to the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulate the
+          "merging" just in time to process each method invocation. See, e.g., the implementation of `merge_find`,
+          which internally accesses both the real database and the overlaying database.
+
     Mongo "update" commands (via the `apply_updates` method) are simulated by first copying affected
     documents from a base collection to the overlay, and then applying the updates to the overlay,
     so that, again, base collections are unmodified, and a `merge_find` call will produce a result
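Although no usage example appears in this diff, the class's intended pattern can be sketched from the methods shown here. A minimal sketch, assuming `mdb` is a live pymongo `Database`:

from nmdc_runtime.util import OverlayDB  # import path as used in this repo

# mdb: a pymongo Database, assumed to already exist in the caller's scope.
with OverlayDB(mdb) as overlay_db:
    # Writes go to the temporary overlay; the base database is never modified.
    overlay_db.replace_or_insert_many(
        "study_set",
        [{"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}],  # illustrative document
    )
    # Reads against the virtual "merged" view must go through OverlayDB methods
    # (e.g., `merge_find`); arbitrary pymongo queries are only possible directly
    # against `overlay_db._bottom_db` (real) and `overlay_db._top_db` (overlay).
# On exit, `__exit__` drops the temporary overlay database.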
@@ -570,13 +577,6 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._bottom_db.client.drop_database(self._top_db.name)

-    def get_collection(self, coll_name: str):
-        r"""Returns a reference to the specified collection."""
-        try:
-            return self._top_db[coll_name]
-        except OperationFailure as e:
-            raise OverlayDBError(str(e.details))
-
     def replace_or_insert_many(self, coll_name, documents: list):
         try:
             self._top_db[coll_name].insert_many(documents)
@@ -697,69 +697,65 @@ def validate_json(

     # Third pass (if enabled): Check inter-document references.
     if check_inter_document_references is True:
-        # Insert all documents specified for all collections specified, into the OverlayDB.
+        # Prepare to use `refscan`.
         #
-        # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to,
-        #       instead, validate it after processing _each_ collection, we would get a false positive if a document
-        #       inserted into an earlier-processed collection happened to reference a document slated for insertion
-        #       into a later-processed collection. By waiting until all documents in all collections specified have
-        #       been inserted, we avoid that situation.
+        # Note: We check the inter-document references in two stages, which are:
+        #       1. For each document in the JSON payload, check whether each document it references already exists
+        #          (in the collections the schema says it can exist in) in the database. We use the
+        #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+        #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+        #          check whether that document exists (in the collections the schema says it can exist in) in the
+        #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+        #       The violations that remain after those two stages are the ones we return to the caller.
         #
-        with OverlayDB(mdb) as overlay_db:
-            print(f"Inserting documents into the OverlayDB.")
-            for collection_name, documents_to_insert in docs.items():
-                # Insert the documents into the OverlayDB.
-                #
-                # Note: The `isinstance(..., list)` check is here to work around the fact that the previous
-                #       validation stages allow for the request payload to specify a collection named "@type" whose
-                #       value is a string, as opposed to a list of dictionaries.
-                #
-                #       I don't know why those stages do that. I posed the question in this GitHub Discussion:
-                #       https://github.com/microbiomedata/nmdc-runtime/discussions/858
-                #
-                #       The `len(...) > 0` check is here because pymongo complains when `insert_many` is called
-                #       with an empty list.
-                #
-                if (
-                    isinstance(documents_to_insert, list)
-                    and len(documents_to_insert) > 0
-                ):
-                    try:
-                        overlay_db.replace_or_insert_many(
-                            collection_name, documents_to_insert
-                        )
-                    except OverlayDBError as error:
-                        validation_errors[collection_name].append(str(error))
-
-            # Now that the OverlayDB contains all the specified documents, we will check whether
-            # every document referenced by any of the inserted documents exists.
-            finder = Finder(database=overlay_db)
-            references = get_allowed_references()
-            reference_field_names_by_source_class_name = (
-                references.get_reference_field_names_by_source_class_name()
-            )
-            for source_collection_name, documents_inserted in docs.items():
-                # If `documents_inserted` is not a list (which is a scenario that the previous validation stages
-                # allow), abort processing this collection and proceed to processing the next collection.
-                if not isinstance(documents_inserted, list):
-                    continue
-
-                # Check the referential integrity of the replaced or inserted documents.
-                print(
-                    f"Checking references emanating from documents inserted into '{source_collection_name}'."
-                )
+        # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_ is that the `OverlayDB`
+        #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+        #       is not a drop-in replacement for pymongo's `Database` class, which is the only thing that
+        #       `refscan`'s `Finder` class accepts.
+        #
+        finder = Finder(database=mdb)
+        references = get_allowed_references()
+        reference_field_names_by_source_class_name = (
+            references.get_reference_field_names_by_source_class_name()
+        )
+
+        # Iterate over the collections in the JSON payload.
+        for source_collection_name, documents in in_docs.items():
+            for document in documents:
+                # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                source_document = dict(document, _id=None)
+                violations = scan_outgoing_references(
+                    document=source_document,
+                    schema_view=nmdc_schema_view(),
+                    reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
+                    references=references,
+                    finder=finder,
+                    collection_names=nmdc_database_collection_names(),
+                    source_collection_name=source_collection_name,
+                    user_wants_to_locate_misplaced_documents=False,
+                )
-                for document in documents_inserted:
-                    violations = scan_outgoing_references(
-                        document=document,
-                        schema_view=nmdc_schema_view(),
-                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
-                        references=references,
-                        finder=finder,
-                        collection_names=nmdc_database_collection_names(),
-                        source_collection_name=source_collection_name,
-                        user_wants_to_locate_misplaced_documents=False,
-                    )
-                    for violation in violations:
+
+                # For each violation, check whether the referenced document is in the JSON payload, itself.
+                for violation in violations:
+                    can_waive_violation = False
+                    # Determine which collections can contain the referenced document, based upon
+                    # the schema class of which this source document is an instance.
+                    target_collection_names = (
+                        references.get_target_collection_names(
+                            source_class_name=violation.source_class_name,
+                            source_field_name=violation.source_field_name,
+                        )
+                    )
+                    # Check whether the referenced document exists in any of those collections in the JSON payload.
+                    for json_coll_name, json_coll_docs in in_docs.items():
+                        if json_coll_name in target_collection_names:
+                            for json_coll_doc in json_coll_docs:
+                                if json_coll_doc["id"] == violation.target_id:
+                                    can_waive_violation = True
+                                    break  # stop checking
+                        if can_waive_violation:
+                            break  # stop checking
+                    if not can_waive_violation:
+                        violation_as_str = (
+                            f"Document '{violation.source_document_id}' "
+                            f"in collection '{violation.source_collection_name}' "
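To make the two-stage check above concrete, here is a self-contained sketch of the stage-2 "waiving" logic in isolation. `FakeViolation` and all of the values are stand-ins for `refscan`'s real objects, not code from this commit:

from dataclasses import dataclass


@dataclass
class FakeViolation:
    target_id: str  # id of a referenced document that stage 1 did not find in the database


# Stage 1 (simulated): two outgoing references were not satisfied by the database.
violations = [FakeViolation("nmdc:sty-00-000008"), FakeViolation("nmdc:sty-00-000009")]

# The JSON payload itself introduces one of the two referenced documents.
in_docs = {"study_set": [{"id": "nmdc:sty-00-000009", "type": "nmdc:Study"}]}
target_collection_names = ["study_set"]  # normally derived from the NMDC schema

# Stage 2: waive any violation whose target document appears in the payload.
remaining = [
    violation
    for violation in violations
    if not any(
        doc["id"] == violation.target_id
        for coll_name, coll_docs in in_docs.items()
        if coll_name in target_collection_names
        for doc in coll_docs
    )
]
print([v.target_id for v in remaining])  # ['nmdc:sty-00-000008']

Only the reference satisfied by neither the database nor the payload survives as an error, which is exactly what the new tests below assert.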
47 changes: 46 additions & 1 deletion tests/test_the_util/test_the_util.py
@@ -6,7 +6,7 @@
 # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
 #      by issuing the following command from the root directory of the repository within the `fastapi` container:
 #      ```
-#      $ pytest tests/test_the_util/test_the_util.py
+#      $ pytest -vv tests/test_the_util/test_the_util.py
 #      ```

# Define a reusable dictionary that matches the value the `validate_json` function
@@ -148,6 +148,23 @@ def test_validate_json_does_not_check_references_if_documents_are_schema_defiant
     assert len(result["detail"]["study_set"]) == 1  # not 2


+def test_validate_json_reports_multiple_broken_references_emanating_from_single_document(db):
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": ["nmdc:sty-00-000008", "nmdc:sty-00-000009"],  # identifies 2 non-existent studies
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=db, **check_refs)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 2
+
+
 def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db):
     r"""
     Note: This test targets the scenario where a single payload introduces both the source document and target document
@@ -170,3 +187,31 @@ def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes
         ]
     }
     assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+
+def test_validate_json_considers_existing_documents_when_checking_references(db):
+    r"""
+    Note: This test focuses on the case where the database already contains the to-be-referenced document,
+          as opposed to the to-be-referenced document being introduced via the same request payload.
+          For that reason, we seed the database before calling the function-under-test.
+    """
+    existing_study_id = "nmdc:sty-00-000001"
+
+    db.get_collection("study_set").replace_one(
+        {"id": existing_study_id},
+        {"id": existing_study_id, "type": "nmdc:Study", "study_category": "research_study"},
+        upsert=True,
+    )
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": [existing_study_id],  # identifies the existing study
+            },
+        ]
+    }
+    assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+    db.get_collection("study_set").delete_one({"id": existing_study_id})
