Commit 27e3f0c

Merge pull request #861 from microbiomedata/860-real-time-referential-integrity-checking-is-broken

Fix bug where `validate_json`'s referential integrity check ignored documents already in the database
eecavanna authored Jan 17, 2025
2 parents: d3e146b + 5ab16c1
Showing 3 changed files with 113 additions and 69 deletions.
7 changes: 5 additions & 2 deletions nmdc_runtime/api/endpoints/metadata.py
@@ -168,10 +168,13 @@ def iter_grid_out():

 @router.post("/metadata/json:validate", name="Validate JSON")
 async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)):
-    """
+    r"""
     Validate a NMDC JSON Schema "nmdc:Database" object.
+
+    This API endpoint validates the JSON payload in two steps. The first step is to check the format of each document
+    (e.g., the presence, name, and value of each field). If it encounters any violations during that step, it will not
+    proceed to the second step. The second step is to check whether all documents referenced by the payload exist,
+    whether in the database or in the same JSON payload. We call the second step a "referential integrity check."
     """

     return validate_json(docs, mdb, check_inter_document_references=True)
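For readers who want to exercise the endpoint, a minimal sketch using Python's `requests` library follows. The base URL is an assumption (a locally running Runtime instance) and the IDs are illustrative; the error shape mirrors what the tests further below assert.

import requests

# Hypothetical "nmdc:Database" payload: one study referencing another study.
payload = {
    "study_set": [
        {
            "id": "nmdc:sty-00-000002",
            "type": "nmdc:Study",
            "study_category": "research_study",
            "part_of": ["nmdc:sty-00-000001"],  # may exist in the database or in this same payload
        },
    ]
}

response = requests.post(
    "http://localhost:8000/metadata/json:validate",  # assumed local Runtime base URL
    json=payload,
)
# On a referential-integrity failure, the body looks like:
#   {"result": "errors", "detail": {"study_set": ["Document 'nmdc:sty-00-000002' ..."]}}
print(response.json())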
128 changes: 62 additions & 66 deletions nmdc_runtime/util.py
@@ -542,6 +542,13 @@ class OverlayDB(AbstractContextManager):
     overlay collection, that id is marked as "seen" and will not also be returned when
     subsequently scanning the (unmodified) base-database collection.
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+          `overlay_db._top_db`, and perform arbitrary MongoDB queries on those individual databases that way. Access
+          to the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulate the
+          "merging" just in time to process each method invocation. See, e.g., the implementation of `merge_find`,
+          which internally accesses both the real database and the overlaying database.
+
     Mongo "update" commands (via the `apply_updates` method) are simulated by first copying affected
     documents from a base collection to the overlay, and then applying the updates to the overlay,
     so that, again, base collections are unmodified, and a `merge_find` call will produce a result
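Although no usage example appears in this diff, the class's intended pattern can be sketched from the methods shown here. A minimal sketch, assuming `mdb` is a live pymongo `Database`:

from nmdc_runtime.util import OverlayDB  # import path as used in this repo

# mdb: a pymongo Database, assumed to already exist in the caller's scope.
with OverlayDB(mdb) as overlay_db:
    # Writes go to the temporary overlay; the base database is never modified.
    overlay_db.replace_or_insert_many(
        "study_set",
        [{"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}],  # illustrative document
    )
    # Reads against the virtual "merged" view must go through OverlayDB methods
    # (e.g., `merge_find`); arbitrary pymongo queries are only possible directly
    # against `overlay_db._bottom_db` (real) and `overlay_db._top_db` (overlay).
# On exit, `__exit__` drops the temporary overlay database.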
@@ -570,13 +577,6 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._bottom_db.client.drop_database(self._top_db.name)

-    def get_collection(self, coll_name: str):
-        r"""Returns a reference to the specified collection."""
-        try:
-            return self._top_db[coll_name]
-        except OperationFailure as e:
-            raise OverlayDBError(str(e.details))
-
     def replace_or_insert_many(self, coll_name, documents: list):
         try:
             self._top_db[coll_name].insert_many(documents)
@@ -697,69 +697,65 @@ def validate_json(

     # Third pass (if enabled): Check inter-document references.
     if check_inter_document_references is True:
-        # Insert all documents specified for all collections specified, into the OverlayDB.
+        # Prepare to use `refscan`.
         #
-        # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to,
-        #       instead, validate it after processing _each_ collection, we would get a false positive if a document
-        #       inserted into an earlier-processed collection happened to reference a document slated for insertion
-        #       into a later-processed collection. By waiting until all documents in all collections specified have
-        #       been inserted, we avoid that situation.
+        # Note: We check the inter-document references in two stages, which are:
+        #       1. For each document in the JSON payload, check whether each document it references already exists
+        #          (in the collections the schema says it can exist in) in the database. We use the
+        #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+        #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+        #          check whether that document exists (in the collections the schema says it can exist in) in the
+        #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+        #       The violations that remain after those two stages are the ones we return to the caller.
         #
-        with OverlayDB(mdb) as overlay_db:
-            print(f"Inserting documents into the OverlayDB.")
-            for collection_name, documents_to_insert in docs.items():
-                # Insert the documents into the OverlayDB.
-                #
-                # Note: The `isinstance(..., list)` check is here to work around the fact that the previous
-                #       validation stages allow for the request payload to specify a collection named "@type" whose
-                #       value is a string, as opposed to a list of dictionaries.
-                #
-                #       I don't know why those stages do that. I posed the question in this GitHub Discussion:
-                #       https://github.com/microbiomedata/nmdc-runtime/discussions/858
-                #
-                #       The `len(...) > 0` check is here because pymongo complains when `insert_many` is called
-                #       with an empty list.
-                #
-                if (
-                    isinstance(documents_to_insert, list)
-                    and len(documents_to_insert) > 0
-                ):
-                    try:
-                        overlay_db.replace_or_insert_many(
-                            collection_name, documents_to_insert
-                        )
-                    except OverlayDBError as error:
-                        validation_errors[collection_name].append(str(error))
-
-            # Now that the OverlayDB contains all the specified documents, we will check whether
-            # every document referenced by any of the inserted documents exists.
-            finder = Finder(database=overlay_db)
-            references = get_allowed_references()
-            reference_field_names_by_source_class_name = (
-                references.get_reference_field_names_by_source_class_name()
-            )
-            for source_collection_name, documents_inserted in docs.items():
-                # If `documents_inserted` is not a list (which is a scenario that the previous validation stages
-                # allow), abort processing this collection and proceed to processing the next collection.
-                if not isinstance(documents_inserted, list):
-                    continue
-
-                # Check the referential integrity of the replaced or inserted documents.
-                print(
-                    f"Checking references emanating from documents inserted into '{source_collection_name}'."
-                )
+        # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_ is that the `OverlayDB`
+        #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+        #       is not a drop-in replacement for pymongo's `Database` class, which is the only thing that
+        #       `refscan`'s `Finder` class accepts.
+        #
+        finder = Finder(database=mdb)
+        references = get_allowed_references()
+        reference_field_names_by_source_class_name = (
+            references.get_reference_field_names_by_source_class_name()
+        )
+
+        # Iterate over the collections in the JSON payload.
+        for source_collection_name, documents in in_docs.items():
+            for document in documents:
+                # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                source_document = dict(document, _id=None)
+                violations = scan_outgoing_references(
+                    document=source_document,
+                    schema_view=nmdc_schema_view(),
+                    reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
+                    references=references,
+                    finder=finder,
+                    collection_names=nmdc_database_collection_names(),
+                    source_collection_name=source_collection_name,
+                    user_wants_to_locate_misplaced_documents=False,
+                )
-                for document in documents_inserted:
-                    violations = scan_outgoing_references(
-                        document=document,
-                        schema_view=nmdc_schema_view(),
-                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
-                        references=references,
-                        finder=finder,
-                        collection_names=nmdc_database_collection_names(),
-                        source_collection_name=source_collection_name,
-                        user_wants_to_locate_misplaced_documents=False,
-                    )
-                    for violation in violations:
+
+                # For each violation, check whether the referenced document is in the JSON payload, itself.
+                for violation in violations:
+                    can_waive_violation = False
+                    # Determine which collections can contain the referenced document, based upon
+                    # the schema class of which this source document is an instance.
+                    target_collection_names = (
+                        references.get_target_collection_names(
+                            source_class_name=violation.source_class_name,
+                            source_field_name=violation.source_field_name,
+                        )
+                    )
+                    # Check whether the referenced document exists in any of those collections in the JSON payload.
+                    for json_coll_name, json_coll_docs in in_docs.items():
+                        if json_coll_name in target_collection_names:
+                            for json_coll_doc in json_coll_docs:
+                                if json_coll_doc["id"] == violation.target_id:
+                                    can_waive_violation = True
+                                    break  # stop checking
+                        if can_waive_violation:
+                            break  # stop checking
+                    if not can_waive_violation:
+                        violation_as_str = (
+                            f"Document '{violation.source_document_id}' "
+                            f"in collection '{violation.source_collection_name}' "
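To make the two-stage check above concrete, here is a self-contained sketch of the stage-2 "waiving" logic in isolation. `FakeViolation` and all of the values are stand-ins for `refscan`'s real objects, not code from this commit:

from dataclasses import dataclass


@dataclass
class FakeViolation:
    target_id: str  # id of a referenced document that stage 1 did not find in the database


# Stage 1 (simulated): two outgoing references were not satisfied by the database.
violations = [FakeViolation("nmdc:sty-00-000008"), FakeViolation("nmdc:sty-00-000009")]

# The JSON payload itself introduces one of the two referenced documents.
in_docs = {"study_set": [{"id": "nmdc:sty-00-000009", "type": "nmdc:Study"}]}
target_collection_names = ["study_set"]  # normally derived from the NMDC schema

# Stage 2: waive any violation whose target document appears in the payload.
remaining = [
    violation
    for violation in violations
    if not any(
        doc["id"] == violation.target_id
        for coll_name, coll_docs in in_docs.items()
        if coll_name in target_collection_names
        for doc in coll_docs
    )
]
print([v.target_id for v in remaining])  # ['nmdc:sty-00-000008']

Only the reference satisfied by neither the database nor the payload survives as an error, which is exactly what the new tests below assert.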
47 changes: 46 additions & 1 deletion tests/test_the_util/test_the_util.py
@@ -6,7 +6,7 @@
 # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
 #      by issuing the following command from the root directory of the repository within the `fastapi` container:
 #      ```
-#      $ pytest tests/test_the_util/test_the_util.py
+#      $ pytest -vv tests/test_the_util/test_the_util.py
 #      ```

# Define a reusable dictionary that matches the value the `validate_json` function
@@ -148,6 +148,23 @@ def test_validate_json_does_not_check_references_if_documents_are_schema_defiant
     assert len(result["detail"]["study_set"]) == 1  # not 2


+def test_validate_json_reports_multiple_broken_references_emanating_from_single_document(db):
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": ["nmdc:sty-00-000008", "nmdc:sty-00-000009"],  # identifies 2 non-existent studies
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=db, **check_refs)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 2
+
+
 def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db):
     r"""
     Note: This test targets the scenario where a single payload introduces both the source document and target document
@@ -170,3 +187,31 @@ def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes
         ]
     }
     assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+
+def test_validate_json_considers_existing_documents_when_checking_references(db):
+    r"""
+    Note: This test focuses on the case where the database already contains the to-be-referenced document,
+          as opposed to the to-be-referenced document being introduced via the same request payload.
+          For that reason, we seed the database before calling the function-under-test.
+    """
+    existing_study_id = "nmdc:sty-00-000001"
+
+    db.get_collection("study_set").replace_one(
+        {"id": existing_study_id},
+        {"id": existing_study_id, "type": "nmdc:Study", "study_category": "research_study"},
+        upsert=True,
+    )
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": [existing_study_id],  # identifies the existing study
+            },
+        ]
+    }
+    assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+    db.get_collection("study_set").delete_one({"id": existing_study_id})
