From d540afbf6d965855ee275d0b5a9aa8274ce11d3f Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 15 Jan 2025 21:26:13 -0800
Subject: [PATCH 1/8] Fix bug where `validate_json` ref. int. check ignored
 existing documents

---
 nmdc_runtime/util.py                 | 108 ++++++++++++---------------
 tests/test_the_util/test_the_util.py |  47 +++++++++++-
 2 files changed, 95 insertions(+), 60 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index c93ba35c..81f1fdc2 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -697,69 +697,59 @@ def validate_json(
 
         # Third pass (if enabled): Check inter-document references.
         if check_inter_document_references is True:
-            # Insert all documents specified for all collections specified, into the OverlayDB.
+            # Prepare to use `refscan`.
             #
-            # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to,
-            #       instead, validate it after processing _each_ collection, we would get a false positive if a document
-            #       inserted into an earlier-processed collection happened to reference a document slated for insertion
-            #       into a later-processed collection. By waiting until all documents in all collections specified have
-            #       been inserted, we avoid that situation.
+            # Note: We check the inter-document references in two stages, which are:
+            #       1. For each document in the JSON payload, check whether each document it references already exists
+            #          (in the collections the schema says it can exist in) in the database. We use the
+            #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+            #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+            #          check whether that document exists (in the collections the schema says it can exist in) in the
+            #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+            #       The violations that remain after those two stages are the ones we return to the caller.
             #
-            with OverlayDB(mdb) as overlay_db:
-                print(f"Inserting documents into the OverlayDB.")
-                for collection_name, documents_to_insert in docs.items():
-                    # Insert the documents into the OverlayDB.
-                    #
-                    # Note: The `isinstance(..., list)` check is here to work around the fact that the previous
-                    #       validation stages allow for the request payload to specify a collection named "@type" whose
-                    #       value is a string, as opposed to a list of dictionaries.
-                    #
-                    #       I don't know why those stages do that. I posed the question in this GitHub Discussion:
-                    #       https://github.com/microbiomedata/nmdc-runtime/discussions/858
-                    #
-                    #       The `len(...) > 0` check is here because pymongo complains when `insert_many` is called
-                    #       with an empty list.
-                    #
-                    if (
-                        isinstance(documents_to_insert, list)
-                        and len(documents_to_insert) > 0
-                    ):
-                        try:
-                            overlay_db.replace_or_insert_many(
-                                collection_name, documents_to_insert
-                            )
-                        except OverlayDBError as error:
-                            validation_errors[collection_name].append(str(error))
-
-                # Now that the OverlayDB contains all the specified documents, we will check whether
-                # every document referenced by any of the inserted documents exists.
-                finder = Finder(database=overlay_db)
-                references = get_allowed_references()
-                reference_field_names_by_source_class_name = (
-                    references.get_reference_field_names_by_source_class_name()
-                )
-                for source_collection_name, documents_inserted in docs.items():
-                    # If `documents_inserted` is not a list (which is a scenario that the previous validation stages
-                    # allow), abort processing this collection and proceed to processing the next collection.
-                    if not isinstance(documents_inserted, list):
-                        continue
-
-                    # Check the referential integrity of the replaced or inserted documents.
-                    print(
-                        f"Checking references emanating from documents inserted into '{source_collection_name}'."
+            finder = Finder(database=mdb)
+            references = get_allowed_references()
+            reference_field_names_by_source_class_name = (
+                references.get_reference_field_names_by_source_class_name()
+            )
+
+            # Iterate over the collections in the JSON payload.
+            for source_collection_name, documents in in_docs.items():
+                print(f"Checking inter-document references for {source_collection_name}")
+                for document in documents:
+                    source_document = dict(document, _id=None)  # adds `_id` field, since `refscan` requires it to exist
+                    violations = scan_outgoing_references(
+                        document=source_document,
+                        schema_view=nmdc_schema_view(),
+                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
+                        references=references,
+                        finder=finder,
+                        collection_names=nmdc_database_collection_names(),
+                        source_collection_name=source_collection_name,
+                        user_wants_to_locate_misplaced_documents=False,
                     )
-                    for document in documents_inserted:
-                        violations = scan_outgoing_references(
-                            document=document,
-                            schema_view=nmdc_schema_view(),
-                            reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
-                            references=references,
-                            finder=finder,
-                            collection_names=nmdc_database_collection_names(),
-                            source_collection_name=source_collection_name,
-                            user_wants_to_locate_misplaced_documents=False,
+
+                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                    for violation in violations:
+                        print(f"Checking whether inter-document referential integrity violation can be waived.")
+                        can_waive_violation = False
+                        # Determine which collections can contain the referenced document, based upon
+                        # the schema class of which this source document is an instance.
+                        target_collection_names = references.get_target_collection_names(
+                            source_class_name=violation.source_class_name,
+                            source_field_name=violation.source_field_name,
                         )
-                        for violation in violations:
+                        # Check whether the referenced document exists in any of those collections in the JSON payload.
+                        for in_collection_name, in_collection_docs in in_docs.items():
+                            if in_collection_name in target_collection_names:
+                                for in_collection_doc in in_collection_docs:
+                                    if in_collection_doc.get("id") == violation.target_id:
+                                        can_waive_violation = True
+                                        break  # stop checking
+                            if can_waive_violation:
+                                break  # stop checking
+                        if not can_waive_violation:
                             violation_as_str = (
                                 f"Document '{violation.source_document_id}' "
                                 f"in collection '{violation.source_collection_name}' "
diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py
index cf2ea291..3de72d36 100644
--- a/tests/test_the_util/test_the_util.py
+++ b/tests/test_the_util/test_the_util.py
@@ -6,7 +6,7 @@
 # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo,
 #      by issuing the following command from the root directory of the repository within the `fastapi` container:
 #      ```
-#      $ pytest tests/test_the_util/test_the_util.py
+#      $ pytest -vv tests/test_the_util/test_the_util.py
 #      ```
 
 # Define a reusable dictionary that matches the value the `validate_json` function
@@ -148,6 +148,23 @@ def test_validate_json_does_not_check_references_if_documents_are_schema_defiant
     assert len(result["detail"]["study_set"]) == 1  # not 2
 
 
+def test_validate_json_reports_multiple_broken_references_emanating_from_single_document(db):
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000001",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": ["nmdc:sty-00-000008", "nmdc:sty-00-000009"],  # identifies 2 non-existent studies
+            },
+        ]
+    }
+    result = validate_json(in_docs=database_dict, mdb=db, **check_refs)
+    assert result["result"] == "errors"
+    assert "study_set" in result["detail"]
+    assert len(result["detail"]["study_set"]) == 2
+
+
 def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db):
     r"""
     Note: This test targets the scenario where a single payload introduces both the source document and target document
@@ -170,3 +187,31 @@ def test_validate_json_checks_referential_integrity_after_applying_all_collectio
         ]
     }
     assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+
+def test_validate_json_considers_existing_documents_when_checking_references(db):
+    r"""
+    Note: This test focuses on the case where the database already contains the to-be-referenced document;
+          as opposed to the to-be-referenced document being introduced via the same request payload.
+          For that reason, we will seed the database before calling the function-under-test.
+    """
+    existing_study_id = "nmdc:sty-00-000001"
+
+    db.get_collection("study_set").replace_one(
+        {"id": existing_study_id},
+        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"},
+        upsert=True
+    )
+    database_dict = {
+        "study_set": [
+            {
+                "id": "nmdc:sty-00-000002",
+                "type": "nmdc:Study",
+                "study_category": "research_study",
+                "part_of": [existing_study_id],  # identifies the existing study
+            },
+        ]
+    }
+    assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result
+
+    db.get_collection("study_set").delete_one({"id": existing_study_id})

From 08b6fefef7f87ab1f11f765da3b8cfe8f7ddce75 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 15 Jan 2025 21:27:48 -0800
Subject: [PATCH 2/8] Delete defective class method, which is also now obsolete

---
 nmdc_runtime/util.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 81f1fdc2..0ad164bc 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -570,13 +570,6 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._bottom_db.client.drop_database(self._top_db.name)
 
-    def get_collection(self, coll_name: str):
-        r"""Returns a reference to the specified collection."""
-        try:
-            return self._top_db[coll_name]
-        except OperationFailure as e:
-            raise OverlayDBError(str(e.details))
-
     def replace_or_insert_many(self, coll_name, documents: list):
         try:
             self._top_db[coll_name].insert_many(documents)

From 592d837d00fdbd3c169ac248203f4b83ab530458 Mon Sep 17 00:00:00 2001
From: github-actions <github-actions@github.com>
Date: Thu, 16 Jan 2025 05:39:05 +0000
Subject: [PATCH 3/8] style: reformat

---
 nmdc_runtime/util.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 0ad164bc..35e5ecfb 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -709,9 +709,13 @@ def validate_json(
 
             # Iterate over the collections in the JSON payload.
             for source_collection_name, documents in in_docs.items():
-                print(f"Checking inter-document references for {source_collection_name}")
+                print(
+                    f"Checking inter-document references for {source_collection_name}"
+                )
                 for document in documents:
-                    source_document = dict(document, _id=None)  # adds `_id` field, since `refscan` requires it to exist
+                    source_document = dict(
+                        document, _id=None
+                    )  # adds `_id` field, since `refscan` requires it to exist
                     violations = scan_outgoing_references(
                         document=source_document,
                         schema_view=nmdc_schema_view(),
@@ -725,19 +729,26 @@ def validate_json(
 
                     # For each violation, check whether the misplaced document is in the JSON payload, itself.
                     for violation in violations:
-                        print(f"Checking whether inter-document referential integrity violation can be waived.")
+                        print(
+                            f"Checking whether inter-document referential integrity violation can be waived."
+                        )
                         can_waive_violation = False
                         # Determine which collections can contain the referenced document, based upon
                         # the schema class of which this source document is an instance.
-                        target_collection_names = references.get_target_collection_names(
-                            source_class_name=violation.source_class_name,
-                            source_field_name=violation.source_field_name,
+                        target_collection_names = (
+                            references.get_target_collection_names(
+                                source_class_name=violation.source_class_name,
+                                source_field_name=violation.source_field_name,
+                            )
                         )
                         # Check whether the referenced document exists in any of those collections in the JSON payload.
                         for in_collection_name, in_collection_docs in in_docs.items():
                             if in_collection_name in target_collection_names:
                                 for in_collection_doc in in_collection_docs:
-                                    if in_collection_doc.get("id") == violation.target_id:
+                                    if (
+                                        in_collection_doc.get("id")
+                                        == violation.target_id
+                                    ):
                                         can_waive_violation = True
                                         break  # stop checking
                             if can_waive_violation:

From 2f65bbec451ea4cb409cf124fc935cc82a546dae Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Wed, 15 Jan 2025 21:44:51 -0800
Subject: [PATCH 4/8] Simplify variable names and remove print statements

---
 nmdc_runtime/util.py | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 35e5ecfb..eafdf274 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -709,13 +709,9 @@ def validate_json(
 
             # Iterate over the collections in the JSON payload.
             for source_collection_name, documents in in_docs.items():
-                print(
-                    f"Checking inter-document references for {source_collection_name}"
-                )
                 for document in documents:
-                    source_document = dict(
-                        document, _id=None
-                    )  # adds `_id` field, since `refscan` requires it to exist
+                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                    source_document = dict(document, _id=None)
                     violations = scan_outgoing_references(
                         document=source_document,
                         schema_view=nmdc_schema_view(),
@@ -729,9 +725,6 @@ def validate_json(
 
                     # For each violation, check whether the misplaced document is in the JSON payload, itself.
                     for violation in violations:
-                        print(
-                            f"Checking whether inter-document referential integrity violation can be waived."
-                        )
                         can_waive_violation = False
                         # Determine which collections can contain the referenced document, based upon
                         # the schema class of which this source document is an instance.
@@ -742,13 +735,10 @@ def validate_json(
                             )
                         )
                         # Check whether the referenced document exists in any of those collections in the JSON payload.
-                        for in_collection_name, in_collection_docs in in_docs.items():
-                            if in_collection_name in target_collection_names:
-                                for in_collection_doc in in_collection_docs:
-                                    if (
-                                        in_collection_doc.get("id")
-                                        == violation.target_id
-                                    ):
+                        for json_coll_name, json_coll_docs in in_docs.items():
+                            if json_coll_name in target_collection_names:
+                                for json_coll_doc in json_coll_docs:
+                                    if json_coll_doc["id"] == violation.target_id:
                                         can_waive_violation = True
                                         break  # stop checking
                             if can_waive_violation:

From 236c8115dc8e57f412367de645cfa7cab955e24f Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 16 Jan 2025 00:22:27 -0800
Subject: [PATCH 5/8] Replace literal string with variable to emphasize
 sameness

---
 tests/test_the_util/test_the_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py
index 3de72d36..0fdc09a2 100644
--- a/tests/test_the_util/test_the_util.py
+++ b/tests/test_the_util/test_the_util.py
@@ -199,7 +199,7 @@ def test_validate_json_considers_existing_documents_when_checking_references(db)
 
     db.get_collection("study_set").replace_one(
         {"id": existing_study_id},
-        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"},
+        {"id": existing_study_id, "type": "nmdc:Study", "study_category": "research_study"},
         upsert=True
     )
     database_dict = {

From 4ce19fab5ee910f0967f714b8e235198953152f6 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 16 Jan 2025 11:47:01 -0800
Subject: [PATCH 6/8] Document limitation of `OverlayDB` based upon my recent
 misunderstanding

---
 nmdc_runtime/util.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index eafdf274..9911e024 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -542,6 +542,13 @@ class OverlayDB(AbstractContextManager):
     overlay collection, that id is marked as "seen" and will not also be returned when
     subsequently scanning the (unmodified) base-database collection.
 
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+          `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+          the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+          "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+          of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
     documents from a base collection to the overlay, and then applying the updates to the overlay,
     so that again, base collections are unmodified, and a "merge_find" call will produce a result

From 22943236a98b841d218b05b9d1370064005e7950 Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 16 Jan 2025 11:55:25 -0800
Subject: [PATCH 7/8] Add comment explaining rationale for not using
 `OverlayDB` class

---
 nmdc_runtime/util.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py
index 9911e024..c393ccca 100644
--- a/nmdc_runtime/util.py
+++ b/nmdc_runtime/util.py
@@ -708,6 +708,11 @@ def validate_json(
             #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
             #       The violations that remain after those two stages are the ones we return to the caller.
             #
+            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+            #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
+            #       `refscan`'s `Finder` class accepts.
+            #
             finder = Finder(database=mdb)
             references = get_allowed_references()
             reference_field_names_by_source_class_name = (

From 5ab16c186a801e94508a2470f7a7522247c1b88e Mon Sep 17 00:00:00 2001
From: eecavanna <eecavanna@users.noreply.github.com>
Date: Thu, 16 Jan 2025 12:09:46 -0800
Subject: [PATCH 8/8] Add user-facing documentation about
 `/metadata/json:validate` behavior

---
 nmdc_runtime/api/endpoints/metadata.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py
index 9b6288fe..ea89027d 100644
--- a/nmdc_runtime/api/endpoints/metadata.py
+++ b/nmdc_runtime/api/endpoints/metadata.py
@@ -168,10 +168,13 @@ def iter_grid_out():
 
 @router.post("/metadata/json:validate", name="Validate JSON")
 async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)):
-    """
-
+    r"""
     Validate a NMDC JSON Schema "nmdc:Database" object.
 
+    This API endpoint validates the JSON payload in two steps. The first step is to check the format of each document
+    (e.g., the presence, name, and value of each field). If it encounters any violations during that step, it will not
+    proceed to the second step. The second step is to check whether all documents referenced by the document exist,
+    whether in the database or the same JSON payload. We call the second step a "referential integrity check."
     """
 
     return validate_json(docs, mdb, check_inter_document_references=True)