From d540afbf6d965855ee275d0b5a9aa8274ce11d3f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:26:13 -0800 Subject: [PATCH 1/8] Fix bug where `validate_json` ref. int. check ignored existing documents --- nmdc_runtime/util.py | 108 ++++++++++++--------------- tests/test_the_util/test_the_util.py | 47 +++++++++++- 2 files changed, 95 insertions(+), 60 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index c93ba35c..81f1fdc2 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -697,69 +697,59 @@ def validate_json( # Third pass (if enabled): Check inter-document references. if check_inter_document_references is True: - # Insert all documents specified for all collections specified, into the OverlayDB. + # Prepare to use `refscan`. # - # Note: This will allow us to validate referential integrity in the database's _final_ state. If we were to, - # instead, validate it after processing _each_ collection, we would get a false positive if a document - # inserted into an earlier-processed collection happened to reference a document slated for insertion - # into a later-processed collection. By waiting until all documents in all collections specified have - # been inserted, we avoid that situation. + # Note: We check the inter-document references in two stages, which are: + # 1. For each document in the JSON payload, check whether each document it references already exists + # (in the collections the schema says it can exist in) in the database. We use the + # `refscan` package to do this, which returns violation details we'll use in the second stage. + # 2. For each violation found in the first stage (i.e. each reference to a not-found document), we + # check whether that document exists (in the collections the schema says it can exist in) in the + # JSON payload. If it does, then we "waive" (i.e. discard) that violation. + # The violations that remain after those two stages are the ones we return to the caller. # - with OverlayDB(mdb) as overlay_db: - print(f"Inserting documents into the OverlayDB.") - for collection_name, documents_to_insert in docs.items(): - # Insert the documents into the OverlayDB. - # - # Note: The `isinstance(..., list)` check is here to work around the fact that the previous - # validation stages allow for the request payload to specify a collection named "@type" whose - # value is a string, as opposed to a list of dictionaries. - # - # I don't know why those stages do that. I posed the question in this GitHub Discussion: - # https://github.com/microbiomedata/nmdc-runtime/discussions/858 - # - # The `len(...) > 0` check is here because pymongo complains when `insert_many` is called - # with an empty list. - # - if ( - isinstance(documents_to_insert, list) - and len(documents_to_insert) > 0 - ): - try: - overlay_db.replace_or_insert_many( - collection_name, documents_to_insert - ) - except OverlayDBError as error: - validation_errors[collection_name].append(str(error)) - - # Now that the OverlayDB contains all the specified documents, we will check whether - # every document referenced by any of the inserted documents exists. - finder = Finder(database=overlay_db) - references = get_allowed_references() - reference_field_names_by_source_class_name = ( - references.get_reference_field_names_by_source_class_name() - ) - for source_collection_name, documents_inserted in docs.items(): - # If `documents_inserted` is not a list (which is a scenario that the previous validation stages - # allow), abort processing this collection and proceed to processing the next collection. - if not isinstance(documents_inserted, list): - continue - - # Check the referential integrity of the replaced or inserted documents. - print( - f"Checking references emanating from documents inserted into '{source_collection_name}'." + finder = Finder(database=mdb) + references = get_allowed_references() + reference_field_names_by_source_class_name = ( + references.get_reference_field_names_by_source_class_name() + ) + + # Iterate over the collections in the JSON payload. + for source_collection_name, documents in in_docs.items(): + print(f"Checking inter-document references for {source_collection_name}") + for document in documents: + source_document = dict(document, _id=None) # adds `_id` field, since `refscan` requires it to exist + violations = scan_outgoing_references( + document=source_document, + schema_view=nmdc_schema_view(), + reference_field_names_by_source_class_name=reference_field_names_by_source_class_name, + references=references, + finder=finder, + collection_names=nmdc_database_collection_names(), + source_collection_name=source_collection_name, + user_wants_to_locate_misplaced_documents=False, ) - for document in documents_inserted: - violations = scan_outgoing_references( - document=document, - schema_view=nmdc_schema_view(), - reference_field_names_by_source_class_name=reference_field_names_by_source_class_name, - references=references, - finder=finder, - collection_names=nmdc_database_collection_names(), - source_collection_name=source_collection_name, - user_wants_to_locate_misplaced_documents=False, + + # For each violation, check whether the misplaced document is in the JSON payload, itself. + for violation in violations: + print(f"Checking whether inter-document referential integrity violation can be waived.") + can_waive_violation = False + # Determine which collections can contain the referenced document, based upon + # the schema class of which this source document is an instance. + target_collection_names = references.get_target_collection_names( + source_class_name=violation.source_class_name, + source_field_name=violation.source_field_name, ) - for violation in violations: + # Check whether the referenced document exists in any of those collections in the JSON payload. + for in_collection_name, in_collection_docs in in_docs.items(): + if in_collection_name in target_collection_names: + for in_collection_doc in in_collection_docs: + if in_collection_doc.get("id") == violation.target_id: + can_waive_violation = True + break # stop checking + if can_waive_violation: + break # stop checking + if not can_waive_violation: violation_as_str = ( f"Document '{violation.source_document_id}' " f"in collection '{violation.source_collection_name}' " diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index cf2ea291..3de72d36 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -6,7 +6,7 @@ # Tip: At the time of this writing, you can run the tests in this file without running other tests in this repo, # by issuing the following command from the root directory of the repository within the `fastapi` container: # ``` -# $ pytest tests/test_the_util/test_the_util.py +# $ pytest -vv tests/test_the_util/test_the_util.py # ``` # Define a reusable dictionary that matches the value the `validate_json` function @@ -148,6 +148,23 @@ def test_validate_json_does_not_check_references_if_documents_are_schema_defiant assert len(result["detail"]["study_set"]) == 1 # not 2 +def test_validate_json_reports_multiple_broken_references_emanating_from_single_document(db): + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000001", + "type": "nmdc:Study", + "study_category": "research_study", + "part_of": ["nmdc:sty-00-000008", "nmdc:sty-00-000009"], # identifies 2 non-existent studies + }, + ] + } + result = validate_json(in_docs=database_dict, mdb=db, **check_refs) + assert result["result"] == "errors" + assert "study_set" in result["detail"] + assert len(result["detail"]["study_set"]) == 2 + + def test_validate_json_checks_referential_integrity_after_applying_all_collections_changes(db): r""" Note: This test targets the scenario where a single payload introduces both the source document and target document @@ -170,3 +187,31 @@ def test_validate_json_checks_referential_integrity_after_applying_all_collectio ] } assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + + +def test_validate_json_considers_existing_documents_when_checking_references(db): + r""" + Note: This test focuses on the case where the database already contains the to-be-referenced document; + as opposed to the to-be-referenced document being introduced via the same request payload. + For that reason, we will seed the database before calling the function-under-test. + """ + existing_study_id = "nmdc:sty-00-000001" + + db.get_collection("study_set").replace_one( + {"id": existing_study_id}, + {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"}, + upsert=True + ) + database_dict = { + "study_set": [ + { + "id": "nmdc:sty-00-000002", + "type": "nmdc:Study", + "study_category": "research_study", + "part_of": [existing_study_id], # identifies the existing study + }, + ] + } + assert validate_json(in_docs=database_dict, mdb=db, **check_refs) == ok_result + + db.get_collection("study_set").delete_one({"id": existing_study_id}) From 08b6fefef7f87ab1f11f765da3b8cfe8f7ddce75 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:27:48 -0800 Subject: [PATCH 2/8] Delete defective class method, which is also now obsolete --- nmdc_runtime/util.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 81f1fdc2..0ad164bc 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -570,13 +570,6 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._bottom_db.client.drop_database(self._top_db.name) - def get_collection(self, coll_name: str): - r"""Returns a reference to the specified collection.""" - try: - return self._top_db[coll_name] - except OperationFailure as e: - raise OverlayDBError(str(e.details)) - def replace_or_insert_many(self, coll_name, documents: list): try: self._top_db[coll_name].insert_many(documents) From 592d837d00fdbd3c169ac248203f4b83ab530458 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 16 Jan 2025 05:39:05 +0000 Subject: [PATCH 3/8] style: reformat --- nmdc_runtime/util.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 0ad164bc..35e5ecfb 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -709,9 +709,13 @@ def validate_json( # Iterate over the collections in the JSON payload. for source_collection_name, documents in in_docs.items(): - print(f"Checking inter-document references for {source_collection_name}") + print( + f"Checking inter-document references for {source_collection_name}" + ) for document in documents: - source_document = dict(document, _id=None) # adds `_id` field, since `refscan` requires it to exist + source_document = dict( + document, _id=None + ) # adds `_id` field, since `refscan` requires it to exist violations = scan_outgoing_references( document=source_document, schema_view=nmdc_schema_view(), @@ -725,19 +729,26 @@ def validate_json( # For each violation, check whether the misplaced document is in the JSON payload, itself. for violation in violations: - print(f"Checking whether inter-document referential integrity violation can be waived.") + print( + f"Checking whether inter-document referential integrity violation can be waived." + ) can_waive_violation = False # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. - target_collection_names = references.get_target_collection_names( - source_class_name=violation.source_class_name, - source_field_name=violation.source_field_name, + target_collection_names = ( + references.get_target_collection_names( + source_class_name=violation.source_class_name, + source_field_name=violation.source_field_name, + ) ) # Check whether the referenced document exists in any of those collections in the JSON payload. for in_collection_name, in_collection_docs in in_docs.items(): if in_collection_name in target_collection_names: for in_collection_doc in in_collection_docs: - if in_collection_doc.get("id") == violation.target_id: + if ( + in_collection_doc.get("id") + == violation.target_id + ): can_waive_violation = True break # stop checking if can_waive_violation: From 2f65bbec451ea4cb409cf124fc935cc82a546dae Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 15 Jan 2025 21:44:51 -0800 Subject: [PATCH 4/8] Simplify variable names and remove print statements --- nmdc_runtime/util.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 35e5ecfb..eafdf274 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -709,13 +709,9 @@ def validate_json( # Iterate over the collections in the JSON payload. for source_collection_name, documents in in_docs.items(): - print( - f"Checking inter-document references for {source_collection_name}" - ) for document in documents: - source_document = dict( - document, _id=None - ) # adds `_id` field, since `refscan` requires it to exist + # Add an `_id` field to the document, since `refscan` requires the document to have one. + source_document = dict(document, _id=None) violations = scan_outgoing_references( document=source_document, schema_view=nmdc_schema_view(), @@ -729,9 +725,6 @@ def validate_json( # For each violation, check whether the misplaced document is in the JSON payload, itself. for violation in violations: - print( - f"Checking whether inter-document referential integrity violation can be waived." - ) can_waive_violation = False # Determine which collections can contain the referenced document, based upon # the schema class of which this source document is an instance. @@ -742,13 +735,10 @@ def validate_json( ) ) # Check whether the referenced document exists in any of those collections in the JSON payload. - for in_collection_name, in_collection_docs in in_docs.items(): - if in_collection_name in target_collection_names: - for in_collection_doc in in_collection_docs: - if ( - in_collection_doc.get("id") - == violation.target_id - ): + for json_coll_name, json_coll_docs in in_docs.items(): + if json_coll_name in target_collection_names: + for json_coll_doc in json_coll_docs: + if json_coll_doc["id"] == violation.target_id: can_waive_violation = True break # stop checking if can_waive_violation: From 236c8115dc8e57f412367de645cfa7cab955e24f Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 00:22:27 -0800 Subject: [PATCH 5/8] Replace literal string with variable to emphasize sameness --- tests/test_the_util/test_the_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_the_util/test_the_util.py b/tests/test_the_util/test_the_util.py index 3de72d36..0fdc09a2 100644 --- a/tests/test_the_util/test_the_util.py +++ b/tests/test_the_util/test_the_util.py @@ -199,7 +199,7 @@ def test_validate_json_considers_existing_documents_when_checking_references(db) db.get_collection("study_set").replace_one( {"id": existing_study_id}, - {"id": "nmdc:sty-00-000001", "type": "nmdc:Study", "study_category": "research_study"}, + {"id": existing_study_id, "type": "nmdc:Study", "study_category": "research_study"}, upsert=True ) database_dict = { From 4ce19fab5ee910f0967f714b8e235198953152f6 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 11:47:01 -0800 Subject: [PATCH 6/8] Document limitation of `OverlayDB` based upon my recent misunderstanding --- nmdc_runtime/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index eafdf274..9911e024 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -542,6 +542,13 @@ class OverlayDB(AbstractContextManager): overlay collection, that id is marked as "seen" and will not also be returned when subsequently scanning the (unmodified) base-database collection. + Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged" + database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via + `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to + the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the + "merging" just-in-time to process the method invocation. You can see an example of this in the implementation + of the `merge_find` method, which internally accesses both the real database and the overlaying database. + Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected documents from a base collection to the overlay, and then applying the updates to the overlay, so that again, base collections are unmodified, and a "merge_find" call will produce a result From 22943236a98b841d218b05b9d1370064005e7950 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 11:55:25 -0800 Subject: [PATCH 7/8] Add comment explaining rationale for not using `OverlayDB` class --- nmdc_runtime/util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 9911e024..c393ccca 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -708,6 +708,11 @@ def validate_json( # JSON payload. If it does, then we "waive" (i.e. discard) that violation. # The violations that remain after those two stages are the ones we return to the caller. # + # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB` + # does not provide a means to perform arbitrary queries against its virtual "merged" database. It + # is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that + # `refscan`'s `Finder` class accepts. + # finder = Finder(database=mdb) references = get_allowed_references() reference_field_names_by_source_class_name = ( From 5ab16c186a801e94508a2470f7a7522247c1b88e Mon Sep 17 00:00:00 2001 From: eecavanna Date: Thu, 16 Jan 2025 12:09:46 -0800 Subject: [PATCH 8/8] Add user-facing documentation about `/metadata/json:validate` behavior --- nmdc_runtime/api/endpoints/metadata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/api/endpoints/metadata.py b/nmdc_runtime/api/endpoints/metadata.py index 9b6288fe..ea89027d 100644 --- a/nmdc_runtime/api/endpoints/metadata.py +++ b/nmdc_runtime/api/endpoints/metadata.py @@ -168,10 +168,13 @@ def iter_grid_out(): @router.post("/metadata/json:validate", name="Validate JSON") async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)): - """ - + r""" Validate a NMDC JSON Schema "nmdc:Database" object. + This API endpoint validates the JSON payload in two steps. The first step is to check the format of each document + (e.g., the presence, name, and value of each field). If it encounters any violations during that step, it will not + proceed to the second step. The second step is to check whether all documents referenced by the document exist, + whether in the database or the same JSON payload. We call the second step a "referential integrity check." """ return validate_json(docs, mdb, check_inter_document_references=True)