From 3fe47a0d7794e5933015a7e274c2ea3026c746b4 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 26 Dec 2024 12:56:58 -0800 Subject: [PATCH 1/9] dagster harness for missing records updater --- nmdc_runtime/site/graphs.py | 14 ++++++++++++++ nmdc_runtime/site/ops.py | 32 +++++++++++++++++++++++++++++++- nmdc_runtime/site/resources.py | 2 +- nmdc_runtime/site/util.py | 9 +++++++-- nmdc_runtime/site/workspace.yaml | 2 +- 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index fbdd4549..29f3926b 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -57,6 +57,9 @@ get_ncbi_export_pipeline_inputs, ncbi_submission_xml_from_nmdc_study, ncbi_submission_xml_asset, + get_database_updater_inputs, + nmdc_study_id_filename, + missing_data_generation_repair, ) from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id @@ -467,3 +470,14 @@ def nmdc_study_to_ncbi_submission_export(): all_instruments, ) ncbi_submission_xml_asset(xml_data) + + +@graph +def fill_missing_data_generation_data_object_records(): + study_id = get_database_updater_inputs() + database = missing_data_generation_repair(study_id) + + database_dict = nmdc_schema_object_to_dict(database) + filename = nmdc_study_id_filename(study_id) + outputs = export_json_to_drs(database_dict, filename) + add_output_run_event(outputs) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index a9516c3f..d3283225 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.submission_portal_translator import ( SubmissionPortalTranslator, ) -from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id +from nmdc_runtime.site.repair.database_updater import DatabaseUpdater +from nmdc_runtime.site.util import ( + run_and_log, + schema_collection_has_index_on_id, + nmdc_study_id_to_filename, +) from nmdc_runtime.util import ( drs_object_in_for, get_names_of_classes_in_effective_range_of_slot, @@ -1241,3 +1246,28 @@ def ncbi_submission_xml_from_nmdc_study( all_instruments, ) return ncbi_xml + + +@op +def nmdc_study_id_filename(nmdc_study_id: str) -> str: + filename = nmdc_study_id_to_filename(nmdc_study_id) + return f"missing_database_records_for_{filename}.json" + + +@op( + config_schema={"nmdc_study_id": str}, + out={"nmdc_study_id": Out(str)}, +) +def get_database_updater_inputs(context: OpExecutionContext) -> str: + return context.op_config["nmdc_study_id"] + + +@op(required_resource_keys={"runtime_api_user_client"}) +def missing_data_generation_repair( + context: OpExecutionContext, nmdc_study_id: str +) -> nmdc.Database: + client: RuntimeApiUserClient = context.resources.runtime_api_user_client + database_updater = DatabaseUpdater(client, nmdc_study_id) + database = database_updater.get_database() + + return database diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index 7ceb693d..d827a75d 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -134,7 +134,7 @@ def get_biosamples_for_study(self, study_id: str): f"/queries:run", { "find": "biosample_set", - "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}}, + "filter": {"associated_studies": {"$elemMatch": {"$eq": study_id}}}, }, ) response.raise_for_status() diff --git a/nmdc_runtime/site/util.py b/nmdc_runtime/site/util.py index 4280fe65..1f09cb6d 100644 --- a/nmdc_runtime/site/util.py +++ b/nmdc_runtime/site/util.py @@ -1,8 +1,9 @@ import os -from functools import lru_cache -from subprocess import Popen, PIPE, STDOUT, CalledProcessError +from dagster import op +from functools import lru_cache from pymongo.database import Database as MongoDatabase +from subprocess import Popen, PIPE, STDOUT, CalledProcessError from nmdc_runtime.api.db.mongo import get_collection_names_from_schema from nmdc_runtime.site.resources import mongo_resource @@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict: def get_basename(filename: str) -> str: return os.path.basename(filename) + + +def nmdc_study_id_to_filename(nmdc_study_id: str) -> str: + return nmdc_study_id.replace(":", "_").replace("-", "_") diff --git a/nmdc_runtime/site/workspace.yaml b/nmdc_runtime/site/workspace.yaml index 5da09ab9..531ad21e 100644 --- a/nmdc_runtime/site/workspace.yaml +++ b/nmdc_runtime/site/workspace.yaml @@ -13,7 +13,7 @@ load_from: attribute: biosample_submission_ingest - python_package: package_name: nmdc_runtime.site.repository - attribute: biosample_export + attribute: database_record_repair # - python_package: # package_name: nmdc_runtime.site.repository # attribute: validation From 8af14933bf6e374e1c9973318c7b8f9df7dabc71 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 26 Dec 2024 13:08:42 -0800 Subject: [PATCH 2/9] stub of DatabaseUpdater class --- nmdc_runtime/site/repair/__init__.py | 0 nmdc_runtime/site/repair/database_updater.py | 16 ++++++++++ nmdc_runtime/site/repository.py | 33 ++++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 nmdc_runtime/site/repair/__init__.py create mode 100644 nmdc_runtime/site/repair/database_updater.py diff --git a/nmdc_runtime/site/repair/__init__.py b/nmdc_runtime/site/repair/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py new file mode 100644 index 00000000..28aed0a7 --- /dev/null +++ b/nmdc_runtime/site/repair/database_updater.py @@ -0,0 +1,16 @@ +from nmdc_runtime.site.resources import RuntimeApiUserClient +from nmdc_schema import nmdc + + +class DatabaseUpdater: + def __init__(self, runtime_api_user_client: RuntimeApiUserClient, study_id: str): + self.runtime_api_user_client = runtime_api_user_client + self.study_id = study_id + + def create_missing_dg_records(self): + pass + + def get_database(self) -> nmdc.Database: + database = nmdc.Database() + self.create_missing_dg_records() + return database diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index a1477394..1e4fe10a 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -44,6 +44,7 @@ ingest_neon_surface_water_metadata, ensure_alldocs, nmdc_study_to_ncbi_submission_export, + fill_missing_data_generation_data_object_records, ) from nmdc_runtime.site.resources import ( get_mongo, @@ -922,6 +923,38 @@ def biosample_export(): ] +@repository +def database_record_repair(): + normal_resources = run_config_frozen__normal_env["resources"] + return [ + fill_missing_data_generation_data_object_records.to_job( + resource_defs=resource_defs, + config={ + "resources": merge( + unfreeze(normal_resources), + { + "runtime_api_user_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "username": {"env": "API_ADMIN_USER"}, + "password": {"env": "API_ADMIN_PASS"}, + }, + }, + }, + ), + "ops": { + "get_database_updater_inputs": { + "config": { + "nmdc_study_id": "", + } + }, + "export_json_to_drs": {"config": {"username": ""}}, + }, + }, + ), + ] + + # @repository # def validation(): # graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job] From ae220bed9adb589da3b57d0c081ad3ae5f543136 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 27 Dec 2024 13:59:13 -0800 Subject: [PATCH 3/9] harness updates to accommodate DatabaseUpdater --- nmdc_runtime/site/graphs.py | 6 +++-- nmdc_runtime/site/ops.py | 48 ++++++++++++++++++++++++++------- nmdc_runtime/site/repository.py | 16 +++++++++++ nmdc_runtime/site/resources.py | 12 +++++++++ 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/nmdc_runtime/site/graphs.py b/nmdc_runtime/site/graphs.py index 29f3926b..f2e844e7 100644 --- a/nmdc_runtime/site/graphs.py +++ b/nmdc_runtime/site/graphs.py @@ -474,8 +474,10 @@ def nmdc_study_to_ncbi_submission_export(): @graph def fill_missing_data_generation_data_object_records(): - study_id = get_database_updater_inputs() - database = missing_data_generation_repair(study_id) + (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs() + gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url) + + database = missing_data_generation_repair(study_id, gold_nmdc_instrument_map_df) database_dict = nmdc_schema_object_to_dict(database) filename = nmdc_study_id_filename(study_id) diff --git a/nmdc_runtime/site/ops.py b/nmdc_runtime/site/ops.py index d3283225..019a6b91 100644 --- a/nmdc_runtime/site/ops.py +++ b/nmdc_runtime/site/ops.py @@ -1255,19 +1255,49 @@ def nmdc_study_id_filename(nmdc_study_id: str) -> str: @op( - config_schema={"nmdc_study_id": str}, - out={"nmdc_study_id": Out(str)}, + config_schema={ + "nmdc_study_id": str, + "gold_nmdc_instrument_mapping_file_url": str, + }, + out={ + "nmdc_study_id": Out(str), + "gold_nmdc_instrument_mapping_file_url": Out(str), + }, ) -def get_database_updater_inputs(context: OpExecutionContext) -> str: - return context.op_config["nmdc_study_id"] +def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]: + return ( + context.op_config["nmdc_study_id"], + context.op_config["gold_nmdc_instrument_mapping_file_url"], + ) -@op(required_resource_keys={"runtime_api_user_client"}) +@op( + required_resource_keys={ + "runtime_api_user_client", + "runtime_api_site_client", + "gold_api_client", + } +) def missing_data_generation_repair( - context: OpExecutionContext, nmdc_study_id: str + context: OpExecutionContext, + nmdc_study_id: str, + gold_nmdc_instrument_map_df: pd.DataFrame, ) -> nmdc.Database: - client: RuntimeApiUserClient = context.resources.runtime_api_user_client - database_updater = DatabaseUpdater(client, nmdc_study_id) - database = database_updater.get_database() + runtime_api_user_client: RuntimeApiUserClient = ( + context.resources.runtime_api_user_client + ) + runtime_api_site_client: RuntimeApiSiteClient = ( + context.resources.runtime_api_site_client + ) + gold_api_client: GoldApiClient = context.resources.gold_api_client + + database_updater = DatabaseUpdater( + runtime_api_user_client, + runtime_api_site_client, + gold_api_client, + nmdc_study_id, + gold_nmdc_instrument_map_df, + ) + database = database_updater.create_missing_dg_records() return database diff --git a/nmdc_runtime/site/repository.py b/nmdc_runtime/site/repository.py index 1e4fe10a..ee1bcdbb 100644 --- a/nmdc_runtime/site/repository.py +++ b/nmdc_runtime/site/repository.py @@ -940,12 +940,28 @@ def database_record_repair(): "password": {"env": "API_ADMIN_PASS"}, }, }, + "runtime_api_site_client": { + "config": { + "base_url": {"env": "API_HOST"}, + "client_id": {"env": "API_SITE_CLIENT_ID"}, + "client_secret": {"env": "API_SITE_CLIENT_SECRET"}, + "site_id": {"env": "API_SITE_ID"}, + }, + }, + "gold_api_client": { + "config": { + "base_url": {"env": "GOLD_API_BASE_URL"}, + "username": {"env": "GOLD_API_USERNAME"}, + "password": {"env": "GOLD_API_PASSWORD"}, + }, + }, }, ), "ops": { "get_database_updater_inputs": { "config": { "nmdc_study_id": "", + "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv", } }, "export_json_to_drs": {"config": {"username": ""}}, diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index d827a75d..e382fe52 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -370,6 +370,18 @@ def fetch_study(self, id: str) -> Union[Dict[str, Any], None]: return None return results[0] + def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]: + id = self._normalize_id(biosample_id) + results = self.request("/projects", params={"biosampleGoldId": id}) + return results + + def fetch_biosample_by_biosample_id( + self, biosample_id: str + ) -> List[Dict[str, Any]]: + id = self._normalize_id(biosample_id) + results = self.request("/biosamples", params={"biosampleGoldId": id}) + return results + @resource( config_schema={ From 47320dbdeac4c839694bcdb6cc163e41b46256aa Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Fri, 27 Dec 2024 13:59:44 -0800 Subject: [PATCH 4/9] logic to make DataGeneration records based on GOLD ids --- nmdc_runtime/site/repair/database_updater.py | 102 +++++++++++++++++-- 1 file changed, 96 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 28aed0a7..490240e8 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -1,16 +1,106 @@ -from nmdc_runtime.site.resources import RuntimeApiUserClient +from functools import lru_cache +import pandas as pd +from nmdc_runtime.site.resources import ( + RuntimeApiUserClient, + RuntimeApiSiteClient, + GoldApiClient, +) +from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator from nmdc_schema import nmdc class DatabaseUpdater: - def __init__(self, runtime_api_user_client: RuntimeApiUserClient, study_id: str): + def __init__( + self, + runtime_api_user_client: RuntimeApiUserClient, + runtime_api_site_client: RuntimeApiSiteClient, + gold_api_client: GoldApiClient, + study_id: str, + gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(), + ): self.runtime_api_user_client = runtime_api_user_client + self.runtime_api_site_client = runtime_api_site_client + self.gold_api_client = gold_api_client self.study_id = study_id + self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df + @lru_cache def create_missing_dg_records(self): - pass - - def get_database(self) -> nmdc.Database: database = nmdc.Database() - self.create_missing_dg_records() + + biosample_set = self.runtime_api_user_client.get_biosamples_for_study( + self.study_id + ) + + all_gold_biosamples = [] + all_gold_projects = [] + for biosample in biosample_set: + gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") + if gold_biosample_identifiers: + gold_biosample_id = gold_biosample_identifiers[0] + gold_biosample = self.gold_api_client.fetch_biosample_by_biosample_id( + gold_biosample_id + )[0] + gold_projects = self.gold_api_client.fetch_projects_by_biosample( + gold_biosample_id + ) + gold_biosample["projects"] = gold_projects + all_gold_biosamples.append(gold_biosample) + all_gold_projects.extend(gold_projects) + + gold_study_translator = GoldStudyTranslator( + biosamples=all_gold_biosamples, + projects=all_gold_projects, + gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df, + ) + + filtered_biosamples = gold_study_translator.biosamples + filtered_projects = gold_study_translator.projects + + gold_project_ids = [project["projectGoldId"] for project in filtered_projects] + nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id( + "nmdc:NucleotideSequencing", len(gold_project_ids) + ).json() + gold_project_to_nmdc_nucleotide_sequencing_ids = dict( + zip(gold_project_ids, nmdc_nucleotide_sequencing_ids) + ) + + gold_to_nmdc_biosample_ids = { + biosample["gold_biosample_identifiers"][0].replace("gold:", ""): biosample[ + "id" + ] + for biosample in biosample_set + if "gold_biosample_identifiers" in biosample + and biosample["gold_biosample_identifiers"] + } + + database.data_generation_set = [] + for project in filtered_projects: + # Determine biosampleGoldId from filtered_biosamples + biosample_gold_id = next( + ( + biosample["biosampleGoldId"] + for biosample in filtered_biosamples + if any( + p["projectGoldId"] == project["projectGoldId"] + for p in biosample.get("projects", []) + ) + ), + None, + ) + + if biosample_gold_id: + nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id) + if nmdc_biosample_id: + database.data_generation_set.append( + gold_study_translator._translate_nucleotide_sequencing( + project, + nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[ + project["projectGoldId"] + ], + nmdc_biosample_id=nmdc_biosample_id, + nmdc_study_id=self.study_id, + ) + ) + return database From 4434739d6f29c5ef3fcc154e784fd6e8d1875318 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Mon, 30 Dec 2024 14:42:39 -0800 Subject: [PATCH 5/9] added tests for DatabaseUpdater --- nmdc_runtime/site/repair/database_updater.py | 1 - tests/test_data/test_database_updater.py | 114 +++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tests/test_data/test_database_updater.py diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 490240e8..e0a5170e 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -76,7 +76,6 @@ def create_missing_dg_records(self): database.data_generation_set = [] for project in filtered_projects: - # Determine biosampleGoldId from filtered_biosamples biosample_gold_id = next( ( biosample["biosampleGoldId"] diff --git a/tests/test_data/test_database_updater.py b/tests/test_data/test_database_updater.py new file mode 100644 index 00000000..a96f3efa --- /dev/null +++ b/tests/test_data/test_database_updater.py @@ -0,0 +1,114 @@ +import pytest + +import pandas as pd + +from unittest.mock import MagicMock, patch + +from nmdc_runtime.site.repair.database_updater import DatabaseUpdater + + +@pytest.fixture +def test_setup(test_minter): + mock_runtime_api_user_client = MagicMock() + mock_runtime_api_site_client = MagicMock() + mock_gold_api_client = MagicMock() + + study_id = "nmdc:sty-11-e4yb9z58" + mock_gold_nmdc_instrument_map_df = pd.DataFrame( + { + "GOLD SeqMethod": [ + "Illumina HiSeq", + "Illumina HiSeq 2500-1TB", + ], + "NMDC instrument_set id": [ + "nmdc:inst-14-79zxap02", + "nmdc:inst-14-nn4b6k72", + ], + } + ) + + mint_id_mock = MagicMock() + mint_id_mock.json.return_value = test_minter("nmdc:NucleotideSequencing", 1) + mock_runtime_api_site_client.mint_id.return_value = mint_id_mock + + database_updater = DatabaseUpdater( + runtime_api_user_client=mock_runtime_api_user_client, + runtime_api_site_client=mock_runtime_api_site_client, + gold_api_client=mock_gold_api_client, + study_id=study_id, + gold_nmdc_instrument_map_df=mock_gold_nmdc_instrument_map_df, + ) + + return { + "runtime_api_user_client": mock_runtime_api_user_client, + "runtime_api_site_client": mock_runtime_api_site_client, + "gold_api_client": mock_gold_api_client, + "database_updater": database_updater, + "study_id": study_id, + } + + +@patch("nmdc_runtime.site.repair.database_updater.GoldStudyTranslator") +def test_create_missing_dg_records(MockGoldStudyTranslator, test_setup): + mock_runtime_api_user_client = test_setup["runtime_api_user_client"] + mock_runtime_api_site_client = test_setup["runtime_api_site_client"] + mock_gold_api_client = test_setup["gold_api_client"] + database_updater = test_setup["database_updater"] + + mock_runtime_api_user_client.get_biosamples_for_study.return_value = [ + { + "id": "nmdc:bsm-11-q59jb831", + "gold_biosample_identifiers": ["gold:Gb0150488"], + } + ] + + mock_gold_api_client.fetch_biosample_by_biosample_id.return_value = [ + { + "biosampleGoldId": "Gb0150488", + "biosampleName": "Switchgrass phyllosphere microbial communities", + "projects": [ + { + "projectGoldId": "Gp0208640", + "biosampleGoldId": "Gb0150488", + "sequencingStrategy": "Metagenome", + } + ], + } + ] + + mock_gold_api_client.fetch_projects_by_biosample.return_value = [ + { + "projectGoldId": "Gp0208640", + "biosampleGoldId": "Gb0150488", + "sequencingStrategy": "Metagenome", + } + ] + + MockGoldStudyTranslator.return_value.biosamples = [ + {"biosampleGoldId": "Gb0150488", "projects": [{"projectGoldId": "Gp0208640"}]} + ] + MockGoldStudyTranslator.return_value.projects = [{"projectGoldId": "Gp0208640"}] + + MockGoldStudyTranslator.return_value._translate_nucleotide_sequencing.return_value = MagicMock( + id="nmdc:dgns-00-12345678", + biosample_id="nmdc:bsm-11-q59jb831", + ) + + database = database_updater.create_missing_dg_records() + + assert len(database.data_generation_set) > 0 + assert database.data_generation_set[0].id.startswith("nmdc:dgns-00-") + assert database.data_generation_set[0].biosample_id == "nmdc:bsm-11-q59jb831" + + mock_runtime_api_user_client.get_biosamples_for_study.assert_called_once_with( + test_setup["study_id"] + ) + mock_gold_api_client.fetch_biosample_by_biosample_id.assert_called_once_with( + "gold:Gb0150488" + ) + mock_gold_api_client.fetch_projects_by_biosample.assert_called_once_with( + "gold:Gb0150488" + ) + mock_runtime_api_site_client.mint_id.assert_called_once_with( + "nmdc:NucleotideSequencing", 1 + ) From faf4bf785257a113d7671539782c4f95ce31e010 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 2 Jan 2025 12:30:23 -0800 Subject: [PATCH 6/9] add documentation for logic in the DatabaseUpdater class --- nmdc_runtime/site/repair/database_updater.py | 31 ++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index e0a5170e..1034aaf9 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -18,6 +18,21 @@ def __init__( study_id: str, gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(), ): + """This class serves as an API for repairing connections in the database by + adding records that are essentially missing "links"/"connections". As we identify + common use cases for adding missing records to the database, we can + add helper methods to this class. + + :param runtime_api_user_client: An object of RuntimeApiUserClient which can be + used to retrieve instance records from the NMDC database. + :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be + used to mint new IDs for the repaired records that need to be added into the NMDC database. + :param gold_api_client: An object of GoldApiClient which can be used to retrieve + records from GOLD via the GOLD API. + :param study_id: NMDC study ID for which the missing records need to be added. + :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the + NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records. + """ self.runtime_api_user_client = runtime_api_user_client self.runtime_api_site_client = runtime_api_site_client self.gold_api_client = gold_api_client @@ -26,6 +41,17 @@ def __init__( @lru_cache def create_missing_dg_records(self): + """This method creates missing data generation records for a given study in the NMDC database using + metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated + with the study from the NMDC database. Then, it fetches all the biosample and project data data + associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id + mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class + to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based + on the number of GOLD sequencing projects, and then reimplement only the part of logic from that + class which is responsible for making data_generation_set records. + + :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend. + """ database = nmdc.Database() biosample_set = self.runtime_api_user_client.get_biosamples_for_study( @@ -54,6 +80,8 @@ def create_missing_dg_records(self): gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df, ) + # The GoldStudyTranslator class has some pre-processing logic which filters out + # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.) filtered_biosamples = gold_study_translator.biosamples filtered_projects = gold_study_translator.projects @@ -75,7 +103,10 @@ def create_missing_dg_records(self): } database.data_generation_set = [] + # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records + # created is based on the number of GOLD sequencing projects for project in filtered_projects: + # map the projectGoldId to the NMDC biosample ID biosample_gold_id = next( ( biosample["biosampleGoldId"] From b028c8f78a4c6ca6f032180ca31a453647f8a807 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 2 Jan 2025 17:08:50 -0800 Subject: [PATCH 7/9] improve caching in DatabaseUpdater --- nmdc_runtime/site/repair/database_updater.py | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 1034aaf9..bf213d0d 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -1,4 +1,5 @@ from functools import lru_cache +from typing import Any, Dict, List import pandas as pd from nmdc_runtime.site.resources import ( RuntimeApiUserClient, @@ -39,6 +40,24 @@ def __init__( self.study_id = study_id self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df + @lru_cache + def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]: + """Fetch response from GOLD /biosamples API for a given biosample id. + + :param gold_biosample_id: GOLD biosample ID. + :return: Dictionary containing the response from the GOLD /biosamples API. + """ + return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id) + + @lru_cache + def _fetch_gold_projects(self, gold_biosample_id: str): + """Fetch response from GOLD /projects API for a given biosample id. + + :param gold_biosample_id: GOLD biosample ID + :return: Dictionary containing the response from the GOLD /projects API. + """ + return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id) + @lru_cache def create_missing_dg_records(self): """This method creates missing data generation records for a given study in the NMDC database using @@ -64,12 +83,8 @@ class which is responsible for making data_generation_set records. gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") if gold_biosample_identifiers: gold_biosample_id = gold_biosample_identifiers[0] - gold_biosample = self.gold_api_client.fetch_biosample_by_biosample_id( - gold_biosample_id - )[0] - gold_projects = self.gold_api_client.fetch_projects_by_biosample( - gold_biosample_id - ) + gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] + gold_projects = self._fetch_gold_projects(gold_biosample_id) gold_biosample["projects"] = gold_projects all_gold_biosamples.append(gold_biosample) all_gold_projects.extend(gold_projects) From 128906f46f21f1a21eed4841123b078b99c47177 Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Tue, 7 Jan 2025 12:48:59 -0800 Subject: [PATCH 8/9] modify method that gets biosamples based on study --- nmdc_runtime/site/repair/database_updater.py | 28 ++++++++++---------- nmdc_runtime/site/resources.py | 17 ++++++++---- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index bf213d0d..4edfcd49 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -82,12 +82,13 @@ class which is responsible for making data_generation_set records. for biosample in biosample_set: gold_biosample_identifiers = biosample.get("gold_biosample_identifiers") if gold_biosample_identifiers: - gold_biosample_id = gold_biosample_identifiers[0] - gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] - gold_projects = self._fetch_gold_projects(gold_biosample_id) - gold_biosample["projects"] = gold_projects - all_gold_biosamples.append(gold_biosample) - all_gold_projects.extend(gold_projects) + for gold_biosample_id in gold_biosample_identifiers: + gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0] + gold_projects = self._fetch_gold_projects(gold_biosample_id) + gold_biosample["projects"] = gold_projects + + all_gold_biosamples.append(gold_biosample) + all_gold_projects.extend(gold_projects) gold_study_translator = GoldStudyTranslator( biosamples=all_gold_biosamples, @@ -108,14 +109,13 @@ class which is responsible for making data_generation_set records. zip(gold_project_ids, nmdc_nucleotide_sequencing_ids) ) - gold_to_nmdc_biosample_ids = { - biosample["gold_biosample_identifiers"][0].replace("gold:", ""): biosample[ - "id" - ] - for biosample in biosample_set - if "gold_biosample_identifiers" in biosample - and biosample["gold_biosample_identifiers"] - } + gold_to_nmdc_biosample_ids = {} + + for biosample in biosample_set: + gold_ids = biosample.get("gold_biosample_identifiers", []) + for gold_id in gold_ids: + gold_id_stripped = gold_id.replace("gold:", "") + gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"] database.data_generation_set = [] # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records diff --git a/nmdc_runtime/site/resources.py b/nmdc_runtime/site/resources.py index e382fe52..c00b0900 100644 --- a/nmdc_runtime/site/resources.py +++ b/nmdc_runtime/site/resources.py @@ -129,16 +129,23 @@ def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str): return response.json()["cursor"]["firstBatch"] def get_biosamples_for_study(self, study_id: str): + # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param. + # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism, + # but the tradeoff there is that we would need to make multiple requests to step through the + # each of the pages. By picking a large number for max_page_size, we can get all the results + # in a single request. + # This method previously used the /queries:run endpoint but the problem with that was that + # it used to truncate the number of results returned to 100. response = self.request( - "POST", - f"/queries:run", + "GET", + f"/nmdcschema/biosample_set", { - "find": "biosample_set", - "filter": {"associated_studies": {"$elemMatch": {"$eq": study_id}}}, + "filter": json.dumps({"associated_studies": study_id}), + "max_page_size": 10000, }, ) response.raise_for_status() - return response.json()["cursor"]["firstBatch"] + return response.json()["resources"] def get_omics_processing_by_name(self, name: str): response = self.request( From dfbb1feee47ed3fd5955fe8ec30ef8b3f11cdd4a Mon Sep 17 00:00:00 2001 From: Sujay Patil Date: Thu, 9 Jan 2025 18:04:20 -0800 Subject: [PATCH 9/9] remove cache decorator on create_missing_dg_records() --- nmdc_runtime/site/repair/database_updater.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nmdc_runtime/site/repair/database_updater.py b/nmdc_runtime/site/repair/database_updater.py index 4edfcd49..27d91ce7 100644 --- a/nmdc_runtime/site/repair/database_updater.py +++ b/nmdc_runtime/site/repair/database_updater.py @@ -58,7 +58,6 @@ def _fetch_gold_projects(self, gold_biosample_id: str): """ return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id) - @lru_cache def create_missing_dg_records(self): """This method creates missing data generation records for a given study in the NMDC database using metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated