Skip to content

Commit

Permalink
logic to make DataGeneration records based on GOLD ids
Browse files Browse the repository at this point in the history
  • Loading branch information
sujaypatil96 committed Dec 27, 2024
1 parent ae220be commit 47320db
Showing 1 changed file with 96 additions and 6 deletions.
102 changes: 96 additions & 6 deletions nmdc_runtime/site/repair/database_updater.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,106 @@
from nmdc_runtime.site.resources import RuntimeApiUserClient
from functools import lru_cache
import pandas as pd
from nmdc_runtime.site.resources import (
RuntimeApiUserClient,
RuntimeApiSiteClient,
GoldApiClient,
)
from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
from nmdc_schema import nmdc


class DatabaseUpdater:
    """Create the DataGeneration records of a study from its GOLD metadata.

    The updater reads the study's biosamples from the Runtime API, fetches the
    GOLD biosample and project records they point at, and asks
    ``GoldStudyTranslator`` to turn those projects into nucleotide sequencing
    records.
    """

    def __init__(
        self,
        runtime_api_user_client: "RuntimeApiUserClient",
        runtime_api_site_client: "RuntimeApiSiteClient",
        gold_api_client: "GoldApiClient",
        study_id: str,
        gold_nmdc_instrument_map_df: "pd.DataFrame | None" = None,
    ):
        """Store the API clients and the study this updater works on.

        :param runtime_api_user_client: client used to list the biosamples of
            the study.
        :param runtime_api_site_client: client used to mint new identifiers.
        :param gold_api_client: client used to fetch GOLD biosamples/projects.
        :param study_id: identifier of the study whose records are generated.
        :param gold_nmdc_instrument_map_df: instrument mapping forwarded to
            ``GoldStudyTranslator``; an empty DataFrame is used when omitted.
        """
        self.runtime_api_user_client = runtime_api_user_client
        self.runtime_api_site_client = runtime_api_site_client
        self.gold_api_client = gold_api_client
        self.study_id = study_id
        # A DataFrame built in the signature would be created once and shared
        # by every instance, so the default is materialised per instance.
        if gold_nmdc_instrument_map_df is None:
            gold_nmdc_instrument_map_df = pd.DataFrame()
        self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df

    def create_missing_dg_records(self):
        """Placeholder hook called at the start of ``get_database``.

        Deliberately not wrapped in ``functools.lru_cache``: a cache on an
        instance method keys on ``self`` and keeps every updater (and the
        clients it holds) alive for the lifetime of the cache.
        """
        pass

    def _collect_gold_records(self, biosample_set):
        """Fetch the GOLD records referenced by ``biosample_set``.

        Only the first entry of ``gold_biosample_identifiers`` is used for
        each biosample. The input is walked exactly once, so a one-shot
        iterable works as well as a list.

        :param biosample_set: iterable of biosample dicts.
        :return: ``(gold_biosamples, gold_projects, gold_to_nmdc_ids)`` where
            each GOLD biosample carries its projects under ``"projects"`` and
            ``gold_to_nmdc_ids`` maps the GOLD biosample id (without the
            ``gold:`` prefix) to the biosample's ``"id"``.
        """
        gold_biosamples = []
        gold_projects = []
        gold_to_nmdc_biosample_ids = {}

        for biosample in biosample_set:
            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers")
            if not gold_biosample_identifiers:
                continue

            gold_biosample_id = gold_biosample_identifiers[0]
            gold_to_nmdc_biosample_ids[gold_biosample_id.replace("gold:", "")] = (
                biosample["id"]
            )

            gold_biosample_records = (
                self.gold_api_client.fetch_biosample_by_biosample_id(
                    gold_biosample_id
                )
            )
            # Skip identifiers GOLD has no record for, rather than failing
            # the whole run with an IndexError.
            if not gold_biosample_records:
                continue

            gold_biosample = gold_biosample_records[0]
            projects = self.gold_api_client.fetch_projects_by_biosample(
                gold_biosample_id
            )
            gold_biosample["projects"] = projects
            gold_biosamples.append(gold_biosample)
            gold_projects.extend(projects)

        return gold_biosamples, gold_projects, gold_to_nmdc_biosample_ids

    @staticmethod
    def _pair_projects_with_nmdc_biosamples(
        gold_biosamples, gold_projects, gold_to_nmdc_biosample_ids
    ):
        """Pair each GOLD project with the biosample id it should reference.

        A project is matched to the first biosample (in input order) that
        lists it under ``"projects"``. Projects with no such biosample, or
        whose biosample has no entry in ``gold_to_nmdc_biosample_ids``, are
        left out.

        :return: list of ``(project, nmdc_biosample_id)`` tuples in the order
            of ``gold_projects``.
        """
        # Index once instead of rescanning every biosample for every project.
        project_to_biosample_gold_id = {}
        for biosample in gold_biosamples:
            for project in biosample.get("projects", []):
                # setdefault keeps the first biosample that lists the project.
                project_to_biosample_gold_id.setdefault(
                    project["projectGoldId"], biosample["biosampleGoldId"]
                )

        pairs = []
        for project in gold_projects:
            biosample_gold_id = project_to_biosample_gold_id.get(
                project["projectGoldId"]
            )
            if not biosample_gold_id:
                continue
            nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
            if nmdc_biosample_id:
                pairs.append((project, nmdc_biosample_id))
        return pairs

    def get_database(self) -> "nmdc.Database":
        """Build a Database holding the study's DataGeneration records.

        One record is produced for every project exposed by the translator
        that can be linked to a biosample of the study. Identifiers are
        minted only for those projects, and the minting request is skipped
        when there is nothing to create.

        :return: an ``nmdc.Database`` whose ``data_generation_set`` holds the
            translated records (an empty list when none could be built).
        """
        database = nmdc.Database()
        self.create_missing_dg_records()

        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
            self.study_id
        )
        (
            all_gold_biosamples,
            all_gold_projects,
            gold_to_nmdc_biosample_ids,
        ) = self._collect_gold_records(biosample_set)

        gold_study_translator = GoldStudyTranslator(
            biosamples=all_gold_biosamples,
            projects=all_gold_projects,
            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
        )

        # Work from the translator's own biosamples/projects, which may
        # differ from the lists that were passed in.
        resolved_projects = self._pair_projects_with_nmdc_biosamples(
            gold_study_translator.biosamples,
            gold_study_translator.projects,
            gold_to_nmdc_biosample_ids,
        )

        database.data_generation_set = []
        if not resolved_projects:
            return database

        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
            "nmdc:NucleotideSequencing", len(resolved_projects)
        ).json()
        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
            zip(
                (project["projectGoldId"] for project, _ in resolved_projects),
                nmdc_nucleotide_sequencing_ids,
            )
        )

        for project, nmdc_biosample_id in resolved_projects:
            database.data_generation_set.append(
                gold_study_translator._translate_nucleotide_sequencing(
                    project,
                    nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
                        project["projectGoldId"]
                    ],
                    nmdc_biosample_id=nmdc_biosample_id,
                    nmdc_study_id=self.study_id,
                )
            )

        return database

0 comments on commit 47320db

Please sign in to comment.