From d21e4979a17e815d1407cd655e55b81e4d296493 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 11 Nov 2024 14:53:24 -0500 Subject: [PATCH 01/28] Added trackHub file creation to bedset --- bbconf/_version.py | 2 +- bbconf/modules/bedsets.py | 48 +++++++++++++++++++++++++++++++++++++++ docs/changelog.md | 7 +++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index 3e2f46a..bfbf79e 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.9.0" +__version__ = "0.9.1_dev0" diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 3f46bb9..66f6409 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -214,6 +214,54 @@ def get_bedset_pep(self, identifier: str) -> dict: "_subsample_list": [], } + def get_track_hub_file(self, identifier: str) -> str: + """ + Get track hub file for bedset. + + :param identifier: bedset identifier + :return: track hub file + """ + statement = select(BedFileBedSetRelation).where( + BedFileBedSetRelation.bedset_id == identifier + ) + + trackDb_txt = "" + + with Session(self._db_engine.engine) as session: + bs2bf_objects = session.scalars(statement) + if not bs2bf_objects: + raise BedSetNotFoundError(f"Bedset with id: {identifier} not found.") + + for bs2bf_obj in bs2bf_objects: + bed_obj = bs2bf_obj.bedfile + try: + bigbed_url = None + for bedfile in bed_obj.files: + if bedfile.name == "bigbed_file": + bigbed_url = self.config.get_prefixed_uri( + postfix=bedfile.path, access_id="http" + ) + break + if not bigbed_url: + _LOGGER.debug( + f"BigBed file for bedfile {bs2bf_obj.bedfile_id} not found." + ) + continue + except AttributeError: + _LOGGER.debug( + f"BigBed file for bedfile {bs2bf_obj.bedfile_id} not found." + ) + continue + trackDb_txt = ( + trackDb_txt + f"track\t {bed_obj.name}\n" + "type\t bigBed\n" + f"bigDataUrl\t {bigbed_url} \n" + f"shortLabel\t {bed_obj.name}\n" + f"longLabel\t {bed_obj.description}\n" + "visibility\t full\n\n" + ) + return trackDb_txt + def create( self, identifier: str, diff --git a/docs/changelog.md b/docs/changelog.md index 1eba8d4..e6d2d1d 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,9 +2,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. + +# [0.9.1] - 2024-XX-XX +## Changed +- Added trackHub file creation to bedset + # [0.9.0] - 2024-11-06 ## Changed -- Fixed bug with uploading tss dist plot\ +- Fixed bug with uploading tss dist plot ## Added - Added annotations to bedsets (author, source) From 60a7cff6f3f43633b39767cdae6bdca235bec742 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 12 Nov 2024 15:14:57 -0500 Subject: [PATCH 02/28] Added trackHub file creation to bedset --- bbconf/modules/bedfiles.py | 39 ++++++++++++++++++++++++++++++++++++++ docs/changelog.md | 4 ++++ manual_testing.py | 11 ++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index f9a60a6..1783da3 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -241,6 +241,45 @@ def get_plots(self, identifier: str) -> BedPlots: ) return bed_plots + def get_neighbours( + self, identifier: str, limit: int = 10, offset: int = 0 + ) -> BedListSearchResult: + """ + Get nearest neighbours of bed file from qdrant. + + :param identifier: bed file identifier + :param limit: number of results to return + :param offset: offset to start from + + :return: list of nearest neighbours + """ + if not self.exists(identifier): + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + s = identifier + results = self._qdrant_engine.qd_client.query_points( + collection_name=self._config.config.qdrant.file_collection, + query="-".join([s[:8], s[8:12], s[12:16], s[16:20], s[20:]]), + limit=limit, + offset=offset, + ) + result_list = [] + for result in results.points: + result_id = result.id.replace("-", "") + result_list.append( + QdrantSearchResult( + id=result_id, + payload=result.payload, + score=result.score, + metadata=self.get(result_id, full=False), + ) + ) + return BedListSearchResult( + count=0, + limit=limit, + offset=offset, + results=result_list, + ) + def get_files(self, identifier: str) -> BedFiles: """ Get file files by identifier. diff --git a/docs/changelog.md b/docs/changelog.md index e6d2d1d..6bd416b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -4,6 +4,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm # [0.9.1] - 2024-XX-XX +## Added +- Search neighbours files for bed record + + ## Changed - Added trackHub file creation to bedset diff --git a/manual_testing.py b/manual_testing.py index 0386ada..3324f00 100644 --- a/manual_testing.py +++ b/manual_testing.py @@ -185,10 +185,19 @@ def get_id_plots_missing(): print(agent.get_list_genomes()) +def neighbour_beds(): + from bbconf import BedBaseAgent + + agent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml") + restults = agent.bed.get_neighbours("95900d67ed6411a322af35098e445eb0") + restults + + if __name__ == "__main__": # zarr_s3() # add_s3() # get_from_s3() # biocframe() # get_pep() - get_id_plots_missing() + # get_id_plots_missing() + neighbour_beds() From 0e5a36ef747abc0e4ff379fd941bc280f9df25a2 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 13 Nov 2024 13:51:46 -0500 Subject: [PATCH 03/28] Added bed sql search --- bbconf/models/bed_models.py | 4 +- bbconf/modules/bedfiles.py | 85 +++++++++++++++++++++++++++++++++++-- manual_testing.py | 15 +++++-- 3 files changed, 96 insertions(+), 8 deletions(-) diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 3edf0c9..4ba7118 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -192,8 +192,8 @@ class BedListResult(BaseModel): class QdrantSearchResult(BaseModel): id: str - payload: dict - score: float + payload: dict = None + score: float = None metadata: Union[BedMetadataBasic, None] = None diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 1783da3..9ec569f 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -9,7 +9,7 @@ from pephubclient.exceptions import ResponseError from pydantic import BaseModel from qdrant_client.models import Distance, PointIdsList, VectorParams -from sqlalchemy import and_, delete, func, select +from sqlalchemy import and_, delete, func, or_, select from sqlalchemy.orm import Session, aliased from tqdm import tqdm @@ -274,7 +274,7 @@ def get_neighbours( ) ) return BedListSearchResult( - count=0, + count=self.bb_agent.get_stats().bedfiles_number, limit=limit, offset=offset, results=result_list, @@ -898,12 +898,91 @@ def bed_to_bed_search( results=results_list, ) + def sql_search( + self, query: str, limit: int = 10, offset: int = 0 + ) -> BedListSearchResult: + """ + Search for bed files by using sql exact search. + This search will search files by id, name, and description + + :param query: text query + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file metadata + """ + _LOGGER.debug(f"Looking for: {query}") + + sql_search_str = f"%{query}%" + with Session(self._sa_engine) as session: + statement = ( + select(Bed) + .where( + or_( + Bed.id.ilike(sql_search_str), + Bed.name.ilike(sql_search_str), + Bed.description.ilike(sql_search_str), + ) + ) + .limit(limit) + .offset(offset) + ) + bed_objects = session.scalars(statement) + results = [ + BedMetadataBasic( + **bedfile_obj.__dict__, + annotation=StandardMeta( + **( + bedfile_obj.annotations.__dict__ + if bedfile_obj.annotations + else {} + ) + ), + ) + for bedfile_obj in bed_objects + ] + result_list = [ + QdrantSearchResult(id=result.id, score=1, metadata=result) + for result in results + ] + + return BedListSearchResult( + count=self._sql_search_count(query), + limit=limit, + offset=offset, + results=result_list, + ) + + def _sql_search_count(self, query: str) -> int: + """ + Get number of total found files in the database. + + :param query: text query + + :return: number of found files + """ + sql_search_str = f"%{query}%" + with Session(self._sa_engine) as session: + statement = ( + select(func.count()) + .select_from(Bed) + .where( + or_( + Bed.id.ilike(sql_search_str), + Bed.name.ilike(sql_search_str), + Bed.description.ilike(sql_search_str), + ) + ) + ) + count = session.execute(statement).one() + return count[0] + def reindex_qdrant(self) -> None: """ Re-upload all files to quadrant. !Warning: only hg38 genome can be added to qdrant! - If you want want to fully reindex/reupload to qdrant, first delete collection and create new one. + If you want to fully reindex/reupload to qdrant, first delete collection and create new one. Upload all files to qdrant. """ diff --git a/manual_testing.py b/manual_testing.py index 3324f00..ea814bd 100644 --- a/manual_testing.py +++ b/manual_testing.py @@ -189,8 +189,16 @@ def neighbour_beds(): from bbconf import BedBaseAgent agent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml") - restults = agent.bed.get_neighbours("95900d67ed6411a322af35098e445eb0") - restults + results = agent.bed.get_neighbours("e76e41597622b3df45435dde1a8eb19d") + results + + +def sql_search(): + from bbconf import BedBaseAgent + + agent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml") + results = agent.bed.sql_search("K562", limit=100) + results if __name__ == "__main__": @@ -200,4 +208,5 @@ def neighbour_beds(): # biocframe() # get_pep() # get_id_plots_missing() - neighbour_beds() + # neighbour_beds() + sql_search() From 0e75c72a29bb170b24faa0722cf5b3ad1d91ec3a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 13 Nov 2024 14:36:43 -0500 Subject: [PATCH 04/28] Set limit on bedset trackhub response --- bbconf/exceptions.py | 6 ++++++ bbconf/modules/bedsets.py | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/bbconf/exceptions.py b/bbconf/exceptions.py index 3f8c017..3ce7c71 100644 --- a/bbconf/exceptions.py +++ b/bbconf/exceptions.py @@ -83,3 +83,9 @@ class QdrantInstanceNotInitializedError(BedBaseConfError): """Error type for missing qdrant instance""" pass + + +class BedSetTrackHubLimitError(BedBaseConfError): + """Limit for visualizing trackhub exceeded""" + + pass diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 66f6409..f69829b 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -3,12 +3,16 @@ from geniml.io.utils import compute_md5sum_bedset from sqlalchemy import Float, Numeric, func, or_, select -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session, relationship from bbconf.config_parser import BedBaseConfig from bbconf.const import PKG_NAME from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files -from bbconf.exceptions import BedSetExistsError, BedSetNotFoundError +from bbconf.exceptions import ( + BedSetExistsError, + BedSetNotFoundError, + BedSetTrackHubLimitError, +) from bbconf.models.bed_models import BedStatsModel, StandardMeta from bbconf.models.bedset_models import ( BedMetadataBasic, @@ -232,8 +236,16 @@ def get_track_hub_file(self, identifier: str) -> str: if not bs2bf_objects: raise BedSetNotFoundError(f"Bedset with id: {identifier} not found.") - for bs2bf_obj in bs2bf_objects: + relationship_objects = [relationship for relationship in bs2bf_objects] + + if len(relationship_objects) > 20: + raise BedSetTrackHubLimitError( + "Number of bedfiles exceeds 20. Unable to process request for track hub." + ) + + for bs2bf_obj in relationship_objects: bed_obj = bs2bf_obj.bedfile + try: bigbed_url = None for bedfile in bed_obj.files: From 9f231b74aeb6877e82a4a662de38de9ee52113d8 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 21 Nov 2024 18:31:30 -0500 Subject: [PATCH 05/28] improved error handling in bedset --- bbconf/modules/bedsets.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index f69829b..3c6ceb8 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -4,14 +4,17 @@ from geniml.io.utils import compute_md5sum_bedset from sqlalchemy import Float, Numeric, func, or_, select from sqlalchemy.orm import Session, relationship +from sqlalchemy.exc import IntegrityError from bbconf.config_parser import BedBaseConfig from bbconf.const import PKG_NAME from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files from bbconf.exceptions import ( + BedBaseConfError, BedSetExistsError, BedSetNotFoundError, BedSetTrackHubLimitError, + BEDFileNotFoundError, ) from bbconf.models.bed_models import BedStatsModel, StandardMeta from bbconf.models.bedset_models import ( @@ -313,8 +316,15 @@ def create( else: stats = None if self.exists(identifier): - if not overwrite and not no_fail: - raise BedSetExistsError(identifier) + if not overwrite: + raise BedSetExistsError( + f"BEDset already exist in the database: {identifier}" + ) + if no_fail and not overwrite: + _LOGGER.warning( + f"Bedset '{identifier}' already exists. no_fail=True. Skipping updating bedset." + ) + return None self.delete(identifier) if not isinstance(annotation, dict): @@ -366,10 +376,13 @@ def create( session.add(new_file) session.commit() - except Exception as e: - _LOGGER.error(f"Failed to create bedset: {e}") + except IntegrityError as _: + raise BEDFileNotFoundError( + "Failed to create bedset. One of the bedfiles does not exist." + ) + except Exception as _: if not no_fail: - raise e + raise BedBaseConfError("Failed to create bedset. SQL error.") _LOGGER.info(f"Bedset '{identifier}' was created successfully") return None From 8adf567383bedda166255a0db44b4fbf69990c60 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 26 Nov 2024 13:57:17 -0500 Subject: [PATCH 06/28] bump pephubclient version --- requirements/requirements-all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index cd2e043..cf4f5cf 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -6,7 +6,7 @@ colorlogs pydantic >= 2.9.0 botocore boto3 >= 1.34.54 -pephubclient >= 0.4.4 +pephubclient >= 0.4.5 sqlalchemy_schemadisplay zarr pyyaml >= 6.0.1 # for s3fs because of the errors From a4dd4690b092cc722d35975ae72071d1c2765c3f Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 26 Nov 2024 19:18:10 -0500 Subject: [PATCH 07/28] add some logging and docs improvements --- bbconf/bbagent.py | 21 +++++++++++++++++---- bbconf/config_parser/bedbaseconfig.py | 15 ++++++++++++++- bbconf/modules/bedfiles.py | 4 ++-- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 1b0baab..8ce3b7b 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -1,3 +1,5 @@ +import logging + from functools import cached_property from pathlib import Path from typing import List, Union @@ -12,6 +14,9 @@ from bbconf.modules.bedsets import BedAgentBedSet from bbconf.modules.objects import BBObjects +from .const import PKG_NAME + +_LOGGER = logging.getLogger(PKG_NAME) class BedBaseAgent(object): def __init__( @@ -19,15 +24,16 @@ def __init__( config: Union[Path, str], ): """ - Initialize connection to the pep_db database. You can use The basic connection parameters + Initialize connection to the pep_db database. You can use the basic connection parameters or libpq connection string. - """ - + _LOGGER.info(f"Initializing BedBaseConfig object") self.config = BedBaseConfig(config) - + _LOGGER.info(f"Initializing BedBaseAgent object") self._bed = BedAgentBedFile(self.config, self) + _LOGGER.info(f"Initializing BedAgentBedSet object") self._bedset = BedAgentBedSet(self.config) + _LOGGER.info(f"Initializing BBObjects object") self._objects = BBObjects(self.config) @property @@ -42,6 +48,13 @@ def bedset(self) -> BedAgentBedSet: def objects(self) -> BBObjects: return self._objects + def __repr__(self) -> str: + repr = f"BedBaseAgent(config={self.config})" + repr += f"\n{self.bed}" + repr += f"\n{self.bedset}" + repr += f"\n{self.objects}" + return repr + def get_stats(self) -> StatsReturn: """ Get statistics for a bed file diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 3ca9d41..433074a 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -41,18 +41,31 @@ _LOGGER = logging.getLogger(PKG_NAME) -class BedBaseConfig: +class BedBaseConfig(object): + """ + Class to handle BEDbase configuration file and create objects for different modules. + """ def __init__(self, config: Union[Path, str]): + _LOGGER.info(f"Loading configuration file: {config}") self.cfg_path = get_bedbase_cfg(config) self._config = self._read_config_file(self.cfg_path) + _LOGGER.info(f"Initializing database engine...") self._db_engine = self._init_db_engine() + _LOGGER.info(f"Initializing qdrant engine...") self._qdrant_engine = self._init_qdrant_backend() + + _LOGGER.info(f"Initializing qdrant text engine...") self._qdrant_text_engine = self._init_qdrant_text_backend() + + _LOGGER.info(f"Initializing search interfaces...") self._b2bsi = self._init_b2bsi_object() + _LOGGER.info(f"Initializing R2V object...") self._r2v = self._init_r2v_object() + _LOGGER.info(f"Initializing Bivec object...") self._bivec = self._init_bivec_object() + _LOGGER.info(f"Initializing PEPHub client...") self._phc = self._init_pephubclient() self._boto3_client = self._init_boto3_client() diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 9ec569f..e4691ac 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -62,9 +62,9 @@ class BedAgentBedFile: """ - Class that represents Bedfile in Database. + Class that represents a BED file in the Database. - This class has method to add, delete, get files and metadata from the database. + Provides methods to add, delete, get BED files and metadata from the database. """ def __init__(self, config: BedBaseConfig, bbagent_obj=None): From 5ba0c56a4b72e469cd23a7c4d9167f087fdc334a Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 26 Nov 2024 21:25:45 -0500 Subject: [PATCH 08/28] some ideas toward #65 --- bbconf/config_parser/bedbaseconfig.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 433074a..cf9eaad 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -45,7 +45,7 @@ class BedBaseConfig(object): """ Class to handle BEDbase configuration file and create objects for different modules. """ - def __init__(self, config: Union[Path, str]): + def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True): _LOGGER.info(f"Loading configuration file: {config}") self.cfg_path = get_bedbase_cfg(config) self._config = self._read_config_file(self.cfg_path) @@ -58,12 +58,13 @@ def __init__(self, config: Union[Path, str]): _LOGGER.info(f"Initializing qdrant text engine...") self._qdrant_text_engine = self._init_qdrant_text_backend() - _LOGGER.info(f"Initializing search interfaces...") - self._b2bsi = self._init_b2bsi_object() - _LOGGER.info(f"Initializing R2V object...") - self._r2v = self._init_r2v_object() - _LOGGER.info(f"Initializing Bivec object...") - self._bivec = self._init_bivec_object() + if init_search_interfaces: + _LOGGER.info(f"Initializing search interfaces...") + self._b2bsi = self._init_b2bsi_object() + _LOGGER.info(f"Initializing R2V object...") + self._r2v = self._init_r2v_object() + _LOGGER.info(f"Initializing Bivec object...") + self._bivec = self._init_bivec_object() _LOGGER.info(f"Initializing PEPHub client...") self._phc = self._init_pephubclient() @@ -244,9 +245,11 @@ def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]: :return: BiVectorSearchInterface """ + _LOGGER.info(f"Initializing BiVectorBackend...") search_backend = BiVectorBackend( metadata_backend=self._qdrant_text_engine, bed_backend=self._qdrant_engine ) + _LOGGER.info(f"Initializing BiVectorSearchInterface...") search_interface = BiVectorSearchInterface( backend=search_backend, query2vec=self.config.path.text2vec, From 8940fb55f8c80d93f9efe9908891abfabc19a90f Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 2 Dec 2024 11:41:50 -0500 Subject: [PATCH 09/28] Fixed #64 --- bbconf/db_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index d929ec8..fe7114c 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -339,7 +339,7 @@ class TokenizedBed(Base): nullable=False, ) universe_id: Mapped[str] = mapped_column( - ForeignKey("universes.id", ondelete="CASCADE", passive_deletes=True), + ForeignKey("universes.id", ondelete="CASCADE"), primary_key=True, index=True, nullable=False, @@ -350,9 +350,10 @@ class TokenizedBed(Base): bed: Mapped["Bed"] = relationship("Bed", back_populates="tokenized") universe: Mapped["Universes"] = relationship( + "Universes", "Universes", back_populates="tokenized", - passive_deletes=True, + passive_deletes="all", ) From 3ad2523f6ad3e2e1bb311707ca35477a7a15e966 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 2 Dec 2024 23:45:58 -0500 Subject: [PATCH 10/28] Fixed #66 Fixed #68 --- bbconf/db_utils.py | 9 +- bbconf/modules/bedfiles.py | 308 +++++++++++++++++++++++++++++++------ bbconf/modules/bedsets.py | 4 +- 3 files changed, 275 insertions(+), 46 deletions(-) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index fe7114c..3c5e858 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -130,6 +130,9 @@ class Bed(Base): ref_classifier: Mapped["GenomeRefStats"] = relationship( "GenomeRefStats", back_populates="bed", cascade="all, delete-orphan" ) + processed: Mapped[bool] = mapped_column( + default=False, comment="Whether the bed file was processed" + ) class BedMetadata(Base): @@ -255,6 +258,11 @@ class Files(Base): bedfile: Mapped["Bed"] = relationship("Bed", back_populates="files") bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="files") + __table_args__ = ( + UniqueConstraint("name", "bedfile_id"), + UniqueConstraint("name", "bedset_id"), + ) + class BedFileBedSetRelation(Base): __tablename__ = "bedfile_bedset_relation" @@ -350,7 +358,6 @@ class TokenizedBed(Base): bed: Mapped["Bed"] = relationship("Bed", back_populates="tokenized") universe: Mapped["Universes"] = relationship( - "Universes", "Universes", back_populates="tokenized", passive_deletes="all", diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 9ec569f..96d3549 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -1,3 +1,4 @@ +import datetime import os from logging import getLogger from typing import Dict, List, Union @@ -10,6 +11,7 @@ from pydantic import BaseModel from qdrant_client.models import Distance, PointIdsList, VectorParams from sqlalchemy import and_, delete, func, or_, select +from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, aliased from tqdm import tqdm @@ -454,6 +456,7 @@ def add( local_path: str = None, overwrite: bool = False, nofail: bool = False, + processed: bool = True, ) -> None: """ Add bed file to the database. @@ -473,6 +476,7 @@ def add( :param local_path: local path to the output files :param overwrite: overwrite bed file if it already exists :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite + :param processed: true if bedfile was processed and statistics and plots were calculated :return: None """ _LOGGER.info(f"Adding bed file to database. bed_id: {identifier}") @@ -554,6 +558,7 @@ def add( license_id=license_id, indexed=upload_qdrant, pephub=upload_pephub, + processed=processed, ) session.add(new_bed) if upload_s3: @@ -612,13 +617,16 @@ def update( plots: dict = None, files: dict = None, classification: dict = None, - add_to_qdrant: bool = False, - upload_pephub: bool = False, - upload_s3: bool = False, + ref_validation: Dict[str, BaseModel] = None, + license_id: str = DEFAULT_LICENSE, + upload_qdrant: bool = True, + upload_pephub: bool = True, + upload_s3: bool = True, local_path: str = None, overwrite: bool = False, nofail: bool = False, - ): + processed: bool = True, + ) -> None: """ Update bed file to the database. @@ -630,22 +638,33 @@ def update( :param plots: bed file plots :param files: bed file files :param classification: bed file classification - :param add_to_qdrant: add bed file to qdrant indexs + :param ref_validation: reference validation data. RefGenValidModel + :param license_id: bed file license id (default: 'DUO:0000042'). + :param upload_qdrant: add bed file to qdrant indexs :param upload_pephub: add bed file to pephub :param upload_s3: upload files to s3 :param local_path: local path to the output files :param overwrite: overwrite bed file if it already exists :param nofail: do not raise an error for error in pephub/s3/qdrant or record exsist and not overwrite + :param processed: true if bedfile was processed and statistics and plots were calculated :return: None """ if not self.exists(identifier): raise BEDFileNotFoundError( f"Bed file with id: {identifier} not found. Cannot update." ) + _LOGGER.info(f"Updating bed file: '{identifier}'") + + if license_id not in self.bb_agent.list_of_licenses and not license_id: + raise BedBaseConfError( + f"License: {license_id} is not in the list of licenses. Please provide a valid license." + f"List of licenses: {self.bb_agent.list_of_licenses}" + ) stats = BedStatsModel(**stats) plots = BedPlots(**plots) files = BedFiles(**files) + bed_metadata = StandardMeta(**metadata) classification = BedClassification(**classification) if upload_pephub: @@ -661,56 +680,259 @@ def update( else: _LOGGER.info("upload_pephub set to false. Skipping pephub..") - if add_to_qdrant: + if upload_qdrant: self.upload_file_qdrant( identifier, files.bed_file.path, payload=metadata.model_dump() ) - statement = select(Bed).where(and_(Bed.id == identifier)) - - if upload_s3: - _LOGGER.warning("S3 upload is not implemented yet") - # if files: - # files = self._config.upload_files_s3( - # identifier, files=files, base_path=local_path, type="files" - # ) - # - # if plots: - # plots = self._config.upload_files_s3( - # identifier, files=plots, base_path=local_path, type="plots" - # ) - with Session(self._sa_engine) as session: - bed_object = session.scalar(statement) + bed_statement = select(Bed).where(and_(Bed.id == identifier)) + bed_object = session.scalar(bed_statement) - setattr(bed_object, **stats.model_dump()) - setattr(bed_object, **classification.model_dump()) + self._update_classification( + sa_session=session, bed_object=bed_object, classification=classification + ) - bed_object.indexed = add_to_qdrant - bed_object.pephub = upload_pephub + self._update_metadata( + sa_session=session, + bed_object=bed_object, + bed_metadata=bed_metadata, + ) + self._update_stats(sa_session=session, bed_object=bed_object, stats=stats) if upload_s3: - _LOGGER.warning("S3 upload is not implemented yet") - # for k, v in files: - # if v: - # new_file = Files( - # **v.model_dump(exclude_none=True, exclude_unset=True), - # bedfile_id=identifier, - # type="file", - # ) - # session.add(new_file) - # for k, v in plots: - # if v: - # new_plot = Files( - # **v.model_dump(exclude_none=True, exclude_unset=True), - # bedfile_id=identifier, - # type="plot", - # ) - # session.add(new_plot) + self._update_plots( + sa_session=session, + bed_object=bed_object, + plots=plots, + local_path=local_path, + ) + self._update_files( + sa_session=session, + bed_object=bed_object, + files=files, + local_path=local_path, + ) + + self._update_ref_validation( + sa_session=session, bed_object=bed_object, ref_validation=ref_validation + ) + + bed_object.processed = processed + bed_object.indexed = upload_qdrant + bed_object.last_update_date = datetime.datetime.now(datetime.timezone.utc) session.commit() - raise NotImplementedError + return None + + @staticmethod + def _update_classification( + sa_session: Session, bed_object: Bed, classification: BedClassification + ) -> None: + """ + Update bed file classification + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param classification: bed file classification as BedClassification object + + :return: None + """ + classification_dict = classification.model_dump( + exclude_defaults=True, exclude_none=True, exclude_unset=True + ) + for k, v in classification_dict.items(): + setattr(bed_object, k, v) + + sa_session.commit() + + @staticmethod + def _update_stats( + sa_session: Session, bed_object: Bed, stats: BedStatsModel + ) -> None: + """ + Update bed file statistics + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param stats: bed file statistics as BedStatsModel object + :return: None + """ + + stats_dict = stats.model_dump( + exclude_defaults=True, exclude_none=True, exclude_unset=True + ) + if not bed_object.stats: + new_bedstat = BedStats(**stats.model_dump(), id=bed_object.id) + sa_session.add(new_bedstat) + else: + for k, v in stats_dict.items(): + setattr(bed_object.stats, k, v) + + sa_session.commit() + + @staticmethod + def _update_metadata( + sa_session: Session, bed_object: Bed, bed_metadata: StandardMeta + ) -> None: + """ + Update bed file metadata + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param bed_metadata: bed file metadata as StandardMeta object + + :return: None + """ + + metadata_dict = bed_metadata.model_dump( + exclude_defaults=True, exclude_none=True, exclude_unset=True + ) + if not bed_object.annotations: + new_metadata = BedMetadata( + **bed_metadata.model_dump(exclude={"description"}), id=bed_object.id + ) + sa_session.add(new_metadata) + else: + for k, v in metadata_dict.items(): + setattr(bed_object.annotations, k, v) + + sa_session.commit() + + def _update_plots( + self, + sa_session: Session, + bed_object: Bed, + plots: BedPlots, + local_path: str = None, + ) -> None: + """ + Update bed file plots + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param plots: bed file plots + :param local_path: local path to the output files + """ + + _LOGGER.info("Updating bed file plots..") + if plots: + plots = self._config.upload_files_s3( + bed_object.id, files=plots, base_path=local_path, type="plots" + ) + plots_dict = plots.model_dump( + exclude_defaults=True, exclude_none=True, exclude_unset=True + ) + if not plots_dict: + return None + + for k, v in plots: + if v: + new_plot = Files( + **v.model_dump( + exclude_none=True, + exclude_unset=True, + exclude={"object_id", "access_methods"}, + ), + bedfile_id=bed_object.id, + type="plot", + ) + try: + sa_session.add(new_plot) + sa_session.commit() + except IntegrityError as _: + sa_session.rollback() + _LOGGER.debug( + f"Plot with name: {v.name} already exists. Updating.." + ) + + return None + + def _update_files( + self, + sa_session: Session, + bed_object: Bed, + files: BedFiles, + local_path: str = None, + ) -> None: + """ + Update bed files + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param files: bed file files + """ + + _LOGGER.info("Updating bed files..") + if files: + files = self._config.upload_files_s3( + bed_object.id, files=files, base_path=local_path, type="files" + ) + + files_dict = files.model_dump( + exclude_defaults=True, exclude_none=True, exclude_unset=True + ) + if not files_dict: + return None + + for k, v in files: + if v: + new_file = Files( + **v.model_dump( + exclude_none=True, + exclude_unset=True, + exclude={"object_id", "access_methods"}, + ), + bedfile_id=bed_object.id, + type="file", + ) + + try: + sa_session.add(new_file) + sa_session.commit() + except IntegrityError as _: + sa_session.rollback() + _LOGGER.debug( + f"File with name: {v.name} already exists. Updating.." + ) + + @staticmethod + def _update_ref_validation( + sa_session: Session, bed_object: Bed, ref_validation: Dict[str, BaseModel] + ) -> None: + """ + Update reference validation data + + :param sa_session: sqlalchemy session + :param bed_object: bed sqlalchemy object + :param ref_validation: bed file metadata + """ + + if not ref_validation: + return None + + _LOGGER.info("Updating reference validation data..") + + for ref_gen_check, data in ref_validation.items(): + new_gen_ref = GenomeRefStats( + **RefGenValidModel( + **data.model_dump(), + provided_genome=bed_object.genome_alias, + compared_genome=ref_gen_check, + ).model_dump(), + bed_id=bed_object.id, + ) + try: + sa_session.add(new_gen_ref) + sa_session.commit() + except IntegrityError as _: + sa_session.rollback() + _LOGGER.info( + f"Reference validation exists for BED id: {bed_object.id} and ref_gen_check." + ) + + return None def delete(self, identifier: str) -> None: """ diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 3c6ceb8..79ac6b7 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -3,18 +3,18 @@ from geniml.io.utils import compute_md5sum_bedset from sqlalchemy import Float, Numeric, func, or_, select -from sqlalchemy.orm import Session, relationship from sqlalchemy.exc import IntegrityError +from sqlalchemy.orm import Session, relationship from bbconf.config_parser import BedBaseConfig from bbconf.const import PKG_NAME from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files from bbconf.exceptions import ( BedBaseConfError, + BEDFileNotFoundError, BedSetExistsError, BedSetNotFoundError, BedSetTrackHubLimitError, - BEDFileNotFoundError, ) from bbconf.models.bed_models import BedStatsModel, StandardMeta from bbconf.models.bedset_models import ( From 3c146f5b862e85c697a5a6f8f9671f925e61d1dd Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 3 Dec 2024 14:50:04 -0500 Subject: [PATCH 11/28] updated requirements --- bbconf/_version.py | 2 +- docs/bedbase_overview.svg | 2757 ++++++++++++++++++++++++++++++++++++- 2 files changed, 2728 insertions(+), 31 deletions(-) diff --git a/bbconf/_version.py b/bbconf/_version.py index bfbf79e..8e3b533 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.9.1_dev0" +__version__ = "0.9.1_dev1" diff --git a/docs/bedbase_overview.svg b/docs/bedbase_overview.svg index d775db2..1faf6b0 100644 --- a/docs/bedbase_overview.svg +++ b/docs/bedbase_overview.svg @@ -6,8 +6,8 @@ BBclientBED2BED searchText2BED searchOther toolsGEO + sodipodi:nodetypes="cccccccccc" />BedmakerBedstat.bed.bed.bedBedbuncherOther tools From 590ee706bd83f7af1e5cf5e5c66c5d244d2a3b72 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 3 Dec 2024 14:51:13 -0500 Subject: [PATCH 12/28] lint --- bbconf/bbagent.py | 3 ++- bbconf/config_parser/bedbaseconfig.py | 1 + requirements/requirements-all.txt | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 8ce3b7b..c41f350 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -18,6 +18,7 @@ _LOGGER = logging.getLogger(PKG_NAME) + class BedBaseAgent(object): def __init__( self, @@ -53,7 +54,7 @@ def __repr__(self) -> str: repr += f"\n{self.bed}" repr += f"\n{self.bedset}" repr += f"\n{self.objects}" - return repr + return repr def get_stats(self) -> StatsReturn: """ diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index cf9eaad..8bc0a49 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -45,6 +45,7 @@ class BedBaseConfig(object): """ Class to handle BEDbase configuration file and create objects for different modules. """ + def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True): _LOGGER.info(f"Loading configuration file: {config}") self.cfg_path = get_bedbase_cfg(config) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index cf4f5cf..a202b34 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ yacman >= 0.9.1 sqlalchemy >= 2.0.0 -geniml[ml] >= 0.5.1 +geniml[ml] >= 0.5.2 psycopg >= 3.1.15 colorlogs pydantic >= 2.9.0 From af026e12b13e7d43b2fc4c69e678d1547820ee94 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 9 Dec 2024 12:41:27 -0500 Subject: [PATCH 13/28] updated for partial_processing --- bbconf/bbagent.py | 12 ++++---- bbconf/config_parser/bedbaseconfig.py | 41 +++++++++++++++++++-------- bbconf/helpers.py | 2 ++ bbconf/modules/bedfiles.py | 1 + 4 files changed, 39 insertions(+), 17 deletions(-) diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index c41f350..1ea6d26 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -23,18 +23,20 @@ class BedBaseAgent(object): def __init__( self, config: Union[Path, str], + init_ml: bool = True, ): """ Initialize connection to the pep_db database. You can use the basic connection parameters or libpq connection string. + + :param config: path to the configuration file + :param init_ml: initialize ML models for search (default: True) """ - _LOGGER.info(f"Initializing BedBaseConfig object") - self.config = BedBaseConfig(config) - _LOGGER.info(f"Initializing BedBaseAgent object") + + self.config = BedBaseConfig(config, init_ml) + self._bed = BedAgentBedFile(self.config, self) - _LOGGER.info(f"Initializing BedAgentBedSet object") self._bedset = BedAgentBedSet(self.config) - _LOGGER.info(f"Initializing BBObjects object") self._objects = BBObjects(self.config) @property diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 8bc0a49..0a79758 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -46,28 +46,34 @@ class BedBaseConfig(object): Class to handle BEDbase configuration file and create objects for different modules. """ - def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True): - _LOGGER.info(f"Loading configuration file: {config}") + def __init__(self, config: Union[Path, str], init_ml: bool = True): + """ + Initialize BedBaseConfig object + + :param config: path to the configuration file + :param init_ml: initialize machine learning models used for search + """ + self.cfg_path = get_bedbase_cfg(config) self._config = self._read_config_file(self.cfg_path) - - _LOGGER.info(f"Initializing database engine...") self._db_engine = self._init_db_engine() - _LOGGER.info(f"Initializing qdrant engine...") - self._qdrant_engine = self._init_qdrant_backend() - _LOGGER.info(f"Initializing qdrant text engine...") + self._qdrant_engine = self._init_qdrant_backend() self._qdrant_text_engine = self._init_qdrant_text_backend() - if init_search_interfaces: - _LOGGER.info(f"Initializing search interfaces...") + if init_ml: self._b2bsi = self._init_b2bsi_object() - _LOGGER.info(f"Initializing R2V object...") self._r2v = self._init_r2v_object() - _LOGGER.info(f"Initializing Bivec object...") self._bivec = self._init_bivec_object() + else: + _LOGGER.info( + f"Skipping initialization of ML models, init_ml parameter set to False." + ) + + self._b2bsi = None + self._r2v = None + self._bivec = None - _LOGGER.info(f"Initializing PEPHub client...") self._phc = self._init_pephubclient() self._boto3_client = self._init_boto3_client() @@ -197,6 +203,11 @@ def zarr_root(self) -> Union[Z_GROUP, None]: return zarr.group(store=cache, overwrite=False) def _init_db_engine(self) -> BaseEngine: + """ + Create database engine object using credentials provided in config file + """ + + _LOGGER.info(f"Initializing database engine...") return BaseEngine( host=self._config.database.host, port=self._config.database.port, @@ -212,6 +223,8 @@ def _init_qdrant_backend(self) -> QdrantBackend: :return: QdrantClient """ + + _LOGGER.info(f"Initializing qdrant engine...") try: return QdrantBackend( collection=self._config.qdrant.file_collection, @@ -232,6 +245,7 @@ def _init_qdrant_text_backend(self) -> QdrantBackend: :return: QdrantClient """ + _LOGGER.info(f"Initializing qdrant text engine...") return QdrantBackend( dim=TEXT_EMBEDDING_DIMENSION, collection=self.config.qdrant.text_collection, @@ -264,6 +278,7 @@ def _init_b2bsi_object(self) -> Union[BED2BEDSearchInterface, None]: :return: Bed2BEDSearchInterface object """ try: + _LOGGER.info(f"Initializing search interfaces...") return BED2BEDSearchInterface( backend=self.qdrant_engine, query2vec=BED2Vec(model=self._config.path.region2vec), @@ -284,6 +299,7 @@ def _init_pephubclient() -> Union[PEPHubClient, None]: :return: PephubClient """ try: + _LOGGER.info(f"Initializing PEPHub client...") return PEPHubClient() except Exception as e: _LOGGER.error(f"Error in creating PephubClient object: {e}") @@ -315,6 +331,7 @@ def _init_r2v_object(self) -> Union[Region2VecExModel, None]: Create Region2VecExModel object using credentials provided in config file """ try: + _LOGGER.info(f"Initializing R2V object...") return Region2VecExModel(self.config.path.region2vec) except Exception as e: _LOGGER.error(f"Error in creating Region2VecExModel object: {e}") diff --git a/bbconf/helpers.py b/bbconf/helpers.py index 47fe67c..4cb3792 100644 --- a/bbconf/helpers.py +++ b/bbconf/helpers.py @@ -22,6 +22,8 @@ def get_bedbase_cfg(cfg: str = None) -> str: Optional, the $BEDBASE config env var will be used if not provided :return str: absolute configuration file path """ + + _LOGGER.info(f"Loading configuration file: {cfg}") selected_cfg = select_config(config_filepath=cfg, config_env_vars=CFG_ENV_VARS) if not selected_cfg: raise BedBaseConnectionError( diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 2eec12b..67b96a1 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -518,6 +518,7 @@ def add( _LOGGER.warning( f"Could not upload to pephub. Error: {e}. nofail: {nofail}" ) + upload_pephub = False if not nofail: raise e else: From e464003d9b52bb864c0c519251dcd9e676a13f66 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 15:38:37 -0500 Subject: [PATCH 14/28] updated tests and fixed uploading files --- .github/workflows/run-pytest.yml | 16 +++++++++------- bbconf/_version.py | 2 +- bbconf/config_parser/bedbaseconfig.py | 16 +++++++++------- bbconf/modules/bedfiles.py | 23 ++++++++++++----------- tests/utils.py | 2 +- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 8cd5382..0d37c1e 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -12,7 +12,7 @@ jobs: pytest: strategy: matrix: - python-version: ["3.9", "3.11"] + python-version: ["3.9", "3.12"] os: [ubuntu-latest] # can't use macOS when using service containers or container jobs runs-on: ${{ matrix.os }} services: @@ -27,21 +27,23 @@ jobs: - 5432:5432 options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 steps: - - uses: actions/checkout@v2 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install uv + run: pip install uv + - name: Install dev dependencies - run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi + run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt; fi - name: Install test dependencies run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi - name: Install package - run: python -m pip install . + run: python -m uv pip install . - name: Run pytest tests run: pytest tests -x -vv \ No newline at end of file diff --git a/bbconf/_version.py b/bbconf/_version.py index 8e3b533..61fb31c 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.9.1_dev1" +__version__ = "0.10.0" diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 8bc0a49..c1ec715 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -231,13 +231,15 @@ def _init_qdrant_text_backend(self) -> QdrantBackend: :return: QdrantClient """ - - return QdrantBackend( - dim=TEXT_EMBEDDING_DIMENSION, - collection=self.config.qdrant.text_collection, - qdrant_host=self.config.qdrant.host, - qdrant_api_key=self.config.qdrant.api_key, - ) + try: + return QdrantBackend( + dim=TEXT_EMBEDDING_DIMENSION, + collection=self.config.qdrant.text_collection, + qdrant_host=self.config.qdrant.host, + qdrant_api_key=self.config.qdrant.api_key, + ) + except Exception as e: + _LOGGER.error(f"Error while connecting to qdrant text engine: {e}") def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]: """ diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index e4691ac..3716178 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -446,7 +446,7 @@ def add( plots: dict = None, files: dict = None, classification: dict = None, - ref_validation: Dict[str, BaseModel] = None, + ref_validation: Union[Dict[str, BaseModel], None] = None, license_id: str = DEFAULT_LICENSE, upload_qdrant: bool = False, upload_pephub: bool = False, @@ -590,16 +590,17 @@ def add( session.add(new_bedstat) session.add(new_metadata) - for ref_gen_check, data in ref_validation.items(): - new_gen_ref = GenomeRefStats( - **RefGenValidModel( - **data.model_dump(), - provided_genome=classification.genome_alias, - compared_genome=ref_gen_check, - ).model_dump(), - bed_id=identifier, - ) - session.add(new_gen_ref) + if ref_validation: + for ref_gen_check, data in ref_validation.items(): + new_gen_ref = GenomeRefStats( + **RefGenValidModel( + **data.model_dump(), + provided_genome=classification.genome_alias, + compared_genome=ref_gen_check, + ).model_dump(), + bed_id=identifier, + ) + session.add(new_gen_ref) session.commit() return None diff --git a/tests/utils.py b/tests/utils.py index 61410b2..7ceb072 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -109,7 +109,7 @@ def __enter__(self): self._add_bedset_data() def __exit__(self, exc_type, exc_value, exc_traceback): - # self.db_engine.delete_schema() + self.db_engine.delete_schema() pass def _add_data(self): From d4bf110e97e33bb2383162894325518cac8f1b24 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 15:40:30 -0500 Subject: [PATCH 15/28] updated uv in tests --- .github/workflows/run-pytest.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 0d37c1e..730743b 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -37,13 +37,13 @@ jobs: run: pip install uv - name: Install dev dependencies - run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt; fi + run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt --system; fi - name: Install package - run: python -m uv pip install . + run: python -m uv pip install . --system - name: Run pytest tests run: pytest tests -x -vv \ No newline at end of file From fa91690453191726366e73ee87a71ec9442f7c62 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 15:42:48 -0500 Subject: [PATCH 16/28] updated installation --- .github/workflows/run-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 730743b..e40883d 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -43,7 +43,7 @@ jobs: run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt --system; fi - name: Install package - run: python -m uv pip install . --system + run: uv pip install . --system - name: Run pytest tests run: pytest tests -x -vv \ No newline at end of file From 3899882c189503b71b0da0ba64393c2906836c7c Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 15:45:47 -0500 Subject: [PATCH 17/28] updated pip --- .github/workflows/run-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index e40883d..876596f 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -40,7 +40,7 @@ jobs: run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt --system; fi + run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi - name: Install package run: uv pip install . --system From a8d48f7064eceb6dcae72546c6174fc916093f8a Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 15:50:08 -0500 Subject: [PATCH 18/28] lower version of pybiocfilecache --- requirements/requirements-all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index a202b34..716e509 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -12,3 +12,4 @@ zarr pyyaml >= 6.0.1 # for s3fs because of the errors s3fs >= 2024.3.1 pandas >= 2.0.0 +pybiocfilecache < 0.5.0 From bcb20f8694bc16321f9b2c5d240a7537bbd31525 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 11 Dec 2024 16:05:36 -0500 Subject: [PATCH 19/28] SERVICE_UNAVAILABLE TRUE --- tests/conftest.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2a74b35..773aaa6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,16 +16,17 @@ -p 5432:5432 postgres """ -try: - subprocess.check_output( - "docker inspect bedbase-test --format '{{.State.Status}}'", shell=True - ) - SERVICE_UNAVAILABLE = False -except: - register( - print, f"Some tests require a test database. To initiate it, run:\n{DB_CMD}" - ) - SERVICE_UNAVAILABLE = True +# try: +# subprocess.check_output( +# "docker inspect bedbase-test --format '{{.State.Status}}'", shell=True +# ) +# SERVICE_UNAVAILABLE = False +# except: +# register( +# print, f"Some tests require a test database. To initiate it, run:\n{DB_CMD}" +# ) +# SERVICE_UNAVAILABLE = True +SERVICE_UNAVAILABLE = False TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) From f9a55b2329c04bd8c04274dabef370930a29b244 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Sun, 15 Dec 2024 13:52:01 -0500 Subject: [PATCH 20/28] - added tests - bug fixes and robustness - added get_unprocessed --- bbconf/config_parser/bedbaseconfig.py | 21 ++++-- bbconf/models/bed_models.py | 2 +- bbconf/modules/bedfiles.py | 100 ++++++++++++++++++++------ tests/conftest.py | 21 +++--- tests/test_bedfile.py | 45 ++++++++++-- tests/utils.py | 2 + 6 files changed, 142 insertions(+), 49 deletions(-) diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 0a79758..64b2200 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -238,7 +238,7 @@ def _init_qdrant_backend(self) -> QdrantBackend: f"error in Connection to qdrant! skipping... Error: {err}", UserWarning ) - def _init_qdrant_text_backend(self) -> QdrantBackend: + def _init_qdrant_text_backend(self) -> Union[QdrantBackend, None]: """ Create qdrant client text embedding object using credentials provided in config file @@ -246,12 +246,19 @@ def _init_qdrant_text_backend(self) -> QdrantBackend: """ _LOGGER.info(f"Initializing qdrant text engine...") - return QdrantBackend( - dim=TEXT_EMBEDDING_DIMENSION, - collection=self.config.qdrant.text_collection, - qdrant_host=self.config.qdrant.host, - qdrant_api_key=self.config.qdrant.api_key, - ) + try: + return QdrantBackend( + dim=TEXT_EMBEDDING_DIMENSION, + collection=self.config.qdrant.text_collection, + qdrant_host=self.config.qdrant.host, + qdrant_api_key=self.config.qdrant.api_key, + ) + except Exception as _: + _LOGGER.error("Error in Connection to qdrant text! skipping...") + warnings.warn( + "Error in Connection to qdrant text! skipping...", UserWarning + ) + return None def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]: """ diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 4ba7118..2545d5d 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -75,7 +75,7 @@ class BedStatsModel(BaseModel): class BedPEPHub(BaseModel): - sample_name: str + sample_name: str = "" genome: str = "" organism: str = "" species_id: str = "" diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index dbe6dbb..d1dd97b 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -614,12 +614,12 @@ def add( def update( self, identifier: str, - stats: dict, - metadata: dict = None, - plots: dict = None, - files: dict = None, - classification: dict = None, - ref_validation: Dict[str, BaseModel] = None, + stats: Union[dict, None] = None, + metadata: Union[dict, None] = None, + plots: Union[dict, None] = None, + files: Union[dict, None] = None, + classification: Union[dict, None] = None, + ref_validation: Union[Dict[str, BaseModel], None] = None, license_id: str = DEFAULT_LICENSE, upload_qdrant: bool = True, upload_pephub: bool = True, @@ -663,11 +663,11 @@ def update( f"List of licenses: {self.bb_agent.list_of_licenses}" ) - stats = BedStatsModel(**stats) - plots = BedPlots(**plots) - files = BedFiles(**files) - bed_metadata = StandardMeta(**metadata) - classification = BedClassification(**classification) + stats = BedStatsModel(**stats if stats else {}) + plots = BedPlots(**plots if plots else {}) + files = BedFiles(**files if files else {}) + bed_metadata = StandardMeta(**metadata if metadata else {}) + classification = BedClassification(**classification if classification else {}) if upload_pephub: metadata = BedPEPHub(**metadata) @@ -978,16 +978,19 @@ def upload_pephub(self, identifier: str, metadata: dict, overwrite: bool = False ) def update_pephub(self, identifier: str, metadata: dict, overwrite: bool = False): - if not metadata: - _LOGGER.warning("No metadata provided. Skipping pephub upload..") - return False - self._config.phc.sample.update( - namespace=self._config.config.phc.namespace, - name=self._config.config.phc.name, - tag=self._config.config.phc.tag, - sample_name=identifier, - sample_dict=metadata, - ) + try: + if not metadata: + _LOGGER.warning("No metadata provided. Skipping pephub upload..") + return False + self._config.phc.sample.update( + namespace=self._config.config.phc.namespace, + name=self._config.config.phc.name, + tag=self._config.config.phc.tag, + sample_name=identifier, + sample_dict=metadata, + ) + except ResponseError as e: + _LOGGER.warning(f"Could not update pephub. Error: {e}") def delete_pephub_sample(self, identifier: str): """ @@ -1023,6 +1026,10 @@ def upload_file_qdrant( """ _LOGGER.debug(f"Adding bed file to qdrant. bed_id: {bed_id}") + + if not self._qdrant_engine: + raise QdrantInstanceNotInitializedError("Could not upload file.") + bed_embedding = self._embed_file(bed_file) self._qdrant_engine.load( @@ -1559,3 +1566,54 @@ def get_missing_plots( results = [result for result in results] return results + + def get_unprocessed(self, limit: int = 1000, offset: int = 0) -> BedListResult: + """ + Get bed files that are not processed. + + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file identifiers + """ + with Session(self._sa_engine) as session: + query = ( + select(Bed).where(Bed.processed.is_(False)).limit(limit).offset(offset) + ) + count_query = select(func.count()).where(Bed.processed.is_(False)) + + count = session.execute(count_query).one()[0] + + bed_results = session.scalars(query) + + results = [] + for bed_object in bed_results: + results.append( + BedMetadataBasic( + id=bed_object.id, + name=bed_object.name, + genome_alias=bed_object.genome_alias, + genome_digest=bed_object.genome_digest, + bed_type=bed_object.bed_type, + bed_format=bed_object.bed_format, + description=bed_object.description, + annotation=StandardMeta( + **( + bed_object.annotations.__dict__ + if bed_object.annotations + else {} + ) + ), + last_update_date=bed_object.last_update_date, + submission_date=bed_object.submission_date, + is_universe=bed_object.is_universe, + license_id=bed_object.license_id, + ) + ) + + return BedListResult( + count=count, + limit=limit, + offset=offset, + results=results, + ) diff --git a/tests/conftest.py b/tests/conftest.py index 773aaa6..2f269aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,19 +16,6 @@ -p 5432:5432 postgres """ -# try: -# subprocess.check_output( -# "docker inspect bedbase-test --format '{{.State.Status}}'", shell=True -# ) -# SERVICE_UNAVAILABLE = False -# except: -# register( -# print, f"Some tests require a test database. To initiate it, run:\n{DB_CMD}" -# ) -# SERVICE_UNAVAILABLE = True -SERVICE_UNAVAILABLE = False - - TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) CONFIG_PATH = os.path.join( @@ -40,6 +27,14 @@ "data", ) +# try: +# BedBaseAgent(config=CONFIG_PATH) +# SERVICE_UNAVAILABLE = False +# except Exception as _: +# SERVICE_UNAVAILABLE = True +SERVICE_UNAVAILABLE = False + + if not SERVICE_UNAVAILABLE: agent = BedBaseAgent(config=CONFIG_PATH) diff --git a/tests/test_bedfile.py b/tests/test_bedfile.py index 07489c7..9875b78 100644 --- a/tests/test_bedfile.py +++ b/tests/test_bedfile.py @@ -204,13 +204,44 @@ def test_bed_delete_not_found(self, bbagent_obj): with pytest.raises(BEDFileNotFoundError): bbagent_obj.bed.delete("not_found") - @pytest.mark.skip("Skipped, not fully implemented") - def test_bed_update(self): - # agent = BedBaseAgent(config=config) - # ff = agent.bed.update("91b2754c8ff01769bacfc80e6923c46e", {"number_of_regions": 44}) - # print(ff) - # assert ff != None - pass + def test_bed_update(self, bbagent_obj): + + # TODO: has to be expanded + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + + bed_file = bbagent_obj.bed.get(BED_TEST_ID, full=True) + # assert bed_file.annotation.model_dump(exclude_defaults=True) == {} + assert bed_file.annotation.cell_line == "" + + new_metadata = { + "cell_line": "K562", + "tissue": "blood", + } + bbagent_obj.bed.update( + identifier=BED_TEST_ID, + metadata=new_metadata, + upload_qdrant=False, + upload_s3=False, + ) + + new_bed_file = bbagent_obj.bed.get(BED_TEST_ID, full=True) + + assert new_bed_file.annotation.cell_line == "K562" + + def test_get_unprocessed(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_unprocessed(limit=100, offset=0) + + assert return_result.count == 1 + assert return_result.results[0].id == BED_TEST_ID + + def test_get_missing_plots(self, bbagent_obj): + with ContextManagerDBTesting(config=bbagent_obj.config, add_data=True): + return_result = bbagent_obj.bed.get_missing_plots( + "tss_distance", limit=100, offset=0 + ) + + assert return_result[0] == BED_TEST_ID @pytest.mark.skip("Skipped, because ML models and qdrant needed") diff --git a/tests/utils.py b/tests/utils.py index 7ceb072..20a6ecc 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -39,6 +39,7 @@ def get_example_dict() -> dict: "genome_alias": "hg38", "genome_digest": "2230c535660fb4774114bfa966a62f823fdb6d21acf138d4", "name": "random_name", + "processed": False, } return value @@ -109,6 +110,7 @@ def __enter__(self): self._add_bedset_data() def __exit__(self, exc_type, exc_value, exc_traceback): + # If we want to keep data, and schema, comment out the following line self.db_engine.delete_schema() pass From 101849684676dfc7cfd7038648b46fe471ff5b29 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Mon, 16 Dec 2024 12:35:54 -0500 Subject: [PATCH 21/28] - added processed table to bedsets --- bbconf/db_utils.py | 4 +++ bbconf/modules/bedsets.py | 57 +++++++++++++++++++++++++++++++++++++++ tests/test_bedset.py | 7 +++++ tests/utils.py | 1 + 4 files changed, 69 insertions(+) diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 3c5e858..aff568d 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -311,6 +311,10 @@ class BedSets(Base): author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset") source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset") + processed: Mapped[bool] = mapped_column( + default=False, comment="Whether the bedset was processed" + ) + class Universes(Base): __tablename__ = "universes" diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 79ac6b7..7d99105 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -291,6 +291,7 @@ def create( local_path: str = "", no_fail: bool = False, overwrite: bool = False, + processed: bool = True, ) -> None: """ Create bedset in the database. @@ -307,6 +308,7 @@ def create( :param local_path: local path to the output files :param no_fail: do not raise an error if bedset already exists :param overwrite: overwrite the record in the database + :param processed: flag to indicate that bedset is processed. [Default: True] :return: None """ _LOGGER.info(f"Creating bedset '{identifier}'") @@ -347,6 +349,7 @@ def create( md5sum=compute_md5sum_bedset(bedid_list), author=annotation.get("author"), source=annotation.get("source"), + processed=processed, ) if upload_s3: @@ -599,6 +602,60 @@ def exists(self, identifier: str) -> bool: return True return False + def get_unprocessed(self, limit: int = 100, offset: int = 0) -> BedSetListResult: + """ + Get unprocessed bedset from the database. + + :param limit: limit of results + :param offset: offset of results + + :return: bedset metadata + """ + + with Session(self._db_engine.engine) as session: + + statement = ( + select(BedSets) + .where(BedSets.processed.is_(False)) + .limit(limit) + .offset(offset) + ) + count_statement = select(func.count()).where(BedSets.processed.is_(False)) + + count = session.execute(count_statement).one()[0] + + bedset_object_list = session.scalars(statement) + + results = [] + + for bedset_obj in bedset_object_list: + list_of_bedfiles = [ + bedset_obj.bedfile_id for bedset_obj in bedset_obj.bedfiles + ] + + results.append( + BedSetMetadata( + id=bedset_obj.id, + name=bedset_obj.name, + description=bedset_obj.description, + md5sum=bedset_obj.md5sum, + statistics=None, + plots=None, + bed_ids=list_of_bedfiles, + submission_date=bedset_obj.submission_date, + last_update_date=bedset_obj.last_update_date, + author=bedset_obj.author, + source=bedset_obj.source, + ) + ) + + return BedSetListResult( + count=count, + limit=limit, + offset=offset, + results=results, + ) + def add_bedfile(self, identifier: str, bedfile: str) -> None: raise NotImplementedError diff --git a/tests/test_bedset.py b/tests/test_bedset.py index e5b03da..ecc8948 100644 --- a/tests/test_bedset.py +++ b/tests/test_bedset.py @@ -204,3 +204,10 @@ def test_delete_s3_error(self, bbagent_obj): ): with pytest.raises(BedbaseS3ConnectionError): bbagent_obj.bedset.delete(BEDSET_TEST_ID) + + def test_retrieve_unprocessed(self, bbagent_obj): + with ContextManagerDBTesting( + config=bbagent_obj.config, add_data=True, bedset=True + ): + result = bbagent_obj.bedset.get_unprocessed() + assert result.count == 1 diff --git a/tests/utils.py b/tests/utils.py index 20a6ecc..e1be271 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -137,6 +137,7 @@ def _add_bedset_data(self): bedset_means=stats, bedset_standard_deviation=stats, md5sum="bbad0000000000000000000000000000", + processed=False, ) new_bed_bedset = BedFileBedSetRelation( bedfile_id=BED_TEST_ID, From 8134d8f4aa12866fa35fa6d54b98648af8b0eff1 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Tue, 17 Dec 2024 15:35:10 -0500 Subject: [PATCH 22/28] Fixed qdrant init error --- bbconf/modules/bedfiles.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index d1dd97b..a932be8 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -16,6 +16,7 @@ from tqdm import tqdm from bbconf.config_parser.bedbaseconfig import BedBaseConfig +from geniml.search.backends import QdrantBackend from bbconf.const import DEFAULT_LICENSE, PKG_NAME, ZARR_TOKENIZED_FOLDER from bbconf.db_utils import ( Bed, @@ -1027,7 +1028,7 @@ def upload_file_qdrant( _LOGGER.debug(f"Adding bed file to qdrant. bed_id: {bed_id}") - if not self._qdrant_engine: + if not isinstance(self._qdrant_engine, QdrantBackend): raise QdrantInstanceNotInitializedError("Could not upload file.") bed_embedding = self._embed_file(bed_file) @@ -1056,7 +1057,11 @@ def _embed_file(self, bed_file: Union[str, RegionSet]) -> np.ndarray: ) if isinstance(bed_file, str): - bed_region_set = GRegionSet(bed_file) + # Use try if file is corrupted. In Python RegionSet we have functionality to tackle this problem + try: + bed_region_set = GRegionSet(bed_file) + except RuntimeError as _: + bed_region_set = RegionSet(bed_file) elif isinstance(bed_file, RegionSet) or isinstance(bed_file, GRegionSet): bed_region_set = bed_file else: From c3b4bcf99b489650708b55a06f86e7dcf6420648 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Wed, 18 Dec 2024 09:38:04 -0500 Subject: [PATCH 23/28] Added get method for reference validation results --- bbconf/models/bed_models.py | 6 ++++++ bbconf/modules/bedfiles.py | 40 +++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 2545d5d..745a5ca 100644 --- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -233,3 +233,9 @@ class RefGenValidModel(BaseModel): tier_ranking: int model_config = ConfigDict(extra="forbid") + + +class RefGenValidReturnModel(BaseModel): + id: str + provided_genome: Union[str, None] = None + compared_genome: List[RefGenValidModel] diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index a932be8..9cef005 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -52,6 +52,7 @@ FileModel, QdrantSearchResult, RefGenValidModel, + RefGenValidReturnModel, StandardMeta, TokenizedBedResponse, TokenizedPathResponse, @@ -441,6 +442,45 @@ def get_ids_list( results=result_list, ) + def get_reference_validation(self, identifier: str) -> RefGenValidReturnModel: + """ + Get results of reference genome validation for the bed file. + + :param identifier: bed file identifier + :return: reference genome validation results + """ + + if not self.exists(identifier): + raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.") + + with Session(self._sa_engine) as session: + statement = select(GenomeRefStats).where( + GenomeRefStats.bed_id == identifier + ) + + results = session.scalars(statement) + + result_list = [] + + for result in results: + result_list.append( + RefGenValidModel( + provided_genome=result.provided_genome, + compared_genome=result.compared_genome, + xs=result.xs, + oobr=result.oobr, + sequence_fit=result.sequence_fit, + assigned_points=result.assigned_points, + tier_ranking=result.tier_ranking, + ) + ) + + return RefGenValidReturnModel( + id=identifier, + provided_genome=result.provided_genome, + compared_genome=result_list, + ) + def add( self, identifier: str, From 4c9891b3242f87c38e6d67613067ea17242d76a3 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Thu, 19 Dec 2024 12:50:45 -0500 Subject: [PATCH 24/28] Fixed Donald comments for PR --- bbconf/config_parser/bedbaseconfig.py | 4 ++-- bbconf/modules/bedfiles.py | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/bbconf/config_parser/bedbaseconfig.py b/bbconf/config_parser/bedbaseconfig.py index 64b2200..204af1a 100644 --- a/bbconf/config_parser/bedbaseconfig.py +++ b/bbconf/config_parser/bedbaseconfig.py @@ -253,8 +253,8 @@ def _init_qdrant_text_backend(self) -> Union[QdrantBackend, None]: qdrant_host=self.config.qdrant.host, qdrant_api_key=self.config.qdrant.api_key, ) - except Exception as _: - _LOGGER.error("Error in Connection to qdrant text! skipping...") + except Exception as e: + _LOGGER.error(f"Error in Connection to qdrant text! skipping {e}") warnings.warn( "Error in Connection to qdrant text! skipping...", UserWarning ) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 9cef005..c04a547 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -947,6 +947,8 @@ def _update_ref_validation( """ Update reference validation data + ! This function won't update the reference validation data, if it exists, it will skip it. + :param sa_session: sqlalchemy session :param bed_object: bed sqlalchemy object :param ref_validation: bed file metadata @@ -1018,11 +1020,13 @@ def upload_pephub(self, identifier: str, metadata: dict, overwrite: bool = False overwrite=overwrite, ) - def update_pephub(self, identifier: str, metadata: dict, overwrite: bool = False): + def update_pephub( + self, identifier: str, metadata: dict, overwrite: bool = False + ) -> None: try: if not metadata: _LOGGER.warning("No metadata provided. Skipping pephub upload..") - return False + return None self._config.phc.sample.update( namespace=self._config.config.phc.namespace, name=self._config.config.phc.name, From a6d51190c058e4d938f0b08271d40bbc91cb102f Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Dec 2024 13:37:01 -0500 Subject: [PATCH 25/28] improved reindexing --- bbconf/modules/bedfiles.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index c04a547..047d1ec 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -14,6 +14,7 @@ from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import Session, aliased from tqdm import tqdm +from qdrant_client.http.models import PointStruct from bbconf.config_parser.bedbaseconfig import BedBaseConfig from geniml.search.backends import QdrantBackend @@ -1257,7 +1258,7 @@ def _sql_search_count(self, query: str) -> int: count = session.execute(statement).one() return count[0] - def reindex_qdrant(self) -> None: + def reindex_qdrant(self, batch: int = 1000) -> None: """ Re-upload all files to quadrant. !Warning: only hg38 genome can be added to qdrant! @@ -1265,6 +1266,8 @@ def reindex_qdrant(self) -> None: If you want to fully reindex/reupload to qdrant, first delete collection and create new one. Upload all files to qdrant. + + :param batch: number of files to upload in one batch """ bb_client = BBClient() @@ -1276,6 +1279,7 @@ def reindex_qdrant(self) -> None: results = annotation_result.results with tqdm(total=len(results), position=0, leave=True) as pbar: + points_list = [] for record in results: try: bed_region_set_obj = GRegionSet(bb_client.seek(record.id)) @@ -1284,14 +1288,28 @@ def reindex_qdrant(self) -> None: pbar.set_description(f"Processing file: {record.id}") - self.upload_file_qdrant( - bed_id=record.id, - bed_file=bed_region_set_obj, - payload=record.annotation.model_dump() if record.annotation else {}, + file_embedding = self._embed_file(bed_region_set_obj) + points_list.append( + PointStruct( + id=record.id, + vector=file_embedding.tolist()[0], + payload=( + record.annotation.model_dump() if record.annotation else {} + ), + ) ) - pbar.write(f"File: {record.id} uploaded to qdrant successfully.") + pbar.write(f"File: {record.id} successfully indexed.") pbar.update(1) + _LOGGER.info(f"Uploading points to qdrant using batches...") + for i in range(0, len(points_list), batch): + operation_info = self._config.qdrant_engine.qd_client.upsert( + collection_name=self._config.config.qdrant.file_collection, + points=points_list[i : i + batch], + ) + + assert operation_info.status == "completed" + return None def delete_qdrant_point(self, identifier: str) -> None: From 5b71c7a8e45195272ff12793a6570397a4b80dab Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 20 Dec 2024 16:22:15 -0500 Subject: [PATCH 26/28] improved reindexing batching --- bbconf/modules/bedfiles.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index 047d1ec..20f7bb6 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -1258,7 +1258,7 @@ def _sql_search_count(self, query: str) -> int: count = session.execute(statement).one() return count[0] - def reindex_qdrant(self, batch: int = 1000) -> None: + def reindex_qdrant(self, batch: int = 100) -> None: """ Re-upload all files to quadrant. !Warning: only hg38 genome can be added to qdrant! @@ -1271,7 +1271,9 @@ def reindex_qdrant(self, batch: int = 1000) -> None: """ bb_client = BBClient() - annotation_result = self.get_ids_list(limit=100000, genome=QDRANT_GENOME) + annotation_result = self.get_ids_list( + limit=100000, genome=QDRANT_GENOME, offset=0 + ) if not annotation_result.results: _LOGGER.error("No bed files found.") @@ -1280,6 +1282,7 @@ def reindex_qdrant(self, batch: int = 1000) -> None: with tqdm(total=len(results), position=0, leave=True) as pbar: points_list = [] + processed_number = 0 for record in results: try: bed_region_set_obj = GRegionSet(bb_client.seek(record.id)) @@ -1298,18 +1301,26 @@ def reindex_qdrant(self, batch: int = 1000) -> None: ), ) ) + processed_number += 1 + if processed_number % batch == 0: + pbar.set_description(f"Uploading points to qdrant using batch...") + operation_info = self._config.qdrant_engine.qd_client.upsert( + collection_name=self._config.config.qdrant.file_collection, + points=points_list, + ) + pbar.write("Uploaded batch to qdrant.") + points_list = [] + assert operation_info.status == "completed" + pbar.write(f"File: {record.id} successfully indexed.") pbar.update(1) _LOGGER.info(f"Uploading points to qdrant using batches...") - for i in range(0, len(points_list), batch): - operation_info = self._config.qdrant_engine.qd_client.upsert( - collection_name=self._config.config.qdrant.file_collection, - points=points_list[i : i + batch], - ) - - assert operation_info.status == "completed" - + operation_info = self._config.qdrant_engine.qd_client.upsert( + collection_name=self._config.config.qdrant.file_collection, + points=points_list, + ) + assert operation_info.status == "completed" return None def delete_qdrant_point(self, identifier: str) -> None: From f58c7108ac30a096eae6d692fa9a9c8191bfbd30 Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 3 Jan 2025 11:51:07 -0500 Subject: [PATCH 27/28] updated requirements --- requirements/requirements-all.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 716e509..f7c4351 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -1,6 +1,6 @@ yacman >= 0.9.1 sqlalchemy >= 2.0.0 -geniml[ml] >= 0.5.2 +geniml[ml] >= 0.6.0 psycopg >= 3.1.15 colorlogs pydantic >= 2.9.0 From 1549194b384d7fc78591b0e5e00a8e77fd2e204b Mon Sep 17 00:00:00 2001 From: Khoroshevskyi Date: Fri, 3 Jan 2025 11:53:15 -0500 Subject: [PATCH 28/28] updated requirements 2 --- requirements/requirements-all.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index f7c4351..cbe32bf 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -12,4 +12,3 @@ zarr pyyaml >= 6.0.1 # for s3fs because of the errors s3fs >= 2024.3.1 pandas >= 2.0.0 -pybiocfilecache < 0.5.0