Skip to content

Commit

Permalink
Merge pull request #72 from databio/partial_processing
Browse files Browse the repository at this point in the history
Bed file updating and partial processing
  • Loading branch information
khoroshevskyi authored Dec 19, 2024
2 parents bcb20f8 + 4c9891b commit 2d7403b
Show file tree
Hide file tree
Showing 11 changed files with 575 additions and 108 deletions.
12 changes: 7 additions & 5 deletions bbconf/bbagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,20 @@ class BedBaseAgent(object):
def __init__(
self,
config: Union[Path, str],
init_ml: bool = True,
):
"""
Initialize connection to the pep_db database. You can use the basic connection parameters
or libpq connection string.
:param config: path to the configuration file
:param init_ml: initialize ML models for search (default: True)
"""
_LOGGER.info(f"Initializing BedBaseConfig object")
self.config = BedBaseConfig(config)
_LOGGER.info(f"Initializing BedBaseAgent object")

self.config = BedBaseConfig(config, init_ml)

self._bed = BedAgentBedFile(self.config, self)
_LOGGER.info(f"Initializing BedAgentBedSet object")
self._bedset = BedAgentBedSet(self.config)
_LOGGER.info(f"Initializing BBObjects object")
self._objects = BBObjects(self.config)

@property
Expand Down
50 changes: 36 additions & 14 deletions bbconf/config_parser/bedbaseconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,28 +46,34 @@ class BedBaseConfig(object):
Class to handle BEDbase configuration file and create objects for different modules.
"""

def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True):
_LOGGER.info(f"Loading configuration file: {config}")
def __init__(self, config: Union[Path, str], init_ml: bool = True):
"""
Initialize BedBaseConfig object
:param config: path to the configuration file
:param init_ml: initialize machine learning models used for search
"""

self.cfg_path = get_bedbase_cfg(config)
self._config = self._read_config_file(self.cfg_path)

_LOGGER.info(f"Initializing database engine...")
self._db_engine = self._init_db_engine()
_LOGGER.info(f"Initializing qdrant engine...")
self._qdrant_engine = self._init_qdrant_backend()

_LOGGER.info(f"Initializing qdrant text engine...")
self._qdrant_engine = self._init_qdrant_backend()
self._qdrant_text_engine = self._init_qdrant_text_backend()

if init_search_interfaces:
_LOGGER.info(f"Initializing search interfaces...")
if init_ml:
self._b2bsi = self._init_b2bsi_object()
_LOGGER.info(f"Initializing R2V object...")
self._r2v = self._init_r2v_object()
_LOGGER.info(f"Initializing Bivec object...")
self._bivec = self._init_bivec_object()
else:
_LOGGER.info(
f"Skipping initialization of ML models, init_ml parameter set to False."
)

self._b2bsi = None
self._r2v = None
self._bivec = None

_LOGGER.info(f"Initializing PEPHub client...")
self._phc = self._init_pephubclient()
self._boto3_client = self._init_boto3_client()

Expand Down Expand Up @@ -197,6 +203,11 @@ def zarr_root(self) -> Union[Z_GROUP, None]:
return zarr.group(store=cache, overwrite=False)

def _init_db_engine(self) -> BaseEngine:
"""
Create database engine object using credentials provided in config file
"""

_LOGGER.info(f"Initializing database engine...")
return BaseEngine(
host=self._config.database.host,
port=self._config.database.port,
Expand All @@ -212,6 +223,8 @@ def _init_qdrant_backend(self) -> QdrantBackend:
:return: QdrantClient
"""

_LOGGER.info(f"Initializing qdrant engine...")
try:
return QdrantBackend(
collection=self._config.qdrant.file_collection,
Expand All @@ -225,12 +238,14 @@ def _init_qdrant_backend(self) -> QdrantBackend:
f"error in Connection to qdrant! skipping... Error: {err}", UserWarning
)

def _init_qdrant_text_backend(self) -> QdrantBackend:
def _init_qdrant_text_backend(self) -> Union[QdrantBackend, None]:
"""
Create qdrant client text embedding object using credentials provided in config file
:return: QdrantClient
"""

_LOGGER.info(f"Initializing qdrant text engine...")
try:
return QdrantBackend(
dim=TEXT_EMBEDDING_DIMENSION,
Expand All @@ -239,7 +254,11 @@ def _init_qdrant_text_backend(self) -> QdrantBackend:
qdrant_api_key=self.config.qdrant.api_key,
)
except Exception as e:
_LOGGER.error(f"Error while connecting to qdrant text engine: {e}")
_LOGGER.error(f"Error in Connection to qdrant text! skipping {e}")
warnings.warn(
"Error in Connection to qdrant text! skipping...", UserWarning
)
return None

def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]:
"""
Expand All @@ -266,6 +285,7 @@ def _init_b2bsi_object(self) -> Union[BED2BEDSearchInterface, None]:
:return: Bed2BEDSearchInterface object
"""
try:
_LOGGER.info(f"Initializing search interfaces...")
return BED2BEDSearchInterface(
backend=self.qdrant_engine,
query2vec=BED2Vec(model=self._config.path.region2vec),
Expand All @@ -286,6 +306,7 @@ def _init_pephubclient() -> Union[PEPHubClient, None]:
:return: PephubClient
"""
try:
_LOGGER.info(f"Initializing PEPHub client...")
return PEPHubClient()
except Exception as e:
_LOGGER.error(f"Error in creating PephubClient object: {e}")
Expand Down Expand Up @@ -317,6 +338,7 @@ def _init_r2v_object(self) -> Union[Region2VecExModel, None]:
Create Region2VecExModel object using credentials provided in config file
"""
try:
_LOGGER.info(f"Initializing R2V object...")
return Region2VecExModel(self.config.path.region2vec)
except Exception as e:
_LOGGER.error(f"Error in creating Region2VecExModel object: {e}")
Expand Down
16 changes: 14 additions & 2 deletions bbconf/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ class Bed(Base):
ref_classifier: Mapped["GenomeRefStats"] = relationship(
"GenomeRefStats", back_populates="bed", cascade="all, delete-orphan"
)
processed: Mapped[bool] = mapped_column(
default=False, comment="Whether the bed file was processed"
)


class BedMetadata(Base):
Expand Down Expand Up @@ -255,6 +258,11 @@ class Files(Base):
bedfile: Mapped["Bed"] = relationship("Bed", back_populates="files")
bedset: Mapped["BedSets"] = relationship("BedSets", back_populates="files")

__table_args__ = (
UniqueConstraint("name", "bedfile_id"),
UniqueConstraint("name", "bedset_id"),
)


class BedFileBedSetRelation(Base):
__tablename__ = "bedfile_bedset_relation"
Expand Down Expand Up @@ -303,6 +311,10 @@ class BedSets(Base):
author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset")
source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset")

processed: Mapped[bool] = mapped_column(
default=False, comment="Whether the bedset was processed"
)


class Universes(Base):
__tablename__ = "universes"
Expand Down Expand Up @@ -339,7 +351,7 @@ class TokenizedBed(Base):
nullable=False,
)
universe_id: Mapped[str] = mapped_column(
ForeignKey("universes.id", ondelete="CASCADE", passive_deletes=True),
ForeignKey("universes.id", ondelete="CASCADE"),
primary_key=True,
index=True,
nullable=False,
Expand All @@ -352,7 +364,7 @@ class TokenizedBed(Base):
universe: Mapped["Universes"] = relationship(
"Universes",
back_populates="tokenized",
passive_deletes=True,
passive_deletes="all",
)


Expand Down
2 changes: 2 additions & 0 deletions bbconf/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def get_bedbase_cfg(cfg: str = None) -> str:
Optional, the $BEDBASE config env var will be used if not provided
:return str: absolute configuration file path
"""

_LOGGER.info(f"Loading configuration file: {cfg}")
selected_cfg = select_config(config_filepath=cfg, config_env_vars=CFG_ENV_VARS)
if not selected_cfg:
raise BedBaseConnectionError(
Expand Down
8 changes: 7 additions & 1 deletion bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ class BedStatsModel(BaseModel):


class BedPEPHub(BaseModel):
sample_name: str
sample_name: str = ""
genome: str = ""
organism: str = ""
species_id: str = ""
Expand Down Expand Up @@ -233,3 +233,9 @@ class RefGenValidModel(BaseModel):
tier_ranking: int

model_config = ConfigDict(extra="forbid")


# Return model that groups an identifier, an optional user-provided genome,
# and a list of RefGenValidModel entries into a single response object.
# NOTE(review): presumably used to report reference-genome validation results
# for one bed file — confirm against the callers that build this model.
class RefGenValidReturnModel(BaseModel):
# Identifier of the record being described (presumably the bed file id — TODO confirm).
id: str
# Genome supplied with the record; optional, defaults to None when absent.
provided_genome: Union[str, None] = None
# Required list of RefGenValidModel entries (presumably one per compared genome — verify).
compared_genome: List[RefGenValidModel]
Loading

0 comments on commit 2d7403b

Please sign in to comment.