Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 0.10.0 #71

Merged
merged 32 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
d21e497
Added trackHub file creation to bedset
khoroshevskyi Nov 11, 2024
60a7cff
Added trackHub file creation to bedset
khoroshevskyi Nov 12, 2024
0e5a36e
Added bed sql search
khoroshevskyi Nov 13, 2024
0e75c72
Set limit on bedset trackhub response
khoroshevskyi Nov 13, 2024
9f231b7
improved error handling in bedset
khoroshevskyi Nov 21, 2024
8adf567
bump pephubclient version
khoroshevskyi Nov 26, 2024
a4dd469
add some logging and docs improvements
nsheff Nov 27, 2024
5ba0c56
some ideas toward #65
nsheff Nov 27, 2024
8940fb5
Fixed #64
khoroshevskyi Dec 2, 2024
3ad2523
Fixed #66
khoroshevskyi Dec 3, 2024
3c146f5
updated requirements
khoroshevskyi Dec 3, 2024
4ca2adf
Merge remote-tracking branch 'origin/dev' into dev
khoroshevskyi Dec 3, 2024
590ee70
lint
khoroshevskyi Dec 3, 2024
530c891
Merge branch 'refs/heads/dev' into partial_processing
khoroshevskyi Dec 9, 2024
af026e1
updated for partial_processing
khoroshevskyi Dec 9, 2024
e464003
updated tests and fixed uploading files
khoroshevskyi Dec 11, 2024
d4bf110
updated uv in tests
khoroshevskyi Dec 11, 2024
fa91690
updated installation
khoroshevskyi Dec 11, 2024
3899882
updated pip
khoroshevskyi Dec 11, 2024
a8d48f7
lower version of pybiocfilecache
khoroshevskyi Dec 11, 2024
bcb20f8
SERVICE_UNAVAILABLE TRUE
khoroshevskyi Dec 11, 2024
7511e01
Merge branch 'refs/heads/dev' into partial_processing
khoroshevskyi Dec 15, 2024
f9a55b2
- added tests
khoroshevskyi Dec 15, 2024
1018496
- added processed table to bedsets
khoroshevskyi Dec 16, 2024
8134d8f
Fixed qdrant init error
khoroshevskyi Dec 17, 2024
c3b4bcf
Added get method for reference validation results
khoroshevskyi Dec 18, 2024
4c9891b
Fixed Donald comments for PR
khoroshevskyi Dec 19, 2024
2d7403b
Merge pull request #72 from databio/partial_processing
khoroshevskyi Dec 19, 2024
a6d5119
improved reindexing
khoroshevskyi Dec 20, 2024
5b71c7a
improved reindexing batching
khoroshevskyi Dec 20, 2024
f58c710
updated requirements
khoroshevskyi Jan 3, 2025
1549194
updated requirements 2
khoroshevskyi Jan 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions .github/workflows/run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
pytest:
strategy:
matrix:
python-version: ["3.9", "3.11"]
python-version: ["3.9", "3.12"]
os: [ubuntu-latest] # can't use macOS when using service containers or container jobs
runs-on: ${{ matrix.os }}
services:
Expand All @@ -27,21 +27,23 @@ jobs:
- 5432:5432
options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
steps:
- uses: actions/checkout@v2

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install uv
run: pip install uv

- name: Install dev dependencies
run: if [ -f requirements/requirements-dev.txt ]; then pip install -r requirements/requirements-dev.txt; fi
run: if [ -f requirements/requirements-dev.txt ]; then uv pip install -r requirements/requirements-dev.txt --system; fi

- name: Install test dependencies
run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi
run: if [ -f requirements/requirements-test.txt ]; then uv pip install -r requirements/requirements-test.txt --system; fi

- name: Install package
run: python -m pip install .
run: uv pip install . --system

- name: Run pytest tests
run: pytest tests -x -vv
2 changes: 1 addition & 1 deletion bbconf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.9.0"
__version__ = "0.10.0"
22 changes: 18 additions & 4 deletions bbconf/bbagent.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

from functools import cached_property
from pathlib import Path
from typing import List, Union
Expand All @@ -12,22 +14,27 @@
from bbconf.modules.bedsets import BedAgentBedSet
from bbconf.modules.objects import BBObjects

from .const import PKG_NAME

_LOGGER = logging.getLogger(PKG_NAME)


class BedBaseAgent(object):
def __init__(
    self,
    config: Union[Path, str],
):
    """
    Create the agent: load the BEDbase configuration and build the
    bed, bedset and objects sub-agents on top of it.

    :param config: path to the BEDbase configuration file
    """

    _LOGGER.info("Initializing BedBaseConfig object")
    self.config = BedBaseConfig(config)

    # The bed-file agent also receives this agent itself as its second argument.
    _LOGGER.info("Initializing BedAgentBedFile object")
    self._bed = BedAgentBedFile(self.config, self)
    _LOGGER.info("Initializing BedAgentBedSet object")
    self._bedset = BedAgentBedSet(self.config)
    _LOGGER.info("Initializing BBObjects object")
    self._objects = BBObjects(self.config)

@property
Expand All @@ -42,6 +49,13 @@ def bedset(self) -> BedAgentBedSet:
def objects(self) -> BBObjects:
return self._objects

def __repr__(self) -> str:
repr = f"BedBaseAgent(config={self.config})"
repr += f"\n{self.bed}"
repr += f"\n{self.bedset}"
repr += f"\n{self.objects}"
return repr

def get_stats(self) -> StatsReturn:
"""
Get statistics for a bed file
Expand Down
43 changes: 31 additions & 12 deletions bbconf/config_parser/bedbaseconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,33 @@
_LOGGER = logging.getLogger(PKG_NAME)


class BedBaseConfig:
def __init__(self, config: Union[Path, str]):
class BedBaseConfig(object):
"""
Class to handle BEDbase configuration file and create objects for different modules.
"""

def __init__(self, config: Union[Path, str], init_search_interfaces: bool = True):
_LOGGER.info(f"Loading configuration file: {config}")
self.cfg_path = get_bedbase_cfg(config)
self._config = self._read_config_file(self.cfg_path)

_LOGGER.info(f"Initializing database engine...")
self._db_engine = self._init_db_engine()
_LOGGER.info(f"Initializing qdrant engine...")
self._qdrant_engine = self._init_qdrant_backend()

_LOGGER.info(f"Initializing qdrant text engine...")
self._qdrant_text_engine = self._init_qdrant_text_backend()
self._b2bsi = self._init_b2bsi_object()
self._r2v = self._init_r2v_object()
self._bivec = self._init_bivec_object()

if init_search_interfaces:
_LOGGER.info(f"Initializing search interfaces...")
self._b2bsi = self._init_b2bsi_object()
_LOGGER.info(f"Initializing R2V object...")
khoroshevskyi marked this conversation as resolved.
Show resolved Hide resolved
self._r2v = self._init_r2v_object()
_LOGGER.info(f"Initializing Bivec object...")
self._bivec = self._init_bivec_object()

_LOGGER.info(f"Initializing PEPHub client...")
self._phc = self._init_pephubclient()
self._boto3_client = self._init_boto3_client()

Expand Down Expand Up @@ -216,13 +231,15 @@ def _init_qdrant_text_backend(self) -> QdrantBackend:

:return: QdrantClient
"""

return QdrantBackend(
dim=TEXT_EMBEDDING_DIMENSION,
collection=self.config.qdrant.text_collection,
qdrant_host=self.config.qdrant.host,
qdrant_api_key=self.config.qdrant.api_key,
)
try:
return QdrantBackend(
dim=TEXT_EMBEDDING_DIMENSION,
collection=self.config.qdrant.text_collection,
qdrant_host=self.config.qdrant.host,
qdrant_api_key=self.config.qdrant.api_key,
)
except Exception as e:
_LOGGER.error(f"Error while connecting to qdrant text engine: {e}")

def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]:
"""
Expand All @@ -231,9 +248,11 @@ def _init_bivec_object(self) -> Union[BiVectorSearchInterface, None]:
:return: BiVectorSearchInterface
"""

_LOGGER.info(f"Initializing BiVectorBackend...")
search_backend = BiVectorBackend(
metadata_backend=self._qdrant_text_engine, bed_backend=self._qdrant_engine
)
_LOGGER.info(f"Initializing BiVectorSearchInterface...")
search_interface = BiVectorSearchInterface(
backend=search_backend,
query2vec=self.config.path.text2vec,
Expand Down
6 changes: 6 additions & 0 deletions bbconf/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,9 @@ class QdrantInstanceNotInitializedError(BedBaseConfError):
"""Error type for missing qdrant instance"""

pass


class BedSetTrackHubLimitError(BedBaseConfError):
    """Error type for an exceeded limit on bedset trackhub visualization"""
4 changes: 2 additions & 2 deletions bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ class BedListResult(BaseModel):

class QdrantSearchResult(BaseModel):
    """
    Single search hit: an identifier plus optional payload, score and
    basic bed metadata.
    """

    id: str
    # `payload` and `score` default to None, so None has to be part of the
    # declared type; otherwise an explicitly passed None fails validation.
    payload: Union[dict, None] = None
    score: Union[float, None] = None
    metadata: Union[BedMetadataBasic, None] = None


Expand Down
149 changes: 134 additions & 15 deletions bbconf/modules/bedfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pephubclient.exceptions import ResponseError
from pydantic import BaseModel
from qdrant_client.models import Distance, PointIdsList, VectorParams
from sqlalchemy import and_, delete, func, select
from sqlalchemy import and_, delete, func, or_, select
from sqlalchemy.orm import Session, aliased
from tqdm import tqdm

Expand Down Expand Up @@ -62,9 +62,9 @@

class BedAgentBedFile:
"""
Class that represents Bedfile in Database.
Class that represents a BED file in the Database.

This class has method to add, delete, get files and metadata from the database.
Provides methods to add, delete, get BED files and metadata from the database.
"""

def __init__(self, config: BedBaseConfig, bbagent_obj=None):
Expand Down Expand Up @@ -241,6 +241,45 @@ def get_plots(self, identifier: str) -> BedPlots:
)
return bed_plots

def get_neighbours(
    self, identifier: str, limit: int = 10, offset: int = 0
) -> BedListSearchResult:
    """
    Find the bed files closest to the given bed file in the qdrant file collection.

    :param identifier: bed file identifier
    :param limit: number of results to return
    :param offset: offset to start from

    :return: search result holding the nearest neighbours
    :raises BEDFileNotFoundError: if no bed file with this identifier exists
    """
    if not self.exists(identifier):
        raise BEDFileNotFoundError(f"Bed file with id: {identifier} not found.")

    # Re-insert dashes (8-4-4-4-rest grouping) to build the id passed as the query.
    dashed_id = "-".join(
        (
            identifier[:8],
            identifier[8:12],
            identifier[12:16],
            identifier[16:20],
            identifier[20:],
        )
    )
    response = self._qdrant_engine.qd_client.query_points(
        collection_name=self._config.config.qdrant.file_collection,
        query=dashed_id,
        limit=limit,
        offset=offset,
    )

    neighbours = []
    for point in response.points:
        # Returned ids carry dashes; strip them before looking up metadata.
        bed_id = point.id.replace("-", "")
        hit = QdrantSearchResult(
            id=bed_id,
            payload=point.payload,
            score=point.score,
            metadata=self.get(bed_id, full=False),
        )
        neighbours.append(hit)

    # `count` is the total number of bed files from the agent stats,
    # not the number of neighbours returned.
    total_bedfiles = self.bb_agent.get_stats().bedfiles_number
    return BedListSearchResult(
        count=total_bedfiles,
        limit=limit,
        offset=offset,
        results=neighbours,
    )

def get_files(self, identifier: str) -> BedFiles:
"""
Get files by identifier.
Expand Down Expand Up @@ -407,7 +446,7 @@ def add(
plots: dict = None,
files: dict = None,
classification: dict = None,
ref_validation: Dict[str, BaseModel] = None,
ref_validation: Union[Dict[str, BaseModel], None] = None,
license_id: str = DEFAULT_LICENSE,
upload_qdrant: bool = False,
upload_pephub: bool = False,
Expand Down Expand Up @@ -551,16 +590,17 @@ def add(
session.add(new_bedstat)
session.add(new_metadata)

for ref_gen_check, data in ref_validation.items():
new_gen_ref = GenomeRefStats(
**RefGenValidModel(
**data.model_dump(),
provided_genome=classification.genome_alias,
compared_genome=ref_gen_check,
).model_dump(),
bed_id=identifier,
)
session.add(new_gen_ref)
if ref_validation:
for ref_gen_check, data in ref_validation.items():
new_gen_ref = GenomeRefStats(
**RefGenValidModel(
**data.model_dump(),
provided_genome=classification.genome_alias,
compared_genome=ref_gen_check,
).model_dump(),
bed_id=identifier,
)
session.add(new_gen_ref)
session.commit()

return None
Expand Down Expand Up @@ -859,12 +899,91 @@ def bed_to_bed_search(
results=results_list,
)

def sql_search(
    self, query: str, limit: int = 10, offset: int = 0
) -> BedListSearchResult:
    """
    Search for bed files with a case-insensitive SQL substring match (ILIKE).
    The query is matched against bed file id, name, and description.

    :param query: text query
    :param limit: number of results to return
    :param offset: offset to start from

    :return: list of bed file metadata
    """
    _LOGGER.debug(f"Looking for: {query}")

    # NOTE: `%` and `_` inside the query are not escaped, so they act as
    # LIKE wildcards.
    sql_search_str = f"%{query}%"
    with Session(self._sa_engine) as session:
        statement = (
            select(Bed)
            .where(
                or_(
                    Bed.id.ilike(sql_search_str),
                    Bed.name.ilike(sql_search_str),
                    Bed.description.ilike(sql_search_str),
                )
            )
            # Without ORDER BY the database may return rows in any order, so
            # LIMIT/OFFSET pages could overlap or skip rows between calls.
            .order_by(Bed.id)
            .limit(limit)
            .offset(offset)
        )
        bed_objects = session.scalars(statement)
        # Build the metadata models while the session is still open.
        results = [
            BedMetadataBasic(
                **bedfile_obj.__dict__,
                annotation=StandardMeta(
                    **(
                        bedfile_obj.annotations.__dict__
                        if bedfile_obj.annotations
                        else {}
                    )
                ),
            )
            for bedfile_obj in bed_objects
        ]

    # Every SQL match gets the constant score 1.
    result_list = [
        QdrantSearchResult(id=result.id, score=1, metadata=result)
        for result in results
    ]

    return BedListSearchResult(
        count=self._sql_search_count(query),
        limit=limit,
        offset=offset,
        results=result_list,
    )

def _sql_search_count(self, query: str) -> int:
    """
    Count the bed files whose id, name, or description matches the query.

    :param query: text query

    :return: number of found files
    """
    pattern = f"%{query}%"
    matches_query = or_(
        Bed.id.ilike(pattern),
        Bed.name.ilike(pattern),
        Bed.description.ilike(pattern),
    )
    count_statement = select(func.count()).select_from(Bed).where(matches_query)

    with Session(self._sa_engine) as session:
        # The count query yields exactly one single-column row.
        (total,) = session.execute(count_statement).one()
    return total

def reindex_qdrant(self) -> None:
"""
Re-upload all files to qdrant.
!Warning: only hg38 genome can be added to qdrant!

If you want want to fully reindex/reupload to qdrant, first delete collection and create new one.
If you want to fully reindex/reupload to qdrant, first delete collection and create new one.

Upload all files to qdrant.
"""
Expand Down
Loading
Loading