diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py
index d76fbc8..41ad0dd 100644
--- a/bbconf/db_utils.py
+++ b/bbconf/db_utils.py
@@ -3,7 +3,16 @@
 from typing import List, Optional
 
 import pandas as pd
-from sqlalchemy import TIMESTAMP, BigInteger, ForeignKey, Result, Select, event, select
+from sqlalchemy import (
+    TIMESTAMP,
+    BigInteger,
+    ForeignKey,
+    Result,
+    Select,
+    event,
+    select,
+    UniqueConstraint,
+)
 from sqlalchemy.dialects.postgresql import JSON
 from sqlalchemy.engine import URL, Engine, create_engine
 from sqlalchemy.event import listens_for
@@ -118,6 +127,11 @@ class Bed(Base):
     )
     license_mapping: Mapped["License"] = relationship("License", back_populates="bed")
 
+    # One-to-many: a bed file has one stats row per compared reference genome
+    ref_classifier: Mapped[List["GenomeRefStats"]] = relationship(
+        "GenomeRefStats", back_populates="bed", cascade="all, delete-orphan"
+    )
+
 
 class BedMetadata(Base):
     __tablename__ = "bed_metadata"
@@ -351,6 +365,41 @@ class License(Base):
     bed: Mapped[List["Bed"]] = relationship("Bed", back_populates="license_mapping")
 
 
+class GenomeRefStats(Base):
+    """
+    Reference genome compatibility statistics of a BED file.
+
+    Each row compares one BED file (``bed_id``) with one reference genome
+    (``compared_genome``); the pair is unique. Rows are deleted together
+    with the parent ``bed`` row (``ondelete="CASCADE"``).
+    """
+
+    __tablename__ = "genome_ref_stats"
+
+    id: Mapped[int] = mapped_column(primary_key=True, index=True)
+
+    bed_id: Mapped[str] = mapped_column(
+        ForeignKey("bed.id", ondelete="CASCADE"),
+        index=True,
+        nullable=False,
+    )
+    provided_genome: Mapped[str]
+    compared_genome: Mapped[str] = mapped_column(
+        nullable=False, comment="Compared Genome"
+    )
+
+    # Nullable metrics: annotated Optional so the Python type matches nullable=True
+    xs: Mapped[Optional[float]] = mapped_column(nullable=True, default=None)
+    oobr: Mapped[Optional[float]] = mapped_column(nullable=True, default=None)
+    sequence_fit: Mapped[Optional[float]] = mapped_column(nullable=True, default=None)
+    assigned_points: Mapped[int] = mapped_column(nullable=False)
+    tier_ranking: Mapped[int] = mapped_column(nullable=False)
+
+    bed: Mapped["Bed"] = relationship("Bed", back_populates="ref_classifier")
+
+    __table_args__ = (UniqueConstraint("bed_id", "compared_genome"),)
+
+
 @listens_for(Universes, "after_insert")
 @listens_for(Universes, "after_update")
 def add_bed_universe(mapper, connection, target):
diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py
index c5c9fcc..b2f256e 100644
--- a/bbconf/models/bed_models.py
+++ b/bbconf/models/bed_models.py
@@ -219,3 +219,21 @@ class TokenizedPathResponse(BaseModel):
     universe_id: str
     file_path: str
     endpoint_url: str
+
+
+class RefGenValidModel(BaseModel):
+    """
+    Reference genome validation result for one (provided, compared) genome pair.
+
+    Unknown fields are rejected (extra="forbid").
+    """
+
+    provided_genome: str
+    compared_genome: str
+    xs: float = 0.0
+    oobr: Union[float, None] = None
+    sequence_fit: Union[float, None] = None
+    assigned_points: int
+    tier_ranking: int
+
+    model_config = ConfigDict(extra="forbid")
diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py
index df1da60..1297799 100644
--- a/bbconf/modules/bedfiles.py
+++ b/bbconf/modules/bedfiles.py
@@ -1,6 +1,7 @@
 import os
 from logging import getLogger
 from typing import Dict, Union
 
+from pydantic import BaseModel
 import numpy as np
 from geniml.bbclient import BBClient
@@ -14,7 +15,15 @@
 
 from bbconf.config_parser.bedbaseconfig import BedBaseConfig
 from bbconf.const import DEFAULT_LICENSE, PKG_NAME, ZARR_TOKENIZED_FOLDER
-from bbconf.db_utils import Bed, BedStats, Files, TokenizedBed, Universes, BedMetadata
+from bbconf.db_utils import (
+    Bed,
+    BedStats,
+    Files,
+    TokenizedBed,
+    Universes,
+    BedMetadata,
+    GenomeRefStats,
+)
 from bbconf.exceptions import (
     BedBaseConfError,
     BedFIleExistsError,
@@ -43,6 +52,7 @@
     TokenizedPathResponse,
     UniverseMetadata,
     StandardMeta,
+    RefGenValidModel,
 )
 
 _LOGGER = getLogger(PKG_NAME)
@@ -393,6 +403,7 @@ def add(
         plots: dict = None,
         files: dict = None,
         classification: dict = None,
+        ref_validation: Union[Dict[str, BaseModel], None] = None,
         license_id: str = DEFAULT_LICENSE,
         upload_qdrant: bool = False,
         upload_pephub: bool = False,
@@ -410,6 +421,7 @@ def add(
         :param plots: bed file plots
         :param files: bed file files
         :param classification: bed file classification
+        :param ref_validation: reference genome validation results, keyed by compared genome name (optional)
         :param license_id: bed file license id (default: 'DUO:0000042').
             Full list of licenses: https://raw.githubusercontent.com/EBISPOT/DUO/master/duo.csv
         :param upload_qdrant: add bed file to qdrant indexs
@@ -532,6 +544,17 @@ def add(
 
             session.add(new_bedstat)
             session.add(new_metadata)
+            # ref_validation defaults to None: nothing to store in that case
+            for ref_gen_check, data in (ref_validation or {}).items():
+                new_gen_ref = GenomeRefStats(
+                    **RefGenValidModel(
+                        **data.model_dump(),
+                        provided_genome=classification.genome_alias,
+                        compared_genome=ref_gen_check,
+                    ).model_dump(),
+                    bed_id=identifier,
+                )
+                session.add(new_gen_ref)
             session.commit()
 
         return None
diff --git a/docs/changelog.md b/docs/changelog.md
index bf40e34..4f64420 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -2,6 +2,12 @@
 
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
 
+# [0.7.0] - 2024-09-20
+## Added
+- Table and methods for reference genome validator
+- Table with standard metadata schema
+- Bed file opening improvements
+
 # [0.6.1] - 2024-08-21
 ## Added
 
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 21a4d73..538c13e 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,12 +1,12 @@
 yacman >= 0.9.1
 sqlalchemy >= 2.0.0
-geniml >= 0.4.0
+geniml >= 0.4.1
 psycopg >= 3.1.15
 colorlogs
 pydantic >= 2.6.4
 botocore
 boto3 >= 1.34.54
-pephubclient >= 0.4.1
+pephubclient >= 0.4.4
 sqlalchemy_schemadisplay
 zarr
 pyyaml >= 6.0.1 # for s3fs because of the errors