Skip to content

Commit

Permalink
Work on tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
khoroshevskyi committed Jun 6, 2024
1 parent 605ff78 commit 0cbb853
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 14 deletions.
25 changes: 19 additions & 6 deletions bedboss/bedboss.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ def run_all(
upload_qdrant: bool = False,
upload_s3: bool = False,
upload_pephub: bool = False,
# Universes
universe: bool = False,
universe_method: str = None,
universe_bedset: str = None,
pm: pypiper.PipelineManager = None,
) -> str:
"""
Expand All @@ -95,6 +99,10 @@ def run_all(
:param bool upload_qdrant: whether to skip qdrant indexing
:param bool upload_s3: whether to upload to s3
:param bool upload_pephub: whether to push bedfiles and metadata to pephub (default: False)
:param bool universe: whether to add the sample as the universe [Default: False]
:param str universe_method: method used to create the universe [Default: None]
:param str universe_bedset: bedset identifier for the universe [Default: None]
:param pypiper.PipelineManager pm: pypiper object
:return str bed_digest: bed digest
"""
Expand Down Expand Up @@ -201,6 +209,13 @@ def run_all(
nofail=True,
)

if universe:
bbagent.bed.add_universe(
bedfile_id=bed_metadata.bed_digest,
bedset_id=universe_bedset,
construct_method=universe_method,
)

if stop_pipeline:
pm.stop_pipeline()

Expand Down Expand Up @@ -302,14 +317,12 @@ def insert_pep(
upload_qdrant=upload_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
universe=pep_sample.get("universe"),
universe_method=pep_sample.get("universe_method"),
universe_bedset=pep_sample.get("universe_bedset"),
pm=pm,
)
if pep_sample.get("universe"):
bbagent.bed.add_universe(
bed_id,
bedset_id=pep_sample.get("universe_bedset"),
construct_method=pep_sample.get("universe_method"),
)

processed_ids.append(bed_id)
except BedBossException as e:
_LOGGER.error(f"Failed to process {pep_sample.sample_name}. See {e}")
Expand Down
12 changes: 4 additions & 8 deletions bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def run_all(

agent = BedBaseAgent(bedbase_config)

bed_id = run_all_bedboss(
run_all_bedboss(
input_file=input_file,
input_type=input_type,
outfolder=outfolder,
Expand All @@ -125,16 +125,12 @@ def run_all(
upload_qdrant=upload_qdrant,
upload_s3=upload_s3,
upload_pephub=upload_pephub,
universe=universe,
universe_method=universe_method,
universe_bedset=universe_bedset,
pm=create_pm(outfolder=outfolder, multi=multi, recover=recover, dirty=dirty),
)

if universe:
agent.bed.add_universe(
bedfile_id=bed_id,
bedset_id=universe_bedset,
construct_method=universe_method,
)


@app.command(help="Run the all bedboss pipeline for a bed files in a PEP")
def run_pep(
Expand Down
Empty file added bedboss/tokens/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions bedboss/tokens/tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# functions for tokenization of bed files
import logging
from typing import Union
import os
from bbconf.bbagent import BedBaseAgent
from geniml.bbclient import BBClient
from geniml.bbclient.const import DEFAULT_CACHE_FOLDER
from geniml.io import RegionSet

# from genimtools.tokenizers import RegionSet

from genimtools.tokenizers import TreeTokenizer

from bedboss.exceptions import BedBossException

_LOGGER = logging.getLogger("bedboss")


def tokenize_bed_file(
    universe: str,
    bed: str,
    cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER,
    add_to_db: bool = False,
    config: str = None,
) -> None:
    """
    Tokenize a bed file against a universe and add the result to the local cache.

    :param universe: universe identifier to which the bed file will be tokenized
    :param bed: identifier of the bed file to be tokenized
    :param cache_folder: path to the local cache folder
    :param add_to_db: whether to also add the tokenized bed file to the bedbase
        database [config must be provided if True]
    :param config: path to the bedbase config file
    :return: None
    :raises BedBossException: if add_to_db is True but no config file is provided
    """
    # Validate up front so we fail before doing any tokenization work.
    # (The original code constructed this exception without raising it,
    # so a missing config was silently ignored.)
    if add_to_db and not config:
        raise BedBossException(
            "Config file is required to add tokenized bed file to the database"
        )

    bbc = BBClient(cache_folder=cache_folder)

    # Build the tokenizer from the cached universe, then load the bed file.
    tokenizer = TreeTokenizer(bbc.seek(universe))
    rs = bbc.load_bed(bed)

    tokens = tokenizer(rs).ids

    # b = tokens.to_regions() # [Region(chr1, 100, 200), ... ]
    # f = tokens.to_bit_vector() #

    bbc.cache_tokens(universe, bed, tokens)
    _LOGGER.info(f"Tokenized bed file '{bed}' added to the cache")

    if add_to_db:
        bbagent = BedBaseAgent(config=config)
        bbagent.bed.add_tokenized(bed_id=bed, universe_id=universe, token_vector=tokens)
        _LOGGER.info(f"Tokenized bed file '{bed}' added to the database")


def delete_tokenized(
    universe: str,
    bed: str,
    config: str = None,
) -> None:
    """
    Remove a tokenized bed file from the bedbase database.

    :param universe: universe identifier the bed file was tokenized against
    :param bed: identifier of the tokenized bed file to delete
    :param config: path to the bedbase config file
    :return: None
    """
    agent = BedBaseAgent(config=config)
    agent.bed.delete_tokenized(bed_id=bed, universe_id=universe)

0 comments on commit 0cbb853

Please sign in to comment.