-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
605ff78
commit 0cbb853
Showing
4 changed files
with
100 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# functions for tokenization of bed files | ||
import logging | ||
from typing import Union | ||
import os | ||
from bbconf.bbagent import BedBaseAgent | ||
from geniml.bbclient import BBClient | ||
from geniml.bbclient.const import DEFAULT_CACHE_FOLDER | ||
from geniml.io import RegionSet | ||
|
||
# from genimtools.tokenizers import RegionSet | ||
|
||
from genimtools.tokenizers import TreeTokenizer | ||
|
||
from bedboss.exceptions import BedBossException | ||
|
||
_LOGGER = logging.getLogger("bedboss") | ||
|
||
|
||
def tokenize_bed_file(
    universe: str,
    bed: str,
    cache_folder: Union[str, os.PathLike] = DEFAULT_CACHE_FOLDER,
    add_to_db: bool = False,
    config: str = None,
) -> None:
    """
    Tokenize a bed file against a universe and store the tokens in the local cache.

    Optionally, the resulting tokens are also registered in the bedbase database.

    :param universe: identifier of the universe used to build the tokenizer
    :param bed: identifier of the bed file to be tokenized
    :param cache_folder: path to the cache folder used by the BBClient
    :param add_to_db: flag to add tokenized bed file to the bedbase database [config should be provided if True]
    :param config: path to the bedbase config file
    :raises BedBossException: if add_to_db is True and no config is provided
    :return: None
    """
    bbc = BBClient(cache_folder=cache_folder)

    # The tokenizer is built from whatever the client resolves for the universe.
    tokenizer = TreeTokenizer(bbc.seek(universe))
    rs = bbc.load_bed(bed)

    tokens = tokenizer(rs).ids

    bbc.cache_tokens(universe, bed, tokens)
    _LOGGER.info(f"Tokenized bed file '{bed}' added to the cache")

    if add_to_db:
        if not config:
            # The exception must actually be raised; merely constructing it
            # would let execution fall through to the database call below.
            raise BedBossException(
                "Config file is required to add tokenized bed file to the database"
            )

        bbagent = BedBaseAgent(config=config)
        bbagent.bed.add_tokenized(bed_id=bed, universe_id=universe, token_vector=tokens)
        _LOGGER.info(f"Tokenized bed file '{bed}' added to the database")
|
||
|
||
def delete_tokenized(
    universe: str,
    bed: str,
    config: str = None,
) -> None:
    """
    Remove a tokenized bed file record from the database.

    :param universe: identifier of the universe the tokenized record belongs to
    :param bed: identifier of the bed file whose tokenized record is removed
    :param config: path to the bedbase config file
    :return: None
    """
    # Build the agent and issue the delete in one expression; the agent is not reused.
    BedBaseAgent(config=config).bed.delete_tokenized(bed_id=bed, universe_id=universe)