Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev bedclassifier script #67

Merged
merged 6 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,15 @@ openSignalMatrix
out2023/*

# test data
test/test_data/*
test/test_data/*
/scripts/bedclassifier_tuning/results/
/scripts/bedclassifier_tuning/data/
genome_config.yaml
alias/hg19/fasta/default/hg19.chrom.sizes
alias/hg19/fasta/default/hg19.fa
alias/hg19/fasta/default/hg19.fa.fai
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa
data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai
test/Untitled.ipynb
27 changes: 27 additions & 0 deletions bedboss/bedclassifier/bedclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]:

max_rows = 5
row_count = 0

while row_count <= max_rows:
try:
df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except UnicodeDecodeError as e:
try:
df = pd.read_csv(
bed,
sep="\t",
header=None,
nrows=4,
skiprows=row_count,
encoding="utf-16",
)
if row_count > 0:
_LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}")
break
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
else:
if no_fail:
_LOGGER.warning(
f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype"
)
return "unknown_bedtype", "unknown_bedtype"
else:
raise BedTypeException(
reason=f"Bed type could not be determined due to CSV parse error {e}"
)
except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e:
if row_count <= max_rows:
row_count += 1
Expand Down
1 change: 0 additions & 1 deletion bedboss/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,6 @@ def make_bedset(
def init_config(
outfolder: str = typer.Option(..., help="Path to the output folder"),
):

from bedboss.utils import save_example_bedbase_config

save_example_bedbase_config(outfolder)
Expand Down
2 changes: 2 additions & 0 deletions scripts/bedclassifier_tuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Just making a script to download and assess BED files and determine if they are correctly classified.

23 changes: 23 additions & 0 deletions scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
title: Bed Classifier
description: Output for bed classification results
type: object
properties:
pipeline_name: "bedclassifier"
samples:
type: object
properties:
bedfile_named:
type: string
description: "reported bedfile name e.g. narrowpeak"
bedfile_type:
type: string
description: "reported bedfile type"
given_bedfile_type:
type: string
description: "given bed file type"
types_match:
type: boolean
description: "Do the types match?"
gsm:
type: string
description: "given gsm"
188 changes: 188 additions & 0 deletions scripts/bedclassifier_tuning/bedclassify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import gzip
import logging
import os
import shutil

import pipestat
import pypiper
from typing import Optional

from bedboss.bedclassifier import get_bed_type
from bedboss.exceptions import BedTypeException

_LOGGER = logging.getLogger("bedboss")

from geofetch import Finder, Geofetcher


class BedClassifier:
"""
This will take the input of either a .bed or a .bed.gz and classify the type of BED file.

"""

def __init__(
self,
input_file: str,
output_dir: Optional[str] = None,
bed_digest: Optional[str] = None,
input_type: Optional[str] = None,
pm: pypiper.PipelineManager = None,
report_to_database: Optional[bool] = False,
psm: pipestat.PipestatManager = None,
gsm: str = None,
):
# Raise Exception if input_type is given and it is NOT a BED file
# Raise Exception if the input file cannot be resolved

self.gsm = gsm
self.input_file = input_file
self.bed_digest = bed_digest
self.input_type = input_type

self.abs_bed_path = os.path.abspath(self.input_file)
self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0]
self.file_extension = os.path.splitext(self.abs_bed_path)[-1]

# we need this only if unzipping a file
self.output_dir = output_dir or os.path.join(
os.path.dirname(self.abs_bed_path), "temp_processing"
)
# Use existing Pipeline Manager if it exists
self.pm = pm

if psm is None:
pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"
self.psm = pipestat.PipestatManager(
pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
)
else:
self.psm = psm

if self.file_extension == ".gz":
unzipped_input_file = os.path.join(self.output_dir, self.file_name)

with gzip.open(self.input_file, "rb") as f_in:
_LOGGER.info(
f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}"
)
with open(unzipped_input_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
self.input_file = unzipped_input_file
if self.pm:
self.pm.clean_add(unzipped_input_file)

try:
self.bed_type, self.bed_type_named = get_bed_type(self.input_file)
except BedTypeException as e:
_LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")
self.bed_type = "unknown_bedtype"
self.bed_type_named = "unknown_bedtype"

if self.input_type is not None:
if self.bed_type_named != self.input_type:
_LOGGER.warning(
f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}"
)
do_types_match = False
else:
do_types_match = True
else:
do_types_match = False

# Create Value Dict to report via pipestat

all_values = {}

if self.input_type:
all_values.update({"given_bedfile_type": self.input_type})
if self.bed_type:
all_values.update({"bedfile_type": self.bed_type})
if self.bed_type_named:
all_values.update({"bedfile_named": self.bed_type_named})
if self.gsm:
all_values.update({"gsm": self.gsm})

all_values.update({"types_match": do_types_match})

try:
psm.report(record_identifier=bed_digest, values=all_values)
except Exception as e:
_LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")

if self.pm:
self.pm.stop_pipeline()
Comment on lines +18 to +114
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are duplicating a lot of code here, just use main function from bedboss

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this code should be deleted, because it makes mess for future work



def main():
# PEP for reporting all classification results
pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"

# Place these external to pycharm folder!!!
data_output_path = os.path.abspath("data")
results_path = os.path.abspath("results")
logs_dir = os.path.join(results_path, "logs")

gse_obj = Finder()

# # Optionally: provide filter string and max number of retrieve elements
# gse_obj = Finder(filters="narrowpeak", retmax=100)
#
# gse_list = gse_obj.get_gse_all()
# gse_obj.generate_file("data/output.txt", gse_list=gse_list)

pm = pypiper.PipelineManager(
name="bedclassifier",
outfolder=logs_dir,
recover=True,
)

pm.start_pipeline()

# for geo in gse_list:
geofetcher_obj = Geofetcher(
filter="\.(bed|narrowPeak|broadPeak)\.",
filter_size="25MB",
data_source="samples",
geo_folder=data_output_path,
metadata_folder=data_output_path,
processed=True,
max_soft_size="20MB",
discard_soft=True,
)

# geofetcher_obj.fetch_all(input="data/output.txt", name="donald_test")
geofetched = geofetcher_obj.get_projects(
input=os.path.join(data_output_path, "output.txt"), just_metadata=False
)

samples = geofetched["output_samples"].samples

psm = pipestat.PipestatManager(
pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
)

for sample in samples:
if isinstance(sample.output_file_path, list):
bedfile = sample.output_file_path[0]
else:
bedfile = sample.output_file_path
geo_accession = sample.sample_geo_accession
sample_name = sample.sample_name
bed_type_from_geo = sample.type.lower()

bed = BedClassifier(
input_file=bedfile,
bed_digest=sample_name, # TODO FIX THIS IT HOULD BE AN ACTUAL DIGEST
output_dir=results_path,
input_type=bed_type_from_geo,
psm=psm,
pm=pm,
gsm=geo_accession,
)

pm.stop_pipeline()


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion test/test_bedclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class TestBedClassifier:

def test_classification(
self,
):
Expand Down
Loading