diff --git a/.gitignore b/.gitignore index 3a554a2..65e8d4b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,15 @@ openSignalMatrix out2023/* # test data -test/test_data/* \ No newline at end of file +test/test_data/* +/scripts/bedclassifier_tuning/results/ +/scripts/bedclassifier_tuning/data/ +genome_config.yaml +alias/hg19/fasta/default/hg19.chrom.sizes +alias/hg19/fasta/default/hg19.fa +alias/hg19/fasta/default/hg19.fa.fai +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c__ASDs.json +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.chrom.sizes +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa +data/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c/fasta/default/baa91c8f6e2780cfd8fd1040ff37f51c379947a2a4820d6c.fa.fai +test/Untitled.ipynb diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 420cea0..d32c737 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -38,12 +38,39 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]: max_rows = 5 row_count = 0 + while row_count <= max_rows: try: df = pd.read_csv(bed, sep="\t", header=None, nrows=4, skiprows=row_count) if row_count > 0: _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}") break + except UnicodeDecodeError as e: + try: + df = pd.read_csv( + bed, + sep="\t", + header=None, + nrows=4, + skiprows=row_count, + encoding="utf-16", + ) + if row_count > 0: + _LOGGER.info(f"Skipped {row_count} rows to parse bed file {bed}") + break + except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: + if row_count <= max_rows: + row_count += 1 + else: + if no_fail: + _LOGGER.warning( + f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype" + ) + return "unknown_bedtype", "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined due to CSV parse error {e}" + ) except (pandas.errors.ParserError, pandas.errors.EmptyDataError) as e: if row_count <= max_rows: row_count += 1 diff --git a/bedboss/cli.py b/bedboss/cli.py index b20cffa..0023c5d 100644 --- a/bedboss/cli.py +++ b/bedboss/cli.py @@ -382,7 +382,6 @@ def make_bedset( def init_config( outfolder: str = typer.Option(..., help="Path to the output folder"), ): - from bedboss.utils import save_example_bedbase_config save_example_bedbase_config(outfolder) diff --git a/scripts/bedclassifier_tuning/README.md b/scripts/bedclassifier_tuning/README.md new file mode 100644 index 0000000..88c3fc6 --- /dev/null +++ b/scripts/bedclassifier_tuning/README.md @@ -0,0 +1,2 @@ +Just making a script to download and assess BED files and determine if they are correctly classified. + diff --git a/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml b/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml new file mode 100644 index 0000000..1f793a0 --- /dev/null +++ b/scripts/bedclassifier_tuning/bedclassifier_output_schema.yaml @@ -0,0 +1,23 @@ +title: Bed Classifier +description: Output for bed classification results +type: object +properties: + pipeline_name: "bedclassifier" + samples: + type: object + properties: + bedfile_named: + type: string + description: "reported bedfile name e.g. narrowpeak" + bedfile_type: + type: string + description: "reported bedfile type" + given_bedfile_type: + type: string + description: "given bed file type" + types_match: + type: boolean + description: "Do the types match?" + gsm: + type: string + description: "given gsm" \ No newline at end of file diff --git a/scripts/bedclassifier_tuning/bedclassify.py b/scripts/bedclassifier_tuning/bedclassify.py new file mode 100644 index 0000000..a4110f3 --- /dev/null +++ b/scripts/bedclassifier_tuning/bedclassify.py @@ -0,0 +1,188 @@ +import gzip +import logging +import os +import shutil + +import pipestat +import pypiper +from typing import Optional + +from bedboss.bedclassifier import get_bed_type +from bedboss.exceptions import BedTypeException + +_LOGGER = logging.getLogger("bedboss") + +from geofetch import Finder, Geofetcher + + +class BedClassifier: + """ + This will take the input of either a .bed or a .bed.gz and classify the type of BED file. + + """ + + def __init__( + self, + input_file: str, + output_dir: Optional[str] = None, + bed_digest: Optional[str] = None, + input_type: Optional[str] = None, + pm: pypiper.PipelineManager = None, + report_to_database: Optional[bool] = False, + psm: pipestat.PipestatManager = None, + gsm: str = None, + ): + # Raise Exception if input_type is given and it is NOT a BED file + # Raise Exception if the input file cannot be resolved + + self.gsm = gsm + self.input_file = input_file + self.bed_digest = bed_digest + self.input_type = input_type + + self.abs_bed_path = os.path.abspath(self.input_file) + self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0] + self.file_extension = os.path.splitext(self.abs_bed_path)[-1] + + # we need this only if unzipping a file + self.output_dir = output_dir or os.path.join( + os.path.dirname(self.abs_bed_path), "temp_processing" + ) + # Use existing Pipeline Manager if it exists + self.pm = pm + + if psm is None: + pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default" + self.psm = pipestat.PipestatManager( + pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml" + ) + else: + self.psm = psm + + if self.file_extension == ".gz": + unzipped_input_file = os.path.join(self.output_dir, self.file_name) + + with gzip.open(self.input_file, "rb") as f_in: + _LOGGER.info( + f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}" + ) + with open(unzipped_input_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + self.input_file = unzipped_input_file + if self.pm: + self.pm.clean_add(unzipped_input_file) + + try: + self.bed_type, self.bed_type_named = get_bed_type(self.input_file) + except BedTypeException as e: + _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}") + self.bed_type = "unknown_bedtype" + self.bed_type_named = "unknown_bedtype" + + if self.input_type is not None: + if self.bed_type_named != self.input_type: + _LOGGER.warning( + f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}" + ) + do_types_match = False + else: + do_types_match = True + else: + do_types_match = False + + # Create Value Dict to report via pipestat + + all_values = {} + + if self.input_type: + all_values.update({"given_bedfile_type": self.input_type}) + if self.bed_type: + all_values.update({"bedfile_type": self.bed_type}) + if self.bed_type_named: + all_values.update({"bedfile_named": self.bed_type_named}) + if self.gsm: + all_values.update({"gsm": self.gsm}) + + all_values.update({"types_match": do_types_match}) + + try: + psm.report(record_identifier=bed_digest, values=all_values) + except Exception as e: + _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}") + + if self.pm: + self.pm.stop_pipeline() + + +def main(): + # PEP for reporting all classification results + pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default" + + # Place these external to pycharm folder!!! + data_output_path = os.path.abspath("data") + results_path = os.path.abspath("results") + logs_dir = os.path.join(results_path, "logs") + + gse_obj = Finder() + + # # Optionally: provide filter string and max number of retrieve elements + # gse_obj = Finder(filters="narrowpeak", retmax=100) + # + # gse_list = gse_obj.get_gse_all() + # gse_obj.generate_file("data/output.txt", gse_list=gse_list) + + pm = pypiper.PipelineManager( + name="bedclassifier", + outfolder=logs_dir, + recover=True, + ) + + pm.start_pipeline() + + # for geo in gse_list: + geofetcher_obj = Geofetcher( + filter="\.(bed|narrowPeak|broadPeak)\.", + filter_size="25MB", + data_source="samples", + geo_folder=data_output_path, + metadata_folder=data_output_path, + processed=True, + max_soft_size="20MB", + discard_soft=True, + ) + + # geofetcher_obj.fetch_all(input="data/output.txt", name="donald_test") + geofetched = geofetcher_obj.get_projects( + input=os.path.join(data_output_path, "output.txt"), just_metadata=False + ) + + samples = geofetched["output_samples"].samples + + psm = pipestat.PipestatManager( + pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml" + ) + + for sample in samples: + if isinstance(sample.output_file_path, list): + bedfile = sample.output_file_path[0] + else: + bedfile = sample.output_file_path + geo_accession = sample.sample_geo_accession + sample_name = sample.sample_name + bed_type_from_geo = sample.type.lower() + + bed = BedClassifier( + input_file=bedfile, + bed_digest=sample_name, # TODO FIX THIS IT HOULD BE AN ACTUAL DIGEST + output_dir=results_path, + input_type=bed_type_from_geo, + psm=psm, + pm=pm, + gsm=geo_accession, + ) + + pm.stop_pipeline() + + +if __name__ == "__main__": + main() diff --git a/test/test_bedclassifier.py b/test/test_bedclassifier.py index d4c6f19..4b4dd0e 100644 --- a/test/test_bedclassifier.py +++ b/test/test_bedclassifier.py @@ -16,7 +16,6 @@ class TestBedClassifier: - def test_classification( self, ):