-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dev bedclassifier script #67
Conversation
donaldcampbelljr
commented
May 29, 2024
- Add bed classifier script which fetches bed files, classifies them and then reports the types to pephub.
- Use this script to aid in tuning the bed classifier system.
…with bed classifier
More work towards #60. The major modification to the pipeline:
The script I added for testing runs separately from the main pipeline and should not impact performance. The most recent test results have been added to this PEP: https://pephub.databio.org/donaldcampbelljr/bedclassifier_tuning_geo?tag=default
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you please clean just one script?
Changes approved
class BedClassifier:
    """
    Classify a ``.bed`` or ``.bed.gz`` file and report the result via pipestat.

    All of the work happens in the constructor: a gzipped input is
    decompressed, the file is classified with ``get_bed_type``, the detected
    type is compared against the caller-supplied ``input_type`` and the
    outcome is reported through the pipestat manager.
    """

    def __init__(
        self,
        input_file: str,
        output_dir: Optional[str] = None,
        bed_digest: Optional[str] = None,
        input_type: Optional[str] = None,
        pm: Optional[pypiper.PipelineManager] = None,
        report_to_database: Optional[bool] = False,
        psm: Optional[pipestat.PipestatManager] = None,
        gsm: Optional[str] = None,
    ):
        """
        Classify ``input_file`` and report the classification.

        :param input_file: path to a ``.bed`` file or a gzipped BED file
        :param output_dir: directory that receives the unzipped copy of a
            ``.gz`` input; defaults to ``temp_processing`` next to the input
        :param bed_digest: identifier used as the pipestat record identifier
            and in warning messages
        :param input_type: expected named type of the file; compared against
            the classified named type
        :param pm: existing pipeline manager; when given, the unzipped file is
            registered for cleanup and the pipeline is stopped at the end
        :param report_to_database: accepted for interface compatibility; not
            consulted by this constructor
        :param psm: pipestat manager used for reporting; a default
            PEPhub-backed manager is created when omitted
        :param gsm: optional sample accession included in the reported values
        """
        # NOTE: the following validations are intended but not implemented yet:
        #   - raise if input_type is given and it is NOT a BED file
        #   - raise if the input file cannot be resolved

        self.gsm = gsm
        self.input_file = input_file
        self.bed_digest = bed_digest
        self.input_type = input_type

        self.abs_bed_path = os.path.abspath(self.input_file)
        # For "x.bed.gz" this yields file_name "x.bed" and file_extension ".gz".
        self.file_name = os.path.splitext(os.path.basename(self.abs_bed_path))[0]
        self.file_extension = os.path.splitext(self.abs_bed_path)[-1]

        # Only needed when a gzipped file has to be unpacked.
        self.output_dir = output_dir or os.path.join(
            os.path.dirname(self.abs_bed_path), "temp_processing"
        )

        # Use the existing pipeline manager if one was passed in.
        self.pm = pm

        if psm is None:
            pephuburl = "donaldcampbelljr/bedclassifier_tuning_geo:default"
            self.psm = pipestat.PipestatManager(
                pephub_path=pephuburl, schema_path="bedclassifier_output_schema.yaml"
            )
        else:
            self.psm = psm

        if self.file_extension == ".gz":
            self._unzip_input()

        try:
            self.bed_type, self.bed_type_named = get_bed_type(self.input_file)
        except BedTypeException as e:
            _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")
            self.bed_type = "unknown_bedtype"
            self.bed_type_named = "unknown_bedtype"

        # Types only "match" when an expected type was supplied and it equals
        # the classified *named* type.
        do_types_match = (
            self.input_type is not None and self.bed_type_named == self.input_type
        )
        if self.input_type is not None and not do_types_match:
            # NOTE(review): the message prints bed_type although the comparison
            # is made on bed_type_named; the text is kept unchanged.
            _LOGGER.warning(
                f"BED file classified as different type than given input: {self.bed_type} vs {self.input_type}"
            )

        # Values reported via pipestat; empty/None entries are left out.
        candidate_values = {
            "given_bedfile_type": self.input_type,
            "bedfile_type": self.bed_type,
            "bedfile_named": self.bed_type_named,
            "gsm": self.gsm,
        }
        all_values = {key: value for key, value in candidate_values.items() if value}
        all_values.update({"types_match": do_types_match})

        try:
            # Report through self.psm so the default manager created above is
            # used when the caller did not supply one (the bare ``psm``
            # argument would be None in that case).
            self.psm.report(record_identifier=bed_digest, values=all_values)
        except Exception as e:
            # Reporting is best effort: a failure is logged, not raised.
            _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}")

        if self.pm:
            self.pm.stop_pipeline()

    def _unzip_input(self) -> None:
        """Decompress the gzipped input into ``output_dir`` and point ``input_file`` at it."""
        # The default output directory ("temp_processing") may not exist yet.
        os.makedirs(self.output_dir, exist_ok=True)
        unzipped_input_file = os.path.join(self.output_dir, self.file_name)

        with gzip.open(self.input_file, "rb") as f_in:
            _LOGGER.info(
                f"Unzipping file:{self.input_file} and Creating Unzipped file: {unzipped_input_file}"
            )
            with open(unzipped_input_file, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

        self.input_file = unzipped_input_file
        if self.pm:
            # Let the pipeline manager remove the temporary unzipped copy.
            self.pm.clean_add(unzipped_input_file)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are duplicating a lot of code here; just use the main function from bedboss.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this code should be deleted, because it makes a mess for future work.
We chatted about the above and agreed to merge the changes, since only exceptions were added to the main pipeline and everything else (such as the BedClassifier class) is self-contained.