From eb1f82c13df4d47a32c12017c360a625e3449822 Mon Sep 17 00:00:00 2001 From: alejandrogzi Date: Wed, 20 Nov 2024 14:57:23 +0100 Subject: [PATCH] BREAKING CHANGE: v.0.1.9.3 + .gz readers! --- src/main.rs | 31 ++++++++++++++++++++++++++++--- src/utils.rs | 50 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/main.rs b/src/main.rs index dca1e30..5dec732 100644 --- a/src/main.rs +++ b/src/main.rs @@ -51,6 +51,7 @@ use std::collections::HashMap; use std::error::Error; use std::fs::File; use std::io::{BufWriter, Write}; +use std::path::Path; use std::string::String; use std::time::Instant; @@ -65,7 +66,7 @@ use bed2gtf::*; const SOURCE: &str = "bed2gtf"; -fn main() { +fn main() -> Result<(), Box> { let args = Cli::parse(); args.check().unwrap_or_else(|e| { error!("{}", e); @@ -95,7 +96,29 @@ fn main() { HashMap::new() }; - let bed = bed_reader(&args.bed); + let bed = match args.bed.extension().and_then(|s| s.to_str()) { + Some("gz") => { + let bed = match Path::new(args.bed.file_stem().unwrap()) + .extension() + .expect("ERROR: No extension found") + .to_str() + { + Some("bed") => { + let contents = with_gz(&args.bed)?; + parallel_parse(&contents)? + } + _ => panic!("ERROR: Not a .BED/.BED.GZ. Wrong file format!"), + }; + + bed + } + Some("bed") => { + let contents = raw(&args.bed)?; + parallel_parse(&contents)? + } + _ => panic!("ERROR: Not a .BED/.BED.GZ. Wrong file format!"), + }; + let gene_track = custom_par_parse(&bed).unwrap_or_else(|_| { let message = format!("Error parsing BED file {}", args.bed.display()); panic!("{}", message); @@ -143,7 +166,9 @@ fn main() { let peak_mem = (max_mem_usage_mb() - bmem).max(0.0); log::info!("Memory usage: {} MB", peak_mem); - log::info!("Elapsed: {:.4?} secs", start.elapsed().as_secs_f32()) + log::info!("Elapsed: {:.4?} secs", start.elapsed().as_secs_f32()); + + Ok(()) } fn to_gtf( diff --git a/src/utils.rs b/src/utils.rs index c29ba3d..a17d6db 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,28 +1,22 @@ use crate::bed::BedRecord; use chrono::Datelike; - use colored::Colorize; - +use flate2::read::GzDecoder; use indoc::indoc; - use rayon::prelude::*; use std::collections::HashMap; +use std::error::Error; +use std::fmt::Debug; use std::fs::File; -use std::io::{self, Read, Write}; -use std::path::PathBuf; +use std::io::{self, BufReader, Read, Write}; +use std::path::{Path, PathBuf}; const SOURCE: &str = "bed2gtf"; const VERSION: &str = env!("CARGO_PKG_VERSION"); const REPOSITORY: &str = env!("CARGO_PKG_REPOSITORY"); -pub fn bed_reader(file: &PathBuf) -> Vec { - let bed = reader(file).unwrap(); - let records = parallel_parse(&bed).unwrap(); - records -} - pub fn get_isoforms(file: &String) -> HashMap { let pairs = parallel_hash_rev(file); // let rev_pairs = parallel_hash(&file); @@ -46,6 +40,23 @@ pub fn reader(file: &PathBuf) -> io::Result { Ok(contents) } +pub fn raw + Debug>(f: P) -> Result> { + let mut file = File::open(f)?; + let mut contents = String::new(); + file.read_to_string(&mut contents)?; + Ok(contents) +} + +pub fn with_gz + Debug>(f: P) -> Result> { + let file = File::open(f)?; + let mut decoder = GzDecoder::new(BufReader::new(file)); + + let mut contents = String::new(); + decoder.read_to_string(&mut contents)?; + + Ok(contents) +} + pub fn parallel_hash<'a>(s: &'a str) -> HashMap { s.par_lines() .filter_map(|line| { @@ -75,11 +86,20 @@ pub fn parallel_hash_rev<'a>(s: &'a str) -> HashMap { .collect() } -pub fn parallel_parse<'a>(s: &'a str) -> Result, &'static str> { - let records: Result, &'static str> = - s.par_lines().map(|line| BedRecord::parse(line)).collect(); +pub fn parallel_parse<'a>(s: &'a str) -> Result, String> { + let records = s + .par_lines() + // .map(|line| BedRecord::parse(line)) + .filter_map(|line| match std::str::from_utf8(line.as_bytes()) { + Ok(valid_line) => Some(BedRecord::parse(valid_line)), + Err(_) => { + eprintln!("Skipping invalid UTF-8 line: {:?}", line); + None + } + }) + .collect::, String>>(); - records + Ok(records?) } pub fn custom_par_parse(