diff --git a/Cargo.toml b/Cargo.toml index 1a101b9..ccf932f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "barcode-count" description = "NGS barcode counter for DEL, CRISPR-seq, and Barcode-seq" -version = "0.9.3" +version = "0.9.4" edition = "2021" license = "Apache-2.0" readme = "README.md" @@ -16,6 +16,7 @@ bench = false path = "src/main.rs" [dependencies] +anyhow = "1.0" rayon = "1.5" regex = "1.5" clap = "2.33.0" @@ -23,5 +24,4 @@ itertools = "0.10" num_cpus = "1.0" flate2 = "1" chrono = "0.4" -custom_error = "1.9" num-format = "0.4" diff --git a/src/arguments.rs b/src/arguments.rs new file mode 100644 index 0000000..66d1b37 --- /dev/null +++ b/src/arguments.rs @@ -0,0 +1,216 @@ +use anyhow::{Context, Result}; +use chrono::Local; +use clap::{crate_version, App, Arg}; + +/// A struct that contains and initiates all input arguments +pub struct Args { + pub fastq: String, // fastq file path + pub format: String, // format scheme file path + pub sample_barcodes_option: Option, // sample barcode file path. Optional + pub counted_barcodes_option: Option, // building block barcode file path. Optional + pub output_dir: String, // output directory. Deafaults to './' + pub threads: u8, // Number of threads to use. Defaults to number of threads on the machine + pub prefix: String, // Prefix string for the output files + pub merge_output: bool, // Whether or not to create an additional output file that merges all samples + pub barcodes_errors_option: Option, // Optional input of how many errors are allowed in each building block barcode. Defaults to 20% of the length + pub sample_errors_option: Option, // Optional input of how many errors are allowed in each sample barcode. Defaults to 20% of the length + pub constant_errors_option: Option, // Optional input of how many errors are allowed in each constant region barcode. Defaults to 20% of the length + pub min_average_quality_score: f32, + pub enrich: bool, +} + +impl Args { + pub fn new() -> Result { + let total_cpus = num_cpus::get().to_string(); + let today = Local::today().format("%Y-%m-%d").to_string(); + // parse arguments + let args = App::new("NGS-Barcode-Count") + .version(crate_version!()) + .author("Rory Coffey ") + .about("Counts barcodes located in sequencing data") + .arg( + Arg::with_name("fastq") + .short("f") + .long("fastq") + .takes_value(true) + .required(true) + .help("FastQ file"), + ) + .arg( + Arg::with_name("format_file") + .short("q") + .long("sequence-format") + .takes_value(true) + .required(true) + .help("Sequence format file"), + ) + .arg( + Arg::with_name("sample_file") + .short("s") + .long("sample-barcodes") + .takes_value(true) + .help("Sample barcodes file"), + ) + .arg( + Arg::with_name("barcode_file") + .short("c") + .long("counted-barcodes") + .takes_value(true) + .help("Counted barcodes file"), + ) + .arg( + Arg::with_name("threads") + .short("t") + .long("threads") + .takes_value(true) + .default_value(&total_cpus) + .help("Number of threads"), + ) + .arg( + Arg::with_name("dir") + .short("o") + .long("output-dir") + .takes_value(true) + .default_value("./") + .help("Directory to output the counts to"), + ) + .arg( + Arg::with_name("prefix") + .short("p") + .long("prefix") + .takes_value(true) + .default_value(&today) + .help("File prefix name. THe output will end with '__counts.csv'"), + ) + .arg( + Arg::with_name("merge-output") + .short("m") + .long("merge-output") + .takes_value(false) + .help("Merge sample output counts into a single file. Not necessary when there is only one sample"), + ) + .arg( + Arg::with_name("enrich") + .long("enrich") + .short("e") + .takes_value(false) + .help("Create output files of enrichment for single and double synthons/barcodes"), + ) + .arg( + Arg::with_name("max_barcode") + .long("max-errors-counted-barcode") + .takes_value(true) + .help("Maximimum number of sequence errors allowed within each counted barcode. Defaults to 20% of the total."), + ) + .arg( + Arg::with_name("max_sample") + .long("max-errors-sample") + .takes_value(true) + .help("Maximimum number of sequence errors allowed within sample barcode. Defaults to 20% of the total."), + ) + .arg( + Arg::with_name("max_constant") + .long("max-errors-constant") + .takes_value(true) + .help("Maximimum number of sequence errors allowed within constant region. Defaults to 20% of the total."), + ) + .arg( + Arg::with_name("min") + .long("min-quality") + .takes_value(true) + .default_value("0") + .help("Minimum average read quality score per barcode"), + ) + .get_matches(); + + let sample_barcodes_option; + if let Some(sample) = args.value_of("sample_file") { + sample_barcodes_option = Some(sample.to_string()) + } else { + sample_barcodes_option = None + } + + let counted_barcodes_option; + if let Some(barcodes) = args.value_of("barcode_file") { + counted_barcodes_option = Some(barcodes.to_string()) + } else { + counted_barcodes_option = None + } + + let barcodes_errors_option; + if let Some(barcodes) = args.value_of("max_barcode") { + barcodes_errors_option = Some( + barcodes + .parse::() + .context("Unable to convert maximum barcode errors to an integer")?, + ) + } else { + barcodes_errors_option = None + } + + let sample_errors_option; + if let Some(sample) = args.value_of("max_sample") { + sample_errors_option = Some( + sample + .parse::() + .context("Unable to convert maximum sample errors to an integer")?, + ) + } else { + sample_errors_option = None + } + + let constant_errors_option; + if let Some(constant) = args.value_of("max_constant") { + constant_errors_option = Some( + constant + .parse::() + .context("Unable to convert maximum constant errors to an integer")?, + ) + } else { + constant_errors_option = None + } + + let merge_output; + if args.is_present("merge-output") { + merge_output = true + } else { + merge_output = false + } + let enrich; + if args.is_present("enrich") { + enrich = true + } else { + enrich = false + } + let fastq = args.value_of("fastq").unwrap().to_string(); + let format = args.value_of("format_file").unwrap().to_string(); + let output_dir = args.value_of("dir").unwrap().to_string(); + let threads = args + .value_of("threads") + .unwrap() + .parse::() + .context("Unable to convert threads to an integer")?; + let prefix = args.value_of("prefix").unwrap().to_string(); + let min_average_quality_score = args + .value_of("min") + .unwrap() + .parse::() + .context("Unable to convert min score to a float")?; + + Ok(Args { + fastq, + format, + sample_barcodes_option, + counted_barcodes_option, + output_dir, + threads, + prefix, + merge_output, + barcodes_errors_option, + sample_errors_option, + constant_errors_option, + min_average_quality_score, + enrich, + }) + } +} diff --git a/src/info.rs b/src/info.rs index 7c99e37..294a545 100644 --- a/src/info.rs +++ b/src/info.rs @@ -1,9 +1,9 @@ +use anyhow::{anyhow, Context, Result}; use itertools::Itertools; use num_format::{Locale, ToFormattedString}; use regex::Regex; use std::{ collections::{HashMap, HashSet}, - error::Error, fmt, fs, sync::{ atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}, @@ -190,9 +190,10 @@ pub struct SequenceFormat { impl SequenceFormat { /// Creates a new SequenceFormat struct which holds the sequencing format information, such as, where the barcodes are located within the sequence - pub fn new(format: String) -> Result> { + pub fn new(format: String) -> Result { // Read sequenc format file to string - let format_data = fs::read_to_string(format)? + let format_data = fs::read_to_string(format.clone()) + .context(format!("Failed to open {}", format))? .lines() // split into lines .filter(|line| !line.starts_with('#')) // remove any line that starts with '#' .collect::(); // collect into a String @@ -224,7 +225,7 @@ impl SequenceFormat { } /// Returns a Vec of the size of all counted barcodes within the seqeunce format - pub fn barcode_lengths(&self) -> Result, Box> { + pub fn barcode_lengths(&self) -> Result> { let barcode_search = Regex::new(r"(\{\d+\})")?; // Create a search that finds the '{#}' let digit_search = Regex::new(r"\d+")?; // Create a search that pulls out the number let mut barcode_lengths = Vec::new(); // Create a Vec that will contain the counted barcode lengths @@ -245,7 +246,7 @@ impl SequenceFormat { } /// Returns the sample barcode length found in the format file string - pub fn sample_length_option(&self) -> Result, Box> { + pub fn sample_length_option(&self) -> Result> { let sample_search = Regex::new(r"(\[\d+\])")?; // Create a search that finds the '[#]' let digit_search = Regex::new(r"\d+")?; // Create a search that pulls out the numeric value @@ -335,7 +336,7 @@ impl fmt::Display for SequenceFormat { /// /// assert_eq!(build_format_string(&format_data).unwrap(), "NNNNNNNNAGCTAGATCNNNNNNTGGANNNNNNTGGANNNNNNTGATTGCGCNNNNNNNNNNAT".to_string()) /// ``` -pub fn build_format_string(format_data: &str) -> Result> { +pub fn build_format_string(format_data: &str) -> Result { let digit_search = Regex::new(r"\d+")?; let barcode_search = Regex::new(r"(?i)(\{\d+\})|(\[\d+\])|(\(\d+\))|N+|[ATGC]+")?; let mut final_format = String::new(); @@ -364,7 +365,7 @@ pub fn build_format_string(format_data: &str) -> Result> /// /// assert_eq!(build_regions_string(format_data).unwrap(), "SSSSSSSSCCCCCCCCCBBBBBBCCCCBBBBBBCCCCBBBBBBCCCCCCCCCRRRRRRCCCCCC".to_string()) /// ``` -pub fn build_regions_string(format_data: &str) -> Result> { +pub fn build_regions_string(format_data: &str) -> Result { let digit_search = Regex::new(r"\d+")?; let barcode_search = Regex::new(r"(?i)(\{\d+\})|(\[\d+\])|(\(\d+\))|[ATGCN]+")?; let mut final_format = String::new(); @@ -401,7 +402,7 @@ pub fn build_regions_string(format_data: &str) -> Result> /// /// assert_eq!(build_regex_captures(&format_data).unwrap(), "(?P.{8})AGCTAGATC(?P.{6})TGGA(?P.{6})TGGA(?P.{6})TGATTGCGC(?P.{6}).{4}AT".to_string()) /// ``` -pub fn build_regex_captures(format_data: &str) -> Result> { +pub fn build_regex_captures(format_data: &str) -> Result { let digit_search = Regex::new(r"\d+")?; let barcode_search = Regex::new(r"(?i)(\{\d+\})|(\[\d+\])|(\(\d+\))|N+|[ATGC]+")?; // the previous does not bumber each barcode but names each caputre with barcode# @@ -473,12 +474,10 @@ impl BarcodeConversions { /// Reads in comma separated barcode file (CSV). The columns need to have headers. The first column needs to be the nucleotide barcode /// and the second needs to be the ID - pub fn sample_barcode_file_conversion( - &mut self, - barcode_path: &str, - ) -> Result<(), Box> { + pub fn sample_barcode_file_conversion(&mut self, barcode_path: &str) -> Result<()> { // read in the sample barcode file - for (barcode, sample_id) in fs::read_to_string(barcode_path)? + for (barcode, sample_id) in fs::read_to_string(barcode_path) + .context(format!("Failed to open {}", barcode_path))? .lines() // split the lines .skip(1) // skip the first line which should be the header .map(|line| { @@ -505,9 +504,10 @@ impl BarcodeConversions { &mut self, barcode_path: &str, barcode_num: usize, - ) -> Result<(), Box> { + ) -> Result<()> { // read in the sample barcode file - let barcode_vecs = fs::read_to_string(barcode_path)? + let barcode_vecs = fs::read_to_string(barcode_path) + .context(format!("Failed to read {}", barcode_path))? .lines() // split the lines .skip(1) // skip the first line which should be the header .map(|line| { @@ -523,9 +523,8 @@ impl BarcodeConversions { } let mut barcode_num_contained = HashSet::new(); for (barcode, id, barcode_num) in barcode_vecs { - let barcode_num_usize = barcode_num.parse::().unwrap_or_else(|err| { - panic!("Third column of barcode file contains something other than an integer: {}\nError: {}", barcode_num, err) - }) - 1; + let barcode_num_usize = barcode_num.parse::() + .context(format!("Third column of barcode file contains something other than an integer: {}", barcode_num))? - 1; barcode_num_contained.insert(barcode_num_usize); self.counted_barcodes_hash[barcode_num_usize].insert(barcode, id); } @@ -536,10 +535,10 @@ impl BarcodeConversions { } } if !missing_barcode_num.is_empty() { - panic!( + return Err(anyhow!(format!( "Barcode conversion file missing barcode numers {:?} in the third column", missing_barcode_num - ) + ))); } Ok(()) } @@ -597,7 +596,7 @@ impl MaxSeqErrors { /// let constant_errors_option = None; /// let constant_region_size = 30; /// let min_quality = 0.0; - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// ``` pub fn new( sample_errors_option: Option, @@ -607,7 +606,7 @@ impl MaxSeqErrors { constant_errors_option: Option, constant_region_size: u8, min_quality: f32, - ) -> Result> { + ) -> Self { let max_sample_errors; // start with a sample size of 0 in case there is no sample barcode. If there is then mutate let mut sample_size = 0; @@ -643,7 +642,7 @@ impl MaxSeqErrors { // errors allowed is the length of the constant region - the Ns / 5 or 20% } - Ok(MaxSeqErrors { + MaxSeqErrors { constant_region: max_constant_errors, constant_region_size, sample_barcode: max_sample_errors, @@ -651,7 +650,7 @@ impl MaxSeqErrors { barcode: max_barcode_errors, barcode_sizes, min_quality, - }) + } } /// Returns the maximum allowed constant region errors @@ -667,11 +666,11 @@ impl MaxSeqErrors { /// let constant_errors_option = None; /// let constant_region_size = 30; /// let min_quality = 0.0; - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_constant_errors(), 6); /// let barcode_sizes = vec![8,8,8]; /// let constant_errors_option = Some(3); - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_constant_errors(), 3); /// ``` pub fn max_constant_errors(&self) -> u8 { @@ -691,11 +690,11 @@ impl MaxSeqErrors { /// let constant_errors_option = None; /// let constant_region_size = 30; /// let min_quality = 0.0; - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_sample_errors(), 2); /// let barcode_sizes = vec![8,8,8]; /// let sample_errors_option = Some(3); - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_sample_errors(), 3); /// ``` pub fn max_sample_errors(&self) -> u8 { @@ -715,11 +714,11 @@ impl MaxSeqErrors { /// let constant_errors_option = None; /// let constant_region_size = 30; /// let min_quality = 0.0; - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![1,1,1]); /// let barcode_sizes = vec![8,8,8]; /// let barcode_errors_option = Some(2); - /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality).unwrap(); + /// let mut max_sequence_errors = MaxSeqErrors::new(sample_errors_option, sample_barcode_size_option, barcode_errors_option, barcode_sizes, constant_errors_option, constant_region_size, min_quality); /// assert_eq!(max_sequence_errors.max_barcode_errors(), vec![2,2,2]); /// ``` pub fn max_barcode_errors(&self) -> &[u8] { diff --git a/src/input.rs b/src/input.rs index d4355ce..1c68273 100644 --- a/src/input.rs +++ b/src/input.rs @@ -1,9 +1,8 @@ -use custom_error::custom_error; +use anyhow::{anyhow, Context, Result}; use flate2::read::GzDecoder; use num_format::{Locale, ToFormattedString}; use std::{ collections::VecDeque, - error::Error, fmt, fs::File, io::{BufRead, BufReader, Write}, @@ -13,11 +12,7 @@ use std::{ }, }; -// Errors associated with checking the fastq format to make sure it is correct -custom_error! {FastqError - NotFastq = "This program only works with *.fastq files and *.fastq.gz files. The latter is still experimental", - LeftOver = "Sequence lines left over. Check code", -} +use crate::parse::RawSequenceRead; /// Reads in the FASTQ file line by line, then pushes every 2 out of 4 lines, which corresponds to the sequence line, into a Vec that is passed to other threads /// @@ -31,8 +26,9 @@ pub fn read_fastq( seq_clone: Arc>>, exit_clone: Arc, total_reads_arc: Arc, -) -> Result<(), Box> { - let fastq_file = File::open(fastq.clone())?; // open file +) -> Result<()> { + let fastq_file = + File::open(fastq.clone()).context(format!("Failed to open file: {}", fastq))?; // open file // Create a fastq line reader which keeps track of line number, reads, and posts the sequence to the shared vector let mut fastq_line_reader = FastqLineReader::new(seq_clone, exit_clone); @@ -41,15 +37,16 @@ pub fn read_fastq( if !fastq.ends_with("fastq.gz") { // If the file does not end with fastq, return with an error if !fastq.ends_with("fastq") { - return Err(Box::new(FastqError::NotFastq)); + return Err(anyhow!("This program only works with *.fastq files and *.fastq.gz files. The latter is still experimental")); } // go line by line for line_result in BufReader::new(fastq_file).lines() { - let mut line = line_result?; + let mut line = + line_result.context(format!("Bufread could not read line for file: {}", fastq))?; line.push('\n'); // post the line to the shared vector and keep track of the number of sequences etc - fastq_line_reader.read(line)?; + fastq_line_reader.read(line); if fastq_line_reader.line_num == 4 { fastq_line_reader.post()?; } @@ -73,7 +70,7 @@ pub fn read_fastq( // move the read line to the line variable and get the response to check if it is 0 and therefore the file is done read_response = reader.read_line(&mut line)?; // post the line to the shared vector and keep track of the number of sequences etc - fastq_line_reader.read(line)?; + fastq_line_reader.read(line); if fastq_line_reader.line_num == 4 { fastq_line_reader.post()?; } @@ -115,7 +112,7 @@ impl FastqLineReader { } /// Reads in the line and either passes to the vec or discards it, depending if it is a sequence line. Also increments on line count, sequence count etc. - pub fn read(&mut self, line: String) -> Result<(), Box> { + pub fn read(&mut self, line: String) { // Pause if there are already 10000 sequences in the vec so memory is not overloaded while self.seq_clone.lock().unwrap().len() >= 10000 { // if threads have failed exit out of this thread @@ -134,15 +131,13 @@ impl FastqLineReader { } else { self.raw_sequence_read_string.push_str(&line); } - Ok(()) } - pub fn post(&mut self) -> Result<(), Box> { + pub fn post(&mut self) -> Result<()> { self.raw_sequence_read_string.pop(); // removes the last \n // Insert the sequence into the vec. This will be popped out by other threads if self.test { - crate::parse::RawSequenceRead::unpack(self.raw_sequence_read_string.clone())? - .check_fastq_format()?; + RawSequenceRead::unpack(self.raw_sequence_read_string.clone())?.check_fastq_format()?; self.test = false; } self.seq_clone diff --git a/src/lib.rs b/src/lib.rs index 47d6ae3..a7f4a57 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,202 +1,5 @@ +pub mod arguments; pub mod info; pub mod input; pub mod output; pub mod parse; - -use chrono::Local; -use clap::{crate_version, App, Arg}; -// use rayon::prelude::*; -use std::error::Error; - -/// A struct that contains and initiates all input arguments -pub struct Args { - pub fastq: String, // fastq file path - pub format: String, // format scheme file path - pub sample_barcodes_option: Option, // sample barcode file path. Optional - pub counted_barcodes_option: Option, // building block barcode file path. Optional - pub output_dir: String, // output directory. Deafaults to './' - pub threads: u8, // Number of threads to use. Defaults to number of threads on the machine - pub prefix: String, // Prefix string for the output files - pub merge_output: bool, // Whether or not to create an additional output file that merges all samples - pub barcodes_errors_option: Option, // Optional input of how many errors are allowed in each building block barcode. Defaults to 20% of the length - pub sample_errors_option: Option, // Optional input of how many errors are allowed in each sample barcode. Defaults to 20% of the length - pub constant_errors_option: Option, // Optional input of how many errors are allowed in each constant region barcode. Defaults to 20% of the length - pub min_average_quality_score: f32, - pub enrich: bool, -} - -impl Args { - pub fn new() -> Result> { - let total_cpus = num_cpus::get().to_string(); - let today = Local::today().format("%Y-%m-%d").to_string(); - // parse arguments - let args = App::new("NGS-Barcode-Count") - .version(crate_version!()) - .author("Rory Coffey ") - .about("Counts barcodes located in sequencing data") - .arg( - Arg::with_name("fastq") - .short("f") - .long("fastq") - .takes_value(true) - .required(true) - .help("FastQ file"), - ) - .arg( - Arg::with_name("format_file") - .short("q") - .long("sequence-format") - .takes_value(true) - .required(true) - .help("Sequence format file"), - ) - .arg( - Arg::with_name("sample_file") - .short("s") - .long("sample-barcodes") - .takes_value(true) - .help("Sample barcodes file"), - ) - .arg( - Arg::with_name("barcode_file") - .short("c") - .long("counted-barcodes") - .takes_value(true) - .help("Counted barcodes file"), - ) - .arg( - Arg::with_name("threads") - .short("t") - .long("threads") - .takes_value(true) - .default_value(&total_cpus) - .help("Number of threads"), - ) - .arg( - Arg::with_name("dir") - .short("o") - .long("output-dir") - .takes_value(true) - .default_value("./") - .help("Directory to output the counts to"), - ) - .arg( - Arg::with_name("prefix") - .short("p") - .long("prefix") - .takes_value(true) - .default_value(&today) - .help("File prefix name. THe output will end with '__counts.csv'"), - ) - .arg( - Arg::with_name("merge-output") - .short("m") - .long("merge-output") - .takes_value(false) - .help("Merge sample output counts into a single file. Not necessary when there is only one sample"), - ) - .arg( - Arg::with_name("enrich") - .long("enrich") - .short("e") - .takes_value(false) - .help("Create output files of enrichment for single and double synthons/barcodes"), - ) - .arg( - Arg::with_name("max_barcode") - .long("max-errors-counted-barcode") - .takes_value(true) - .help("Maximimum number of sequence errors allowed within each counted barcode. Defaults to 20% of the total."), - ) - .arg( - Arg::with_name("max_sample") - .long("max-errors-sample") - .takes_value(true) - .help("Maximimum number of sequence errors allowed within sample barcode. Defaults to 20% of the total."), - ) - .arg( - Arg::with_name("max_constant") - .long("max-errors-constant") - .takes_value(true) - .help("Maximimum number of sequence errors allowed within constant region. Defaults to 20% of the total."), - ) - .arg( - Arg::with_name("min") - .long("min-quality") - .takes_value(true) - .default_value("0") - .help("Minimum average read quality score per barcode"), - ) - .get_matches(); - - let sample_barcodes_option; - if let Some(sample) = args.value_of("sample_file") { - sample_barcodes_option = Some(sample.to_string()) - } else { - sample_barcodes_option = None - } - - let counted_barcodes_option; - if let Some(barcodes) = args.value_of("barcode_file") { - counted_barcodes_option = Some(barcodes.to_string()) - } else { - counted_barcodes_option = None - } - - let barcodes_errors_option; - if let Some(barcodes) = args.value_of("max_barcode") { - barcodes_errors_option = Some(barcodes.parse::()?) - } else { - barcodes_errors_option = None - } - - let sample_errors_option; - if let Some(sample) = args.value_of("max_sample") { - sample_errors_option = Some(sample.parse::()?) - } else { - sample_errors_option = None - } - - let constant_errors_option; - if let Some(constant) = args.value_of("max_constant") { - constant_errors_option = Some(constant.parse::()?) - } else { - constant_errors_option = None - } - - let merge_output; - if args.is_present("merge-output") { - merge_output = true - } else { - merge_output = false - } - let enrich; - if args.is_present("enrich") { - enrich = true - } else { - enrich = false - } - let fastq = args.value_of("fastq").unwrap().to_string(); - let format = args.value_of("format_file").unwrap().to_string(); - let output_dir = args.value_of("dir").unwrap().to_string(); - let threads = args.value_of("threads").unwrap().parse::().unwrap(); - let prefix = args.value_of("prefix").unwrap().to_string(); - let min_average_quality_score = args.value_of("min").unwrap().parse::().unwrap(); - - Ok(Args { - fastq, - format, - sample_barcodes_option, - counted_barcodes_option, - output_dir, - threads, - prefix, - merge_output, - barcodes_errors_option, - sample_errors_option, - constant_errors_option, - min_average_quality_score, - enrich, - }) - } -} diff --git a/src/main.rs b/src/main.rs index f86bfc3..c1b6a52 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use anyhow::Result; use chrono::Local; use std::{ collections::VecDeque, @@ -7,16 +8,14 @@ use std::{ }, }; -fn main() { +fn main() -> Result<()> { // Start a clock to measure how long the algorithm takes let start_time = Local::now(); // get the argument inputs - let mut args = - barcode_count::Args::new().unwrap_or_else(|err| panic!("Argument error: {}", err)); + let mut args = barcode_count::arguments::Args::new()?; - let sequence_format = barcode_count::info::SequenceFormat::new(args.format.clone()) - .unwrap_or_else(|err| panic!("sequence format error: {}", err)); + let sequence_format = barcode_count::info::SequenceFormat::new(args.format.clone())?; println!("{}\n", sequence_format); // Check how many barcodes occur if either single or double barcode enrichment is callsed. If there are too few, ignore the argument flag @@ -29,9 +28,7 @@ fn main() { let mut barcode_conversions = barcode_count::info::BarcodeConversions::new(); // Create a hashmap of the sample barcodes in order to convert sequence to sample ID if let Some(ref samples) = args.sample_barcodes_option { - barcode_conversions - .sample_barcode_file_conversion(samples) - .unwrap(); + barcode_conversions.sample_barcode_file_conversion(samples)?; barcode_conversions.get_sample_seqs(); } @@ -44,9 +41,7 @@ fn main() { // Create a hashmap of the building block barcodes in order to convert sequence to building block if let Some(ref barcodes) = args.counted_barcodes_option { - barcode_conversions - .barcode_file_conversion(barcodes, sequence_format.barcode_num) - .unwrap(); + barcode_conversions.barcode_file_conversion(barcodes, sequence_format.barcode_num)?; barcode_conversions.get_barcode_seqs(); } @@ -65,8 +60,7 @@ fn main() { args.constant_errors_option, sequence_format.constant_region_length(), args.min_average_quality_score, - ) - .unwrap_or_else(|err| panic!("Max Sequencing Errors error: {}", err)); + ); // Display region sizes and errors allowed println!("{}\n", max_errors); @@ -88,7 +82,7 @@ fn main() { barcode_count::input::read_fastq(fastq, seq_clone, exit_clone, total_reads_arc_clone) .unwrap_or_else(|err| { finished_clone.store(true, Ordering::Relaxed); - panic!("Error: {}", err) + panic!("Read Fastq error: {}", err) }); finished_clone.store(true, Ordering::Relaxed); }); @@ -149,9 +143,7 @@ fn main() { args, ) .unwrap_or_else(|err| panic!("Output error: {}", err)); - output - .write_counts_files() - .unwrap_or_else(|err| panic!("Writing error: {}", err)); + output.write_counts_files()?; // Get the end time and print total time for the algorithm output .write_stats_file( @@ -160,9 +152,7 @@ fn main() { sequence_errors, total_reads_arc, sequence_format, - ) - .unwrap_or_else(|err| panic!("Writing stats error: {}", err)); - + )?; // Get the end time and print total time for the algorithm let elapsed_time = Local::now() - start_time; println!(); @@ -173,4 +163,5 @@ fn main() { elapsed_time.num_seconds() % 60, barcode_count::output::millisecond_decimal(elapsed_time) ); + Ok(()) } diff --git a/src/output.rs b/src/output.rs index 6e0b895..a352cdc 100644 --- a/src/output.rs +++ b/src/output.rs @@ -1,9 +1,8 @@ +use anyhow::{anyhow, Result}; use chrono::{DateTime, Local}; -use custom_error::custom_error; use num_format::{Locale, ToFormattedString}; use std::{ collections::{HashMap, HashSet}, - error::Error, fs::{File, OpenOptions}, io::{stdout, Write}, path::Path, @@ -15,10 +14,10 @@ use std::{ use itertools::Itertools; -// Errors associated with checking the fastq format to make sure it is correct -custom_error! {EnrichedErrors - MismatchedType = "Does not work with Full enrichment type. Only Single and Double", -} +use crate::{ + arguments::Args, + info::{FormatType, MaxSeqErrors, Results, ResultsEnrichment, SequenceErrors, SequenceFormat}, +}; #[derive(PartialEq, Clone)] enum EnrichedType { @@ -29,13 +28,13 @@ enum EnrichedType { /// A struct setup to output results and stat information into files pub struct WriteFiles { - results: crate::info::Results, - results_enriched: crate::info::ResultsEnrichment, - sequence_format: crate::info::SequenceFormat, + results: Results, + results_enriched: ResultsEnrichment, + sequence_format: SequenceFormat, counted_barcodes_hash: Vec>, samples_barcode_hash: HashMap, compounds_written: HashSet, - args: crate::Args, + args: Args, output_files: Vec, output_counts: Vec, merged_count: usize, @@ -45,16 +44,16 @@ pub struct WriteFiles { impl WriteFiles { pub fn new( - results_arc: Arc>, - sequence_format: crate::info::SequenceFormat, + results_arc: Arc>, + sequence_format: SequenceFormat, counted_barcodes_hash: Vec>, samples_barcode_hash: HashMap, - args: crate::Args, - ) -> Result> { + args: Args, + ) -> Result { let results = Arc::try_unwrap(results_arc).unwrap().into_inner().unwrap(); Ok(WriteFiles { results, - results_enriched: crate::info::ResultsEnrichment::new(), + results_enriched: ResultsEnrichment::new(), sequence_format, counted_barcodes_hash, samples_barcode_hash, @@ -69,17 +68,17 @@ impl WriteFiles { } /// Sets up and writes the results file. Works for either with or without a random barcode - pub fn write_counts_files(&mut self) -> Result<(), Box> { + pub fn write_counts_files(&mut self) -> Result<()> { let unknown_sample = "barcode".to_string(); // Pull all sample IDs from either random hashmap or counts hashmap let mut sample_barcodes = match self.results.format_type { - crate::info::FormatType::RandomBarcode => self + FormatType::RandomBarcode => self .results .random_hashmap .keys() .cloned() .collect::>(), - crate::info::FormatType::NoRandomBarcode => self + FormatType::NoRandomBarcode => self .results .count_hashmap .keys() @@ -155,10 +154,10 @@ impl WriteFiles { self.sample_text.push_str(&header); let count = match self.results.format_type { - crate::info::FormatType::RandomBarcode => { + FormatType::RandomBarcode => { self.add_random_string(sample_barcode, &sample_barcodes)? } - crate::info::FormatType::NoRandomBarcode => { + FormatType::NoRandomBarcode => { self.add_counts_string(sample_barcode, &sample_barcodes, EnrichedType::Full)? } }; @@ -211,7 +210,7 @@ impl WriteFiles { &mut self, sample_barcode: &str, sample_barcodes: &[String], - ) -> Result> { + ) -> Result { let sample_random_hash = self.results.random_hashmap.get(sample_barcode).unwrap(); // Iterate through all results and write as comma separated. The keys within the hashmap are already comma separated // If there is an included building block barcode file, it is converted here @@ -293,7 +292,7 @@ impl WriteFiles { sample_barcode: &str, sample_barcodes: &[String], enrichment: EnrichedType, // In order to make this non redundant with writing single and double barcodes, this enum determines some aspects - ) -> Result> { + ) -> Result { let mut hash_holder: HashMap> = HashMap::new(); // a hodler hash to hold the hashmap from sample_counts_hash for a longer lifetime. Also used later // Select from the hashmap connected the the EnrichedType let sample_counts_hash = match enrichment { @@ -395,7 +394,7 @@ impl WriteFiles { } /// Write enriched files for either single or double barcodes if either flag is called - fn write_enriched_files(&mut self, enrichment: EnrichedType) -> Result<(), Box> { + fn write_enriched_files(&mut self, enrichment: EnrichedType) -> Result<()> { let unknown_sample = "barcode".to_string(); // Pull all sample IDs from either single or double hashmap, which was added to in either random or counts write let mut sample_barcodes = match enrichment { @@ -411,7 +410,11 @@ impl WriteFiles { .keys() .cloned() .collect::>(), - EnrichedType::Full => return Err(Box::new(EnrichedErrors::MismatchedType)), + EnrichedType::Full => { + return Err(anyhow!( + "Does not work with Full enrichment type. Only Single and Double" + )) + } }; // If there was a sample conversion file, sort the barcodes by the sample IDs so that the columns for the merged file are in order @@ -427,7 +430,11 @@ impl WriteFiles { let descriptor = match enrichment { EnrichedType::Single => "Single", EnrichedType::Double => "Double", - EnrichedType::Full => return Err(Box::new(EnrichedErrors::MismatchedType)), + EnrichedType::Full => { + return Err(anyhow!( + "Does not work with Full enrichment type. Only Single and Double" + )) + } }; // create the directory variable to join the file to @@ -516,11 +523,11 @@ impl WriteFiles { pub fn write_stats_file( &self, start_time: DateTime, - max_sequence_errors: crate::info::MaxSeqErrors, - seq_errors: crate::info::SequenceErrors, + max_sequence_errors: MaxSeqErrors, + seq_errors: SequenceErrors, total_reads: Arc, - sequence_format: crate::info::SequenceFormat, - ) -> Result<(), Box> { + sequence_format: SequenceFormat, + ) -> Result<()> { // Create the stat file name let output_dir = self.args.output_dir.clone(); let directory = Path::new(&output_dir); diff --git a/src/parse.rs b/src/parse.rs index cf6f74e..6c94467 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -1,19 +1,21 @@ -use custom_error::custom_error; +use anyhow::{anyhow, Result}; use regex::Captures; use std::{ collections::{HashSet, VecDeque}, - error::Error, + fmt, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, }, }; +use crate::info::{MaxSeqErrors, Results, SequenceErrors, SequenceFormat}; + pub struct SequenceParser { shared_mut_clone: SharedMutData, - sequence_errors_clone: crate::info::SequenceErrors, - sequence_format_clone: crate::info::SequenceFormat, - max_errors_clone: crate::info::MaxSeqErrors, + sequence_errors_clone: SequenceErrors, + sequence_format_clone: SequenceFormat, + max_errors_clone: MaxSeqErrors, sample_seqs: HashSet, counted_barcode_seqs: Vec>, raw_sequence: RawSequenceRead, @@ -24,9 +26,9 @@ pub struct SequenceParser { impl SequenceParser { pub fn new( shared_mut_clone: SharedMutData, - sequence_errors_clone: crate::info::SequenceErrors, - sequence_format_clone: crate::info::SequenceFormat, - max_errors_clone: crate::info::MaxSeqErrors, + sequence_errors_clone: SequenceErrors, + sequence_format_clone: SequenceFormat, + max_errors_clone: MaxSeqErrors, sample_seqs: HashSet, counted_barcode_seqs: Vec>, min_quality_score: f32, @@ -47,7 +49,7 @@ impl SequenceParser { min_quality_score, } } - pub fn parse(&mut self) -> Result<(), Box> { + pub fn parse(&mut self) -> Result<()> { // Loop until there are no sequences left to parse. These are fed into seq vec by the reader thread loop { if self.get_seqeunce()? { @@ -81,7 +83,7 @@ impl SequenceParser { Ok(()) } - fn get_seqeunce(&mut self) -> Result> { + fn get_seqeunce(&mut self) -> Result { // Pop off the last sequence from the seq vec if let Some(new_raw_sequence) = self.shared_mut_clone.seq.lock().unwrap().pop_back() { self.raw_sequence = RawSequenceRead::unpack(new_raw_sequence)?; @@ -92,8 +94,8 @@ impl SequenceParser { } /// Does a regex search and captures the barcodes. Returns a struct of the results. - fn match_seq(&mut self) -> Result, Box> { - self.check_and_fix_consant_region()?; + fn match_seq(&mut self) -> Result> { + self.check_and_fix_consant_region(); // if the barcodes are found continue, else return None and record a constant region error if let Some(barcodes) = self .sequence_format_clone @@ -118,7 +120,9 @@ impl SequenceParser { return Ok(None); } } else { - return Err(Box::new(FastqError::RegexFindError)); + return Err(anyhow!( + "Regex find failed after regex captures was successful" + )); } } @@ -130,7 +134,7 @@ impl SequenceParser { self.max_errors_clone.max_barcode_errors(), &self.sample_seqs, self.max_errors_clone.max_sample_errors(), - )?; + ); // If the sample barcode was not found, record the error and return none so that the algorithm stops for this sequence if match_results.sample_barcode_error { @@ -152,7 +156,7 @@ impl SequenceParser { } /// Checks the constant region of the sequence then finds the best fix if it is not found. Basically whether or not the regex search worked - fn check_and_fix_consant_region(&mut self) -> Result<(), Box> { + fn check_and_fix_consant_region(&mut self) { // If the regex search does not work, try to fix the constant region if !self .sequence_format_clone @@ -162,23 +166,22 @@ impl SequenceParser { self.raw_sequence.fix_constant_region( &self.sequence_format_clone.format_string, self.max_errors_clone.max_constant_errors(), - )?; + ); } - Ok(()) } } pub struct SharedMutData { pub seq: Arc>>, pub finished: Arc, - pub results: Arc>, + pub results: Arc>, } impl SharedMutData { pub fn new( seq: Arc>>, finished: Arc, - results: Arc>, + results: Arc>, ) -> Self { SharedMutData { seq, @@ -199,15 +202,6 @@ impl SharedMutData { } } -// Errors associated with checking the fastq format to make sure it is correct -custom_error! {FastqError - NotFastq = "This program only works with *.fastq files and *.fastq.gz files. The latter is still experimental", - Line2NotSeq = "The second line within the FASTQ file is not a sequence. Check the FASTQ format", - Line1Seq = "The first line within the FASTQ contains DNA sequences. Check the FASTQ format", - ExtraNewLine = "Too many new lines found within fastq read", - RegexFindError = "Regex find failed after regex captures was successful", -} - /// A struct to hold the raw sequencing information and transform it if there are sequencing errors #[derive(Clone)] pub struct RawSequenceRead { @@ -247,13 +241,19 @@ impl RawSequenceRead { } } - pub fn add_line(&mut self, line_num: u8, line: String) -> Result<(), Box> { + pub fn add_line(&mut self, line_num: u8, line: String) -> Result<()> { match line_num { 1 => self.description = line, 2 => self.sequence = line, 3 => self.add_description = line, 4 => self.quality_values = line, - _ => return Err(Box::new(FastqError::ExtraNewLine)), + _ => { + return Err(anyhow!( + "Too many new lines found within fastq read\nCurrent read\n{}\nCurrent line: {}", + self, + line + )) + } } Ok(()) } @@ -265,7 +265,7 @@ impl RawSequenceRead { ) } - pub fn unpack(raw_string: String) -> Result> { + pub fn unpack(raw_string: String) -> Result { let mut raw_sequence_read = RawSequenceRead::new(); for (line_num, line) in raw_string.split('\n').enumerate() { let true_line = line_num as u8 + 1; @@ -292,11 +292,7 @@ impl RawSequenceRead { /// Fixes the constant region by finding the closest match within the full seqeuence that has fewer than the max errors allowed, /// then uses the format string to flip the barcodes into the 'N's and have a fixed constant region string - pub fn fix_constant_region( - &mut self, - format_string: &str, - max_constant_errors: u8, - ) -> Result<(), Box> { + pub fn fix_constant_region(&mut self, format_string: &str, max_constant_errors: u8) { // Find the region of the sequence that best matches the constant region. This is doen by iterating through the sequence // Get the length difference between what was sequenced and the barcode region with constant regions // This is to stop the iteration in the next step @@ -315,14 +311,12 @@ impl RawSequenceRead { possible_seqs.push(possible_seq); } // Find the closest match within what was sequenced to the constant region - let best_sequence_option = fix_error(format_string, &possible_seqs, max_constant_errors)?; + let best_sequence_option = fix_error(format_string, &possible_seqs, max_constant_errors); if let Some(best_sequence) = best_sequence_option { self.insert_barcodes_constant_region(format_string, best_sequence); - Ok(()) } else { self.sequence = "".to_string(); - Ok(()) } } @@ -388,32 +382,33 @@ impl RawSequenceRead { false } - pub fn check_fastq_format(&self) -> Result<(), Box> { + pub fn check_fastq_format(&self) -> Result<()> { // Test to see if the first line is not a sequence and the second is a sequence, which is typical fastq format match test_sequence(&self.description) { LineType::Sequence => { - self.print_read(); - return Err(Box::new(FastqError::Line1Seq)); + println!("{}", self); + return Err(anyhow!("The first line within the FASTQ contains DNA sequences. Check the FASTQ format")); } LineType::Metadata => (), } match test_sequence(&self.sequence) { LineType::Sequence => (), LineType::Metadata => { - self.print_read(); - return Err(Box::new(FastqError::Line2NotSeq)); + println!("{}", self); + return Err(anyhow!("The second line within the FASTQ file is not a sequence. Check the FASTQ format")); } } Ok(()) } +} - pub fn print_read(&self) { - println!("Fastq read:"); - println!( +impl fmt::Display for RawSequenceRead { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, "Line 1: {}\nLine 2: {}\nLine 3: {}\nLine 4: {}", self.description, self.sequence, self.add_description, self.quality_values - ); - println!(); + ) } } @@ -456,7 +451,7 @@ impl SequenceMatchResult { counted_barcode_max_errors: &[u8], // The maximum errors allowed for each counted barcode sample_seqs: &HashSet, // A hashset of all known sample barcodes. Will be empty if none are known or included sample_seqs_max_errors: u8, // Maximum allowed sample barcode sequencing errors - ) -> Result> { + ) -> SequenceMatchResult { // Check for sample barcode and start with setting error to false let mut sample_barcode_error = false; let sample_barcode; @@ -472,7 +467,7 @@ impl SequenceMatchResult { } else { // Otherwise try and fix it. If the fix returns none, then save the error and an empty string let sample_barcode_fix_option = - fix_error(sample_barcode_str, sample_seqs, sample_seqs_max_errors)?; + fix_error(sample_barcode_str, sample_seqs, sample_seqs_max_errors); if let Some(fixed_barcode) = sample_barcode_fix_option { sample_barcode = fixed_barcode; } else { @@ -504,7 +499,7 @@ impl SequenceMatchResult { &counted_barcode, &counted_barcode_seqs[index], counted_barcode_max_errors[index], - )?; + ); if let Some(fixed_barcode) = barcode_seq_fix_option { counted_barcode = fixed_barcode; } else { @@ -527,13 +522,13 @@ impl SequenceMatchResult { } else { random_barcode = String::new() } - Ok(SequenceMatchResult { + SequenceMatchResult { sample_barcode, counted_barcodes, counted_barcode_error, sample_barcode_error, random_barcode, - }) + } } /// Returns a comma separated counted barcodes string. Perfect for CSV file writing @@ -557,17 +552,13 @@ impl SequenceMatchResult { /// /// let max_mismatches = barcode.chars().count() as u8 / 5; // allow up to 20% mismatches /// -/// let fixed_error_one = fix_error(barcode, &possible_barcodes_one_match, max_mismatches).unwrap(); -/// let fixed_error_two = fix_error(barcode, &possible_barcodes_two_match, max_mismatches).unwrap(); +/// let fixed_error_one = fix_error(barcode, &possible_barcodes_one_match, max_mismatches); +/// let fixed_error_two = fix_error(barcode, &possible_barcodes_two_match, max_mismatches); /// /// assert_eq!(fixed_error_one, Some("AGCAG".to_string())); /// assert_eq!(fixed_error_two, None); /// ``` -pub fn fix_error<'a, I>( - mismatch_seq: &str, - possible_seqs: I, - mismatches: u8, -) -> Result, Box> +pub fn fix_error<'a, I>(mismatch_seq: &str, possible_seqs: I, mismatches: u8) -> Option where I: IntoIterator, { @@ -603,8 +594,8 @@ where } // If there is one best match and it is some, return it. Otherwise return None if keep && best_match.is_some() { - Ok(best_match) + best_match } else { - Ok(None) + None } }