From 6dd86255e89cfe5c54bcd6e3ad7f1211240640a2 Mon Sep 17 00:00:00 2001 From: Rory Coffey Date: Sun, 24 Sep 2023 20:41:26 -0400 Subject: [PATCH 1/4] converted to bgzip reader for fastq.gz files to fix early ending --- Cargo.toml | 2 +- src/input.rs | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index eeb8108..3a45fb7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,6 @@ regex = "1.5" clap = "2.33.0" itertools = "0.10" num_cpus = "1.0" -flate2 = "1" chrono = "0.4" num-format = "0.4" +rust-htslib = "0.44" diff --git a/src/input.rs b/src/input.rs index 5a34e16..82f5504 100644 --- a/src/input.rs +++ b/src/input.rs @@ -1,5 +1,4 @@ use anyhow::{bail, Context, Result}; -use flate2::read::GzDecoder; use num_format::{Locale, ToFormattedString}; use std::{ collections::VecDeque, @@ -11,6 +10,7 @@ use std::{ Arc, Mutex, }, }; +use rust_htslib::bgzf; use crate::parse::RawSequenceRead; @@ -27,13 +27,13 @@ pub fn read_fastq( exit_clone: Arc, total_reads_arc: Arc, ) -> Result<()> { - let fastq_file = File::open(&fastq).context(format!("Failed to open file: {}", fastq))?; // open file // Create a fastq line reader which keeps track of line number, reads, and posts the sequence to the shared vector let mut fastq_line_reader = FastqLineReader::new(seq_clone, exit_clone); // If the file is not gzipped use BufReader to read in lines if !fastq.ends_with("fastq.gz") { + let fastq_file = File::open(&fastq).context(format!("Failed to open file: {}", fastq))?; // open file // If the file does not end with fastq, return with an error if !fastq.ends_with("fastq") { bail!("This program only works with *.fastq files and *.fastq.gz files. The latter is still experimental") @@ -61,15 +61,13 @@ pub fn read_fastq( println!("If this program stops reading before the expected number of sequencing reads, unzip the gzipped fastq and rerun."); println!(); // stream in first by decoding with GzDecoder, the reading into buffer - let mut reader = BufReader::new(GzDecoder::new(fastq_file)); + let mut reader = BufReader::new(bgzf::Reader::from_path(fastq)?); - // artificially set the read response to 10. The first number does not matter - let mut read_response = 10; + let mut stdout = std::io::stdout(); + let mut lock = stdout.lock(); // continue reading until there is a response of 0, which indicates the end of file. This may be where some gzipped files abrupty end - while read_response != 0 { - let mut line = String::new(); - // move the read line to the line variable and get the response to check if it is 0 and therefore the file is done - read_response = reader.read_line(&mut line)?; + let mut line = String::new(); + while let Ok(_read_response) = reader.read_line(&mut line) { // post the line to the shared vector and keep track of the number of sequences etc fastq_line_reader.read(line); if fastq_line_reader.line_num == 4 { @@ -77,9 +75,10 @@ pub fn read_fastq( } // Add to read count to print numnber of sequences read by this thread if fastq_line_reader.total_reads % 10000 == 0 { - print!("{}", fastq_line_reader); - std::io::stdout().flush()?; + write!(lock, "{}", fastq_line_reader)?; + stdout.flush()?; } + line = String::new(); } } // Display the final total read count From 47dc40f1c3869a6d6a6e44dd80ab1a7eb6d0144a Mon Sep 17 00:00:00 2001 From: Rory Coffey Date: Sun, 24 Sep 2023 21:37:21 -0400 Subject: [PATCH 2/4] increased version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3a45fb7..055b129 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "barcode-count" description = "NGS barcode counter for DEL, CRISPR-seq, and Barcode-seq" -version = "0.10.0" +version = "0.11.0" edition = "2021" license = "Apache-2.0" readme = "README.md" From 5102b67c2dda6c6e44d1b0e1455206b53e85a910 Mon Sep 17 00:00:00 2001 From: Rory Coffey Date: Sun, 24 Sep 2023 21:38:05 -0400 Subject: [PATCH 3/4] added time for gzipped --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e125487..90a8d67 100644 --- a/README.md +++ b/README.md @@ -181,9 +181,9 @@ Counted barcode mismatches: 5,682,306 Duplicates: 0 Low quality barcodes: 0 -Compute time: 0 hours, 28 minutes, 43.359 seconds +Compute time: 0 hours, 23 minutes, 20.122 seconds -WRITING COUNTS- -Total time: 0 hours, 29 minutes, 8.219 seconds +Total time: 0 hours, 23 minutes, 47.645 seconds ``` From 75e17986ad400577a52653816bf466fa6b0ddb9b Mon Sep 17 00:00:00 2001 From: Rory Coffey Date: Sun, 24 Sep 2023 21:38:38 -0400 Subject: [PATCH 4/4] added read response length to stop when at EOF of gzip --- src/input.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/input.rs b/src/input.rs index 82f5504..45dbb62 100644 --- a/src/input.rs +++ b/src/input.rs @@ -65,9 +65,11 @@ pub fn read_fastq( let mut stdout = std::io::stdout(); let mut lock = stdout.lock(); + let mut read_response = 10; // continue reading until there is a response of 0, which indicates the end of file. This may be where some gzipped files abrupty end - let mut line = String::new(); - while let Ok(_read_response) = reader.read_line(&mut line) { + while read_response != 0 { + let mut line = String::new(); + read_response = reader.read_line(&mut line)?; // post the line to the shared vector and keep track of the number of sequences etc fastq_line_reader.read(line); if fastq_line_reader.line_num == 4 { @@ -78,7 +80,6 @@ pub fn read_fastq( write!(lock, "{}", fastq_line_reader)?; stdout.flush()?; } - line = String::new(); } } // Display the final total read count