From 53bab87f0c78eeb6b708068d9d8f87151a82ff96 Mon Sep 17 00:00:00 2001 From: noamteyssier <22600644+noamteyssier@users.noreply.github.com> Date: Thu, 4 Apr 2024 23:24:56 -0700 Subject: [PATCH] refactor: update all bam IO with htslib for increased performance --- src/cli/bam/mod.rs | 2 - src/cli/bam/wrap.rs | 98 --------------------------------- src/cli/outputs.rs | 36 +++++++++++- src/commands/bam/convert/bed.rs | 30 +++++----- src/commands/bam/convert/mod.rs | 19 ++----- src/commands/bam/filter.rs | 75 +++++++++++-------------- src/commands/bam/utils.rs | 54 +++++++----------- src/dispatch.rs | 65 +++------------------- src/io/general.rs | 37 ++++++------- 9 files changed, 127 insertions(+), 289 deletions(-) delete mode 100644 src/cli/bam/wrap.rs diff --git a/src/cli/bam/mod.rs b/src/cli/bam/mod.rs index cdd37bf..5d17383 100644 --- a/src/cli/bam/mod.rs +++ b/src/cli/bam/mod.rs @@ -1,9 +1,7 @@ mod commands; mod convert; mod filter; -mod wrap; pub use commands::BamCommand; pub use convert::{BamConversionType, ConvertArgs, ConvertParams}; pub use filter::{FilterArgs, FilterParams}; -pub use wrap::WrapCigar; diff --git a/src/cli/bam/wrap.rs b/src/cli/bam/wrap.rs deleted file mode 100644 index ee54b51..0000000 --- a/src/cli/bam/wrap.rs +++ /dev/null @@ -1,98 +0,0 @@ -use core::fmt; -use noodles::{ - bam::record::Cigar, - sam::alignment::record::cigar::{op::Kind, Op}, -}; -use std::fmt::{Display, Formatter}; - -/// A wrapper around a CIGAR string for display purposes. -pub struct WrapCigar<'a>(Cigar<'a>); -impl<'a> Display for WrapCigar<'a> { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - for op in self.0.iter() { - if let Ok(op) = op { - write!(f, "{}", WrapOp(op))?; - } else { - Err(fmt::Error)? - } - } - Ok(()) - } -} -impl<'a> From> for WrapCigar<'a> { - fn from(cigar: Cigar<'a>) -> Self { - Self(cigar) - } -} - -/// A wrapper around a CIGAR operation for display purposes. -struct WrapOp(Op); -impl WrapOp { - pub fn kind(&self) -> WrapKind { - self.0.kind().into() - } -} -impl Display for WrapOp { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{}{}", self.0.len(), self.kind()) - } -} - -/// A wrapper around a CIGAR operation kind for display purposes. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum WrapKind { - /// An alignment match (`M`). - Match, - /// An insertion into the reference (`I`). - Insertion, - /// A deletion from the reference (`D`). - Deletion, - /// A skipped region from the reference (`N`). - Skip, - /// A soft clip (`S`). - SoftClip, - /// A hard clip (`H`). - HardClip, - /// Padding (`P`). - Pad, - /// A sequence match (`=`). - SequenceMatch, - /// A sequence mismatch (`X`). - SequenceMismatch, -} -impl From for WrapKind { - fn from(kind: Kind) -> Self { - match kind { - Kind::Match => Self::Match, - Kind::Insertion => Self::Insertion, - Kind::Deletion => Self::Deletion, - Kind::Skip => Self::Skip, - Kind::SoftClip => Self::SoftClip, - Kind::HardClip => Self::HardClip, - Kind::Pad => Self::Pad, - Kind::SequenceMatch => Self::SequenceMatch, - Kind::SequenceMismatch => Self::SequenceMismatch, - } - } -} -impl From for char { - fn from(kind: WrapKind) -> Self { - match kind { - WrapKind::Match => 'M', - WrapKind::Insertion => 'I', - WrapKind::Deletion => 'D', - WrapKind::Skip => 'N', - WrapKind::SoftClip => 'S', - WrapKind::HardClip => 'H', - WrapKind::Pad => 'P', - WrapKind::SequenceMatch => '=', - WrapKind::SequenceMismatch => 'X', - } - } -} -impl Display for WrapKind { - // use serde to convert to string - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{}", char::from(*self)) - } -} diff --git a/src/cli/outputs.rs b/src/cli/outputs.rs index 2179947..9c2c5d7 100644 --- a/src/cli/outputs.rs +++ b/src/cli/outputs.rs @@ -1,6 +1,7 @@ use crate::io::{match_bam_output, match_output}; use anyhow::Result; -use clap::Parser; +use clap::{Parser, ValueEnum}; +use rust_htslib::bam::{Format, HeaderView, Writer as BamWriter}; use std::io::Write; #[derive(Parser, Debug, Clone)] @@ -34,9 +35,38 @@ pub struct BamOutput { /// Output BAM file to write to (default=stdout) #[clap(short, long)] pub output: Option, + + /// Output Format to write to (default=BAM) + #[clap(short = 'O', long, default_value = "bam")] + pub format: WrapHtsFormat, + + /// Threads to use when writing BAM files + #[clap(short = 't', long, default_value = "1")] + pub threads: usize, } impl BamOutput { - pub fn get_writer(&self) -> Result> { - match_bam_output(self.output.clone()) + pub fn get_writer(&self, header: &HeaderView) -> Result { + match_bam_output( + self.output.clone(), + header, + self.format.into(), + self.threads, + ) + } +} + +#[derive(Parser, Debug, Clone, ValueEnum, Copy)] +pub enum WrapHtsFormat { + Bam, + Sam, + Cram, +} +impl From for Format { + fn from(format: WrapHtsFormat) -> Self { + match format { + WrapHtsFormat::Bam => Format::Bam, + WrapHtsFormat::Sam => Format::Sam, + WrapHtsFormat::Cram => Format::Cram, + } } } diff --git a/src/commands/bam/convert/bed.rs b/src/commands/bam/convert/bed.rs index 26f94fb..0a208c5 100644 --- a/src/commands/bam/convert/bed.rs +++ b/src/commands/bam/convert/bed.rs @@ -1,19 +1,17 @@ -use crate::cli::bam::{ConvertParams, WrapCigar}; +use crate::cli::bam::ConvertParams; use crate::commands::bam::utils::{ get_strand, parse_chr_name, parse_endpoints, parse_mapping_quality, parse_query_name, }; use crate::io::build_writer; use anyhow::Result; -use noodles::bam::io::Reader; -use noodles::bam::Record as BamRecord; -use noodles::sam::Header; -use std::io::{stdout, BufWriter, Read, Write}; +use rust_htslib::bam::{HeaderView, Read, Reader as BamReader, Record}; +use std::io::{stdout, Write}; use std::str::from_utf8; fn format_print_record( - record: &BamRecord, - header: &Header, + record: &Record, + header: &HeaderView, params: &ConvertParams, wtr: &mut csv::Writer, ) -> Result<()> { @@ -22,9 +20,9 @@ fn format_print_record( let qname = parse_query_name(record)?; let mapq = parse_mapping_quality(record); let strand = get_strand(record); - + // if params.bed.cigar { - let cigar: WrapCigar = record.cigar().into(); + let cigar = record.cigar(); let tuple = ( from_utf8(chr_name)?, start, @@ -49,14 +47,12 @@ fn format_print_record( Ok(()) } -pub fn convert_bed( - mut bam: Reader, - header: Header, - params: ConvertParams, -) -> Result<()> { - let mut wtr = build_writer(BufWriter::new(stdout())); - for record in bam.records() { - let record = record?; +pub fn convert_bed(mut bam: BamReader, params: ConvertParams) -> Result<()> { + let header = bam.header().clone(); + let mut wtr = build_writer(stdout()); + let mut record = Record::new(); + while let Some(result) = bam.read(&mut record) { + result?; format_print_record(&record, &header, ¶ms, &mut wtr)?; } wtr.flush()?; diff --git a/src/commands/bam/convert/mod.rs b/src/commands/bam/convert/mod.rs index 1abab63..09d3809 100644 --- a/src/commands/bam/convert/mod.rs +++ b/src/commands/bam/convert/mod.rs @@ -5,18 +5,11 @@ use crate::cli::bam::{BamConversionType, ConvertArgs, ConvertParams}; use crate::io::match_bam_input; use anyhow::{bail, Result}; -use noodles::bam::io::reader::Builder; -use noodles::bam::io::Reader; -use noodles::sam::Header; -use std::io::Read; +use rust_htslib::bam::Reader as BamReader; -fn dispatch_conversion( - bam: Reader, - header: Header, - params: ConvertParams, -) -> Result<()> { +fn dispatch_conversion(bam: BamReader, params: ConvertParams) -> Result<()> { match params.conv { - BamConversionType::Bed => convert_bed(bam, header, params), + BamConversionType::Bed => convert_bed(bam, params), _ => bail!( "FASTQ conversion is not implemented yet - but checkout samtools fastq for a solution" ), @@ -24,8 +17,6 @@ fn dispatch_conversion( } pub fn convert(args: ConvertArgs) -> Result<()> { - let in_handle = match_bam_input(args.input.input)?; - let mut bam = Builder.build_from_reader(in_handle); - let header = bam.read_header()?; - dispatch_conversion(bam, header, args.params) + let bam = match_bam_input(args.input.input)?; + dispatch_conversion(bam, args.params) } diff --git a/src/commands/bam/filter.rs b/src/commands/bam/filter.rs index 3b8cfa2..0207d8d 100644 --- a/src/commands/bam/filter.rs +++ b/src/commands/bam/filter.rs @@ -8,18 +8,12 @@ use crate::{ use super::utils::{parse_chr_name, parse_endpoints}; use anyhow::Result; use bedrs::{traits::IntervalBounds, types::QueryMethod, IntervalContainer}; -use noodles::bam::io::Writer as BamWriter; -use noodles::bam::Record as BamRecord; -use noodles::{ - bam::io::{reader::Builder, Reader}, - sam::Header, -}; +use rust_htslib::bam::{HeaderView, Read, Reader as BamReader, Record, Writer as BamWriter}; use serde::Serialize; -use std::io::{Read, Write}; fn temp_bed3( - record: &BamRecord, - header: &Header, + record: &Record, + header: &HeaderView, translater: &SplitTranslater, ) -> Result> { let chr_bytes = parse_chr_name(record, header)?; @@ -33,17 +27,16 @@ fn temp_bed3( Ok(Some(NumericBed3::new(chr_idx, start, end))) } -fn run_inverted_overlap( - record: &BamRecord, - header: &Header, +fn run_inverted_overlap( + record: &Record, + header: &HeaderView, set: &IntervalContainer, translater: &SplitTranslater, query_method: QueryMethod, - wtr: &mut BamWriter, + wtr: &mut BamWriter, ) -> Result<()> where I: IntervalBounds + Copy + Serialize, - W: Write, WriteNamedIterImpl: WriteNamedIter, { if let Some(bed) = temp_bed3(record, header, translater)? { @@ -52,25 +45,24 @@ where .next() .is_none(); if no_overlaps { - wtr.write_record(header, record)?; + wtr.write(record)?; } } else { - wtr.write_record(header, record)?; + wtr.write(record)?; } Ok(()) } -fn run_overlap( - record: &BamRecord, - header: &Header, +fn run_overlap( + record: &Record, + header: &HeaderView, set: &IntervalContainer, translater: &SplitTranslater, query_method: QueryMethod, - wtr: &mut BamWriter, + wtr: &mut BamWriter, ) -> Result<()> where I: IntervalBounds + Copy + Serialize, - W: Write, WriteNamedIterImpl: WriteNamedIter, { if let Some(bed) = temp_bed3(record, header, translater)? { @@ -79,48 +71,47 @@ where .next() .is_some(); if any_overlaps { - wtr.write_record(header, record)?; + wtr.write(record)?; } } Ok(()) } -fn run_filter( - mut bam: Reader, - header: Header, +fn run_filter( + bam: &mut BamReader, mut set: IntervalContainer, translater: Option<&SplitTranslater>, params: FilterParams, - writer: W, + writer: &mut BamWriter, ) -> Result<()> where I: IntervalBounds + Copy + Serialize, - R: Read, - W: Write, WriteNamedIterImpl: WriteNamedIter, { + // Get the header + let header = bam.header().clone(); + // Sort the BED Set set.sort(); // Export the translater let translater = translater.unwrap(); - // Initialize the BAM Writer - let mut wtr = BamWriter::new(writer); - wtr.write_header(&header)?; - // Initialize the overlap query method let query_method: QueryMethod = params.overlap_predicates.into(); + // Initialize an empty record to avoid repeated allocations + let mut record = Record::new(); + if params.output_predicates.invert { - for record in bam.records() { - let record = record?; - run_inverted_overlap(&record, &header, &set, translater, query_method, &mut wtr)?; + while let Some(result) = bam.read(&mut record) { + result?; + run_inverted_overlap(&record, &header, &set, translater, query_method, writer)?; } } else { - for record in bam.records() { - let record = record?; - run_overlap(&record, &header, &set, translater, query_method, &mut wtr)?; + while let Some(result) = bam.read(&mut record) { + result?; + run_overlap(&record, &header, &set, translater, query_method, writer)?; } } Ok(()) @@ -128,9 +119,7 @@ where pub fn filter(args: FilterArgs) -> Result<()> { let bed_reader = args.inputs.get_reader_bed()?; - let bam_handle = match_bam_input(args.inputs.bam)?; - let mut bam = Builder.build_from_reader(bam_handle); - let header = bam.read_header()?; - let writer = args.output.get_writer()?; - dispatch_single_with_bam!(bam, header, bed_reader, writer, args.params, run_filter) + let mut bam = match_bam_input(args.inputs.bam)?; + let mut writer = args.output.get_writer(bam.header())?; + dispatch_single_with_bam!(&mut bam, bed_reader, &mut writer, args.params, run_filter) } diff --git a/src/commands/bam/utils.rs b/src/commands/bam/utils.rs index 461278a..75bee5b 100644 --- a/src/commands/bam/utils.rs +++ b/src/commands/bam/utils.rs @@ -1,56 +1,42 @@ use anyhow::{bail, Result}; -use noodles::bam::Record as BamRecord; -use noodles::sam::alignment::Record; -use noodles::sam::Header; +use rust_htslib::bam::{HeaderView, Record}; -pub fn parse_chr_name<'a>(record: &BamRecord, header: &'a Header) -> Result<&'a [u8]> { - if let Some(chr) = record.reference_sequence(header) { - let (chr_name, _map) = chr?; - Ok(chr_name.as_ref()) - } else { +pub fn parse_chr_name<'a>(record: &Record, header: &'a HeaderView) -> Result<&'a [u8]> { + let tid = record.tid(); + if tid < 0 { bail!("Record is missing chr name"); } + let chr_name = header.tid2name(tid as u32); + Ok(chr_name) } -pub fn parse_endpoints(record: &BamRecord) -> Result<(usize, usize)> { - let start = if let Some(start) = record.alignment_start() { - // Adjust to 0-based - start?.get() - 1 - } else { - bail!("Record is missing start"); - }; - let end = if let Some(end) = record.alignment_end() { - end?.get() - } else { - bail!("Record is missing end"); - }; +pub fn parse_endpoints(record: &Record) -> Result<(usize, usize)> { + let start = record.pos() as usize; + let end = record.cigar().end_pos() as usize; Ok((start, end)) } const FIRST_SEGMENT: &[u8] = &[b'/', b'1']; const LAST_SEGMENT: &[u8] = &[b'/', b'2']; -pub fn parse_query_name(record: &BamRecord) -> Result> { - if let Some(name) = record.name() { - if record.flags().is_segmented() { - if record.flags().is_first_segment() { - Ok([name.as_bytes(), FIRST_SEGMENT].concat()) - } else { - Ok([name.as_bytes(), LAST_SEGMENT].concat()) - } +pub fn parse_query_name(record: &Record) -> Result> { + let name = record.qname(); + if record.is_paired() { + if record.is_reverse() { + Ok([name, LAST_SEGMENT].concat()) } else { - Ok(name.as_bytes().to_vec()) + Ok([name, FIRST_SEGMENT].concat()) } } else { - bail!("Record is missing query name"); + Ok(name.to_vec()) } } -pub fn parse_mapping_quality(record: &BamRecord) -> u8 { - record.mapping_quality().map(|x| x.get()).unwrap_or(255) +pub fn parse_mapping_quality(record: &Record) -> u8 { + record.mapq() } -pub fn get_strand(record: &BamRecord) -> char { - if record.flags().is_reverse_complemented() { +pub fn get_strand(record: &Record) -> char { + if record.is_reverse() { '-' } else { '+' diff --git a/src/dispatch.rs b/src/dispatch.rs index 83a3e01..ed9eee1 100644 --- a/src/dispatch.rs +++ b/src/dispatch.rs @@ -77,84 +77,35 @@ macro_rules! dispatch_single_owned_tl { /// a writer alongside a BAM reader and header. #[macro_export] macro_rules! dispatch_single_with_bam { - ($bam_reader:expr, $bam_header:expr, $bed_reader:expr, $writer:expr, $params:expr, $func:expr) => { + ($bam_reader:expr, $bed_reader:expr, $writer:expr, $params:expr, $func:expr) => { match $bed_reader.input_format() { InputFormat::Bed3 => { let (set, translater) = $bed_reader.bed3_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::Bed4 => { let (set, translater) = $bed_reader.bed4_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::Bed6 => { let (set, translater) = $bed_reader.bed6_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::Bed12 => { let (set, translater) = $bed_reader.bed12_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::Gtf => { let (set, translater) = $bed_reader.gtf_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::Ambiguous => { let (set, translater) = $bed_reader.meta_interval_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } InputFormat::BedGraph => { let (set, translater) = $bed_reader.bedgraph_set()?; - $func( - $bam_reader, - $bam_header, - set, - translater.as_ref(), - $params, - $writer, - ) + $func($bam_reader, set, translater.as_ref(), $params, $writer) } } }; diff --git a/src/io/general.rs b/src/io/general.rs index 3c48e5a..de3c488 100644 --- a/src/io/general.rs +++ b/src/io/general.rs @@ -2,6 +2,7 @@ use anyhow::Result; use gzp::deflate::Bgzf; use gzp::{Compression, ZBuilder}; use niffler::get_reader; +use rust_htslib::bam::{Format, Header, HeaderView, Reader as BamReader, Writer as BamWriter}; use std::ffi::OsStr; use std::path::Path; use std::{ @@ -32,19 +33,10 @@ pub fn match_input(input: Option) -> Result> { } } -pub fn match_bam_input(input: Option) -> Result> { +pub fn match_bam_input(input: Option) -> Result { match input { - Some(filename) => { - let file = File::open(filename)?; - let reader = BufReader::new(file); - Ok(Box::new(reader)) - } - None => { - let stdin = std::io::stdin(); - let handle = stdin.lock(); - let buffer = BufReader::new(handle); - Ok(Box::new(buffer)) - } + Some(filename) => Ok(BamReader::from_path(filename)?), + None => Ok(BamReader::from_stdin()?), } } @@ -84,16 +76,19 @@ pub fn match_output( } } -pub fn match_bam_output(path: Option) -> Result> { - if let Some(path) = path { - let file = File::create(path)?; - let buffer = BufWriter::new(file); - Ok(Box::new(buffer)) +pub fn match_bam_output( + path: Option, + header: &HeaderView, + format: Format, + n_threads: usize, +) -> Result { + let mut writer = if let Some(filename) = path { + BamWriter::from_path(filename, &Header::from_template(header), format) } else { - let stdout = std::io::stdout(); - let buffer = BufWriter::new(stdout); - Ok(Box::new(buffer)) - } + BamWriter::from_stdout(&Header::from_template(header), format) + }?; + writer.set_threads(n_threads)?; + Ok(writer) } #[cfg(test)]