From b26ebd17fcbf281b2e1ca12be8634e666797f0f0 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Mon, 4 Mar 2024 00:11:41 -0800 Subject: [PATCH] New `granges hist-feature`, which is pretty neat! - New TSV writer convenience functions, `build_tsv_writer()` and `build_tsv_writer_with_config()`. - New `hist-feature` command, and `HistFeatures` struct. - New `GRangesEmpty::into_granges_data()` which zips an empty GRanges with a new set of data (re-indexing it). - `GRanges::from_iter_ok()` and analagous function for `GRangesEmpty`, which take input ranges stream that is not wrapped in `Result`. - New `overlaps()` method (more to come) for getting number of overlaps. - New `MergingEmptyIterator` (for ranges stream that is not in `Result`. - `GenomicRangeRecord::into_empty()` added. - TODO/NOTE: `granges hist-feature` has no analagous bedtools version, so this is not validated or extensively tested yet. --- src/commands.rs | 225 ++++++++++++++++++++++++++++++++------- src/error.rs | 3 + src/granges.rs | 150 ++++++++++++++++++++------ src/io/tsv.rs | 4 +- src/join.rs | 5 + src/lib.rs | 3 +- src/main/mod.rs | 4 +- src/merging_iterators.rs | 59 ++++++++++ src/ranges/mod.rs | 12 ++- 9 files changed, 386 insertions(+), 79 deletions(-) diff --git a/src/commands.rs b/src/commands.rs index 2e5a036..8b9c6d9 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -3,14 +3,20 @@ // TODO: these functions should be methods of the input struct. use clap::Parser; -use csv::WriterBuilder; -use std::{fs::File, io, path::PathBuf}; +use csv::{Writer, WriterBuilder}; +use std::{ + collections::HashMap, + fs::File, + io::{self, Write}, + path::PathBuf, +}; use crate::{ data::{operations::FloatOperation, SerializableDatumType}, io::{ parsers::{Bed5Iterator, GenomicRangesParser}, tsv::BED_TSV, + TsvConfig, }, merging_iterators::{MergingEmptyResultIterator, MergingResultIterator}, prelude::*, @@ -20,6 +26,48 @@ use crate::{ Position, PositionOffset, }; +/// Build a new TSV writer +pub fn build_tsv_writer( + output: Option>, +) -> Result>, GRangesError> { + let output = output.map(|path| path.into()); + let writer_boxed: Box = match &output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + let writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + Ok(writer) +} + +/// Build a new TSV writer with config, i.e. for manual headers, metadata etc. +// TODO: use proper builder pattern here and above? +pub fn build_tsv_writer_with_config( + output: Option>, + config: &TsvConfig, +) -> Result>, GRangesError> { + let output = output.map(|path| path.into()); + let mut writer_boxed: Box = match &output { + Some(path) => Box::new(File::create(path)?), + None => Box::new(io::stdout()), + }; + + if let Some(headers) = &config.headers { + writeln!(writer_boxed, "{}", headers.join("\t"))?; + } + + let writer = WriterBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_writer(writer_boxed); + + Ok(writer) +} + /// An `enum` to indicate whether an streaming or in-memory algorithm should be used. #[derive(Clone)] pub enum ProcessingMode { @@ -60,15 +108,7 @@ pub fn granges_adjust( ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; - let writer_boxed: Box = match output { - Some(path) => Box::new(File::create(path)?), - None => Box::new(io::stdout()), - }; - - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .from_writer(writer_boxed); + let mut writer = build_tsv_writer(output)?; // For reporting stuff to the user. let mut report = Report::new(); @@ -342,15 +382,7 @@ pub fn granges_flank( } }, ProcessingMode::Streaming => { - let writer_boxed: Box = match output { - Some(path) => Box::new(File::create(path)?), - None => Box::new(io::stdout()), - }; - - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .from_writer(writer_boxed); + let mut writer = build_tsv_writer(output)?; match ranges_iter { // FIXME: code redundancy. But too early now to design traits, etc. @@ -580,14 +612,7 @@ impl Merge { let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; let func = &self.func; - let writer_boxed: Box = match &self.output { - Some(path) => Box::new(File::create(path)?), - None => Box::new(io::stdout()), - }; - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .from_writer(writer_boxed); + let mut writer = build_tsv_writer(self.output.as_ref())?; match ranges_iter { GenomicRangesParser::Bed3(iter) => { @@ -658,19 +683,10 @@ impl FilterChroms { pub fn run(&self) -> Result, GRangesError> { let bedfile = &self.bedfile; let genome = read_seqlens(&self.genome)?; - - let writer_boxed: Box = match &self.output { - Some(path) => Box::new(File::create(path)?), - None => Box::new(io::stdout()), - }; - - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(false) - .from_writer(writer_boxed); - let bedlike_iterator = BedlikeIterator::new(bedfile)?; + let mut writer = build_tsv_writer(self.output.as_ref())?; + // If we don't need to sort, use iterator-based streaming processing. for record in bedlike_iterator { let range = record?; @@ -684,3 +700,134 @@ impl FilterChroms { Ok(CommandOutput::new((), None)) } } + +// tranpose two nested vecs +// thanks to this clever solution: https://stackoverflow.com/a/64499219/147427 +fn transpose(v: Vec>) -> Vec> { + assert!(!v.is_empty()); + let len = v[0].len(); + let mut iters: Vec<_> = v.into_iter().map(|n| n.into_iter()).collect(); + (0..len) + .map(|_| { + iters + .iter_mut() + .map(|n| n.next().unwrap()) + .collect::>() + }) + .collect() +} + +/// Histogram a set of features for a defined window size. +/// +/// # Tips +/// This is useful for a quick exploratory look at feature density +/// by window, e.g. with +/// +/// $ hist-features --bedfile hg38_ncbiRefSeq.bed.gz --width 1000 --genome \ +/// hg38_seqlens.tsv --headers | xsv table -d'\t' | less +/// +/// The xsv tool (https://github.com/BurntSushi/xsv) is useful for visualizing +/// the results more clearly. +#[derive(Parser)] +pub struct HistFeatures { + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, + + /// The input BED file. + #[arg(short, long, required = true)] + bedfile: PathBuf, + + /// Width (in basepairs) of each window. + #[arg(short, long)] + width: Position, + + /// Step width (by default: window size). + #[arg(short, long)] + step: Option, + + /// If last window remainder is shorter than width, remove? + #[arg(short, long)] + chop: bool, + + /// Whether to output the results as a TSV with headers. + #[arg(short = 'l', long)] + headers: bool, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, +} + +impl HistFeatures { + pub fn run(&self) -> Result, GRangesError> { + let bedfile = &self.bedfile; + let genome = read_seqlens(&self.genome)?; + + let bed4_iter = Bed4Iterator::new(bedfile)?; + + // Split the elements in the iterator by feature into multiple GRanges objects. + let mut records_by_features: HashMap> = HashMap::new(); + for result in bed4_iter { + let range = result?; + let feature = &range.data.name; + let vec = records_by_features.entry(feature.to_string()).or_default(); + vec.push(range.into_empty()) // drop the feature, since we're hashing on it + } + + // Load all the *merged* split records into memory as interval trees. + let mut gr_by_features: HashMap> = HashMap::new(); + for (feature, ranges) in records_by_features.into_iter() { + // doing a streaming merge of each feature; bookend merge + let merging_iter = MergingEmptyIterator::new(ranges, 0); + + // load into memory and convert to interval trees + let gr = GRangesEmpty::from_iter_ok(merging_iter, &genome)?.into_coitrees()?; + + assert!(!gr_by_features.contains_key(&feature)); + gr_by_features.insert(feature, gr); + } + + // Now, create windows. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + + let features: Vec<_> = gr_by_features.keys().cloned().collect(); + let mut feature_matrix = Vec::new(); + for (_feature, gr) in gr_by_features.into_iter() { + // find the features that overlap each window + // TODO/OPTIMIZE: how costly is this clone? + let window_counts = windows.clone() + .left_overlaps(&gr)? + .map_joins(|joins| { + // these are merged, so this is the number of *unique* basepairs + let total_overlaps: Position = joins.join.overlaps().iter().cloned().sum(); + total_overlaps + })? + .take_data()?; + feature_matrix.push(window_counts); + } + + let feature_matrix = transpose(feature_matrix); + + // Unite the windows with the feature matrix. + let windows = GRangesEmpty::from_windows(&genome, self.width, self.step, self.chop)?; + let windows = windows.into_granges_data(feature_matrix)?; + + // Write everything. + if !self.headers { + // we have to *manually* add headers (serde needs flat structs otherwise) + windows.write_to_tsv(self.output.as_ref(), &BED_TSV)?; + } else { + let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()]; + headers.extend(features); + + let config = TsvConfig { + no_value_string: "NA".to_string(), + headers: Some(headers), + }; + windows.write_to_tsv(self.output.as_ref(), &config)?; + } + + Ok(CommandOutput::new((), None)) + } +} diff --git a/src/error.rs b/src/error.rs index 7211d2e..5cb0988 100644 --- a/src/error.rs +++ b/src/error.rs @@ -140,6 +140,9 @@ pub enum GRangesError { #[error("The GRanges object is invalid because it lacks an associated data container. Ensure data is loaded or associated with the GRanges object before attempting operations that require data.")] NoDataContainer, + #[error("The GRangesEmpty object cannot be united with this data because they are different lengths ({0} ≠ {1}")] + InvalidDataContainer(usize, usize), + #[error("The supplied GRanges object and data container cannot be united into a new GRanges since they have differing lengths.")] IncompatableGRangesAndData, diff --git a/src/granges.rs b/src/granges.rs index 4dcd600..cc1fbaf 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -37,14 +37,14 @@ //! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator //! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees -use std::{collections::HashSet, fs::File, io, path::PathBuf}; +use std::{collections::HashSet, path::PathBuf}; -use csv::WriterBuilder; use genomap::GenomeMap; use indexmap::IndexMap; use serde::Serialize; use crate::{ + commands::build_tsv_writer_with_config, ensure_eq, io::tsv::TsvConfig, iterators::{GRangesIterator, GRangesRecordIterator}, @@ -139,19 +139,20 @@ where } } -impl GRanges -where - C: RangeContainer + Clone, -{ - /// Create a new [`GRanges`] object by cloning the ranges of this one, - /// and associating the supplied data with it (this consumes the data). - pub fn clone_with_data(&self, data: Option) -> GRanges { - GRanges { - ranges: self.ranges.clone(), - data, - } - } -} +// NOTE: not safe -- not used, so removed. +// impl GRanges +// where +// C: RangeContainer + Clone, +// { +// /// Create a new [`GRanges`] object by cloning the ranges of this one, +// /// and associating the supplied data with it (this consumes the data). +// pub fn clone_with_data(&self, data: Option) -> GRanges { +// GRanges { +// ranges: self.ranges.clone(), +// data, +// } +// } +// } impl GRangesEmpty where @@ -231,15 +232,7 @@ where output: Option>, config: &TsvConfig, ) -> Result<(), GRangesError> { - let writer_boxed: Box = match output { - Some(path) => Box::new(File::create(path.into())?), - None => Box::new(io::stdout()), - }; - - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(config.headers) - .from_writer(writer_boxed); + let mut writer = build_tsv_writer_with_config(output, config)?; for range in self.iter_ranges() { let record = range.to_record( @@ -267,15 +260,7 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges output: Option>, config: &TsvConfig, ) -> Result<(), GRangesError> { - let writer_boxed: Box = match output { - Some(path) => Box::new(File::create(path.into())?), - None => Box::new(io::stdout()), - }; - - let mut writer = WriterBuilder::new() - .delimiter(b'\t') - .has_headers(config.headers) - .from_writer(writer_boxed); + let mut writer = build_tsv_writer_with_config(output, config)?; for range in self.iter_ranges() { let record = range.to_record_empty::<()>(&self.seqnames()); @@ -330,6 +315,37 @@ impl GRanges, T> { } } +impl GRangesEmpty +where + C: IterableRangeContainer, +{ + /// Consume this [`GRangesEmpty`], uniting it with a [`Vec`] data container + /// by indexing the data in order + pub fn into_granges_data( + self, + data: Vec, + ) -> Result>, GRangesError> { + if self.len() != data.len() { + return Err(GRangesError::InvalidDataContainer(self.len(), data.len())); + } + let seqlens = self.seqlens(); + let mut gr: GRanges> = GRanges::new_vec(&seqlens); + + let mut index = 0; + for (seqname, ranges_empty) in self.0.ranges.iter() { + // unwrap should be safe, since seqname is produced from ranges iterator. + for range_empty in ranges_empty.iter_ranges() { + gr.push_range_with_index(seqname, range_empty.start, range_empty.end, index)?; + index += 1; + } + } + Ok(GRanges { + ranges: gr.take_ranges(), + data: Some(data), + }) + } +} + impl GRanges where C: IterableRangeContainer, @@ -924,6 +940,7 @@ where ) -> Result { let mut gr: GRanges> = GRanges::new_vec(&self.0.seqlens()); + gr.data = Some(JoinData::new((), &())); for (seqname, left_ranges) in self.0.ranges.iter() { for left_range in left_ranges.iter_ranges() { @@ -1170,6 +1187,46 @@ impl GRanges, T> { impl GRanges> { /// Create a new [`GRanges>`] object from an iterator over /// [`GenomicRangeRecord`] records. + pub fn from_iter_ok( + iter: I, + seqlens: &IndexMap, + ) -> Result>, GRangesError> + where + I: Iterator>, + { + let mut gr = GRanges::new_vec(seqlens); + for entry in iter { + gr.push_range(&entry.seqname, entry.start, entry.end, entry.data)?; + } + Ok(gr) + } +} + +impl GRangesEmpty { + /// Create a new [`GRangesEmpty`] object from an iterator over + /// [`GenomicRangeRecordEmpty`] records. + pub fn from_iter_ok( + iter: I, + seqlens: &IndexMap, + ) -> Result, GRangesError> + where + I: Iterator, + { + let mut gr = GRangesEmpty::new_vec(seqlens); + for entry in iter { + gr.push_range(&entry.seqname, entry.start, entry.end)?; + } + Ok(gr) + } +} + +impl GRanges> { + /// Create a new [`GRanges>`] object from a parsing iterator over + /// [`Result, GRangesError>`] records. + /// + /// # ⚠️ Stability + /// + /// This may be renamed. pub fn from_iter( iter: I, seqlens: &IndexMap, @@ -1187,8 +1244,12 @@ impl GRanges> { } impl GRangesEmpty { - /// Create a new [`GRanges>`] object from an iterator over - /// [`GenomicRangeRecord`] records. + /// Create a new [`GRanges>`] object from a parsing iterator over + /// [`Result`] records. + /// + /// # ⚠️ Stability + /// + /// This may be renamed. pub fn from_iter( iter: I, seqlens: &IndexMap, @@ -1813,6 +1874,25 @@ mod tests { // rest are empty TODO should check } + #[test] + fn test_left_with_data_both_empty() { + let sl = seqlens!("chr1" => 50); + let mut left_gr: GRanges> = GRanges::new_vec(&sl); + left_gr.push_range("chr1", 1, 2, 1.1).unwrap(); + left_gr.push_range("chr1", 5, 7, 2.8).unwrap(); + let left_gr = left_gr.into_granges_empty().unwrap(); + + let windows = GRangesEmpty::from_windows(&sl, 10, None, true) + .unwrap() + .into_coitrees() + .unwrap(); + + let joined_results = left_gr.left_overlaps(&windows).unwrap(); + + // check the joined results + assert_eq!(joined_results.len(), 2) + } + #[test] fn test_left_with_data_right_empty() { let sl = seqlens!("chr1" => 50); diff --git a/src/io/tsv.rs b/src/io/tsv.rs index f287f2f..0f99fb7 100644 --- a/src/io/tsv.rs +++ b/src/io/tsv.rs @@ -6,7 +6,7 @@ lazy_static! { /// The standard BED format TSV configuration. pub static ref BED_TSV: TsvConfig = TsvConfig { no_value_string: ".".to_string(), - headers: false, + headers: None, }; } @@ -16,5 +16,5 @@ lazy_static! { #[derive(Debug, Clone)] pub struct TsvConfig { pub no_value_string: String, - pub headers: bool, + pub headers: Option>, } diff --git a/src/join.rs b/src/join.rs index abdcaaf..44f08fe 100644 --- a/src/join.rs +++ b/src/join.rs @@ -72,6 +72,11 @@ impl LeftGroupedJoin { pub fn num_overlaps(&self) -> usize { self.overlaps.len() } + + /// Retrieve the right overlaps. + pub fn overlaps(&self) -> &Vec { + &self.overlaps + } } /// [`JoinData`] contains a [`Vec`] of all overlap joins, diff --git a/src/lib.rs b/src/lib.rs index 5043309..28d4f77 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -425,7 +425,8 @@ pub mod prelude { }; pub use crate::merging_iterators::{ - ConditionalMergingResultIterator, MergingEmptyResultIterator, MergingResultIterator, + ConditionalMergingResultIterator, MergingEmptyIterator, MergingEmptyResultIterator, + MergingResultIterator, }; pub use crate::traits::{ AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenericRangeOperations, diff --git a/src/main/mod.rs b/src/main/mod.rs index 373a852..243a3e7 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -4,7 +4,7 @@ use clap::{Parser, Subcommand}; use granges::{ commands::{ granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, FilterChroms, - Merge, ProcessingMode, + HistFeatures, Merge, ProcessingMode, }, data::operations::FloatOperation, prelude::GRangesError, @@ -142,6 +142,7 @@ enum Commands { #[arg(long)] in_mem: bool, }, + HistFeatures(HistFeatures), /// Do a "left grouped join", on the specified left and right genomic ranges, /// and apply one or more functions to the BED5 scores for all right genomic /// ranges. @@ -308,6 +309,7 @@ fn run() -> Result<(), GRangesError> { ) } // NOTE: this is the new API, so clean! + Some(Commands::HistFeatures(hist_features)) => hist_features.run(), Some(Commands::Merge(merge)) => merge.run(), Some(Commands::Windows { genome, diff --git a/src/merging_iterators.rs b/src/merging_iterators.rs index c298785..7b2d939 100644 --- a/src/merging_iterators.rs +++ b/src/merging_iterators.rs @@ -11,6 +11,63 @@ use crate::{ PositionOffset, }; +// Create a new [`MergingEmptyIterator`], which work over data-less +// "empty" ranges ([`GenomicRangeRecordEmpty`] and merge them based on their +// distance or degree of overlap. +pub struct MergingEmptyIterator +where + I: IntoIterator, +{ + last_range: Option, + inner: ::IntoIter, + minimum_distance: PositionOffset, +} + +impl MergingEmptyIterator +where + I: IntoIterator, +{ + pub fn new(inner: I, minimum_distance: PositionOffset) -> Self { + Self { + last_range: None, + inner: inner.into_iter(), + minimum_distance, + } + } +} + +impl Iterator for MergingEmptyIterator +where + I: IntoIterator, +{ + type Item = GenomicRangeRecordEmpty; + + fn next(&mut self) -> Option { + for next_range in self.inner.by_ref() { + if let Some(last_range) = &mut self.last_range { + let on_same_chrom = last_range.seqname == next_range.seqname; + if on_same_chrom + && last_range.distance_or_overlap(&next_range) <= self.minimum_distance + { + last_range.end = max(last_range.end, next_range.end); + } else { + let return_range = last_range.clone(); + self.last_range = Some(next_range); + return Some(return_range); + } + } else { + self.last_range = Some(next_range); + // If this is the first range, we continue looking for more to potentially merge with. + continue; + } + } + + // If we get here, the inner iterator is exhausted. + // We need to return the last_range if it exists and make sure it's cleared for subsequent calls. + self.last_range.take() + } +} + // Create a new [`MergingEmptyResultIterator`], which work over data-less // "empty" ranges ([`GenomicRangeRecordEmpty`] and merge them based on their // distance or degree of overlap. @@ -310,6 +367,8 @@ mod tests { ranges::{GenomicRangeRecord, GenomicRangeRecordEmpty}, }; + // TODO/TEST we need non-result iterator test. + use super::MergingResultIterator; #[test] diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index a4d4c1d..f62ff0f 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -199,6 +199,16 @@ impl GenomicRangeRecord { data: func(self.data), } } + + /// Consume this [`GenomicRangeRecord`], drop its data, + /// and turn it into a [`GenomicRangeRecordEmpty`]. + pub fn into_empty(self) -> GenomicRangeRecordEmpty { + GenomicRangeRecordEmpty { + seqname: self.seqname, + start: self.start, + end: self.end, + } + } } impl GenericRange for GenomicRangeRecord { @@ -303,7 +313,7 @@ impl AdjustableGenericRange for GenomicRangeRecordEmpty { } impl GenericRangeOperations for GenomicRangeRecordEmpty { - /// Create flanking regions for this [`GenomicRangeEmptyRecord`] range. + /// Create flanking regions for this [`GenomicRangeRecordEmpty`] range. fn flanking_ranges( &self, left_flank: Option,