diff --git a/src/cli/intersect.rs b/src/cli/intersect.rs index af805a3..503c4f2 100644 --- a/src/cli/intersect.rs +++ b/src/cli/intersect.rs @@ -26,6 +26,10 @@ pub struct IntersectParams { /// (only works if both files are sorted) #[clap(short = 'S', long, conflicts_with_all = &["with_query", "with_target", "unique", "inverse"])] pub stream: bool, + + /// Assert the inputs are pre-sorted + #[clap(short, long)] + pub sorted: bool, } #[derive(Parser, Debug)] diff --git a/src/commands/closest.rs b/src/commands/closest.rs index 3f88ab2..1947cc2 100644 --- a/src/commands/closest.rs +++ b/src/commands/closest.rs @@ -2,7 +2,7 @@ use crate::{ cli::{ClosestArgs, ClosestParams}, dispatch_pair, io::write_pairs_iter_with, - types::{InputFormat, IntervalPair, Rename, Renamer, Translater}, + types::{InputFormat, IntervalPair, Rename, Renamer, SplitTranslater}, utils::sort_pairs, }; use anyhow::Result; @@ -38,7 +38,7 @@ impl From for ClosestType { fn run_closest<'a, Ia, Ib, Na, Nb, W>( mut a_set: IntervalContainer, mut b_set: IntervalContainer, - translater: Option<&'a Translater>, + translater: Option<&'a SplitTranslater>, params: ClosestParams, output: W, ) -> Result<()> diff --git a/src/commands/coverage.rs b/src/commands/coverage.rs index bdad70f..2bcfa33 100644 --- a/src/commands/coverage.rs +++ b/src/commands/coverage.rs @@ -8,14 +8,14 @@ use crate::{ cli::{CoverageArgs, CoverageParams}, dispatch_pair, io::write_depth_iter_with, - types::{InputFormat, IntervalDepth, Rename, Renamer, Translater}, + types::{InputFormat, IntervalDepth, Rename, Renamer, SplitTranslater}, utils::sort_pairs, }; fn run_coverage<'a, Ia, Ib, Na, W>( mut set_a: IntervalContainer, mut set_b: IntervalContainer, - translater: Option<&'a Translater>, + translater: Option<&'a SplitTranslater>, params: CoverageParams, writer: W, ) -> Result<()> diff --git a/src/commands/extend.rs b/src/commands/extend.rs index 9bb42c4..86948a2 100644 --- a/src/commands/extend.rs +++ b/src/commands/extend.rs @@ -2,7 +2,7 @@ use crate::{ cli::{ExtendArgs, Growth}, dispatch_single, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{Genome, InputFormat, Translater}, + types::{Genome, InputFormat, SplitTranslater, TranslateGroup}, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntervalContainer}; @@ -28,7 +28,7 @@ where fn extend_set( set: IntervalContainer, - translater: Option, + translater: Option<&SplitTranslater>, growth: Growth, output: W, ) -> Result<()> @@ -38,13 +38,13 @@ where WriteNamedIterImpl: WriteNamedIter, { growth.warn_args(); - let genome = growth.get_genome(translater.as_ref())?; + let genome = growth.get_genome(translater.map(|x| x.get_translater(TranslateGroup::Chr)))?; let extend_iter = set.into_iter().map(|mut iv| { let (left, right) = growth.get_values(&iv); extend_interval(&mut iv, left, right, genome.as_ref()); iv }); - write_records_iter_with(extend_iter, output, translater.as_ref()) + write_records_iter_with(extend_iter, output, translater) } pub fn extend(args: ExtendArgs) -> Result<()> { diff --git a/src/commands/flank.rs b/src/commands/flank.rs index 0edbf5a..cd57742 100644 --- a/src/commands/flank.rs +++ b/src/commands/flank.rs @@ -2,7 +2,7 @@ use crate::{ cli::{FlankArgs, Growth}, dispatch_single, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{Genome, InputFormat, Translater}, + types::{Genome, InputFormat, SplitTranslater, TranslateGroup}, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntervalContainer}; @@ -80,7 +80,7 @@ where /// Flank the intervals in the set fn flank_set( set: IntervalContainer, - translater: Option, + translater: Option<&SplitTranslater>, growth: Growth, output: W, ) -> Result<()> @@ -90,12 +90,12 @@ where WriteNamedIterImpl: WriteNamedIter, { growth.warn_args(); - let genome = growth.get_genome(translater.as_ref())?; + let genome = growth.get_genome(translater.map(|x| x.get_translater(TranslateGroup::Chr)))?; let flank_iter = set.iter().flat_map(|iv| { let (left, right) = growth.get_values(iv); flank_interval(*iv, left, right, genome.as_ref()) }); - write_records_iter_with(flank_iter, output, translater.as_ref()) + write_records_iter_with(flank_iter, output, translater) } pub fn flank(args: FlankArgs) -> Result<()> { @@ -135,7 +135,7 @@ mod testing { #[test] fn test_flank_left_bed6() { - let iv = Bed6::new(1, 100, 400, 1, 2, Strand::default()); + let iv = Bed6::new(1, 100, 400, 1, 2.into(), Strand::default()); let left = left_flank(iv, 50).unwrap(); assert_eq!(left.start(), 50); assert_eq!(left.end(), 100); @@ -146,7 +146,20 @@ mod testing { #[test] fn test_flank_left_bed12() { - let iv = Bed12::new(1, 100, 400, 1, 2, Strand::default(), 3, 4, 5, 6, 7, 8); + let iv = Bed12::new( + 1, + 100, + 400, + 1, + 2.into(), + Strand::default(), + 3, + 4, + 5, + 6, + 7, + 8, + ); let left = left_flank(iv, 50).unwrap(); assert_eq!(left.start(), 50); assert_eq!(left.end(), 100); @@ -187,7 +200,7 @@ mod testing { #[test] fn test_flank_right_bed6() { - let iv = Bed6::new(1, 100, 400, 1, 2, Strand::default()); + let iv = Bed6::new(1, 100, 400, 1, 2.into(), Strand::default()); let right = right_flank(iv, 50, None).unwrap(); assert_eq!(right.start(), 400); assert_eq!(right.end(), 450); @@ -198,7 +211,20 @@ mod testing { #[test] fn test_flank_right_bed12() { - let iv = Bed12::new(1, 100, 400, 1, 2, Strand::default(), 3, 4, 5, 6, 7, 8); + let iv = Bed12::new( + 1, + 100, + 400, + 1, + 2.into(), + Strand::default(), + 3, + 4, + 5, + 6, + 7, + 8, + ); let right = right_flank(iv, 50, None).unwrap(); assert_eq!(right.start(), 400); assert_eq!(right.end(), 450); diff --git a/src/commands/get_fasta.rs b/src/commands/get_fasta.rs index ae26375..a32043f 100644 --- a/src/commands/get_fasta.rs +++ b/src/commands/get_fasta.rs @@ -153,5 +153,6 @@ pub fn get_fasta(args: GetFastaArgs) -> Result<()> { InputFormat::Bed4 => get_fasta_bed4(&mut csv_reader, &mut byterecord, fasta, writer), InputFormat::Bed6 => get_fasta_bed6(&mut csv_reader, &mut byterecord, fasta, writer), InputFormat::Bed12 => get_fasta_bed12(&mut csv_reader, &mut byterecord, fasta, writer), + _ => anyhow::bail!("Unable to process ambiguous input format"), } } diff --git a/src/commands/intersect/run.rs b/src/commands/intersect/run.rs index 650cec0..025502c 100644 --- a/src/commands/intersect/run.rs +++ b/src/commands/intersect/run.rs @@ -6,7 +6,8 @@ use crate::{ build_reader, write_named_records_iter_dashmap, write_records_iter_with, NamedIter, UnnamedIter, WriteNamedIter, WriteNamedIterImpl, }, - types::{InputFormat, NumericBed3, StreamTranslater, Translater}, + types::{InputFormat, NumericBed3, SplitTranslater, StreamTranslater}, + utils::sort_pairs, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntersectIter, IntervalContainer, MergeIter}; @@ -14,9 +15,9 @@ use serde::Serialize; use std::io::Write; pub fn intersect_sets( - set_a: IntervalContainer, - set_b: IntervalContainer, - translater: Option<&Translater>, + mut set_a: IntervalContainer, + mut set_b: IntervalContainer, + translater: Option<&SplitTranslater>, params: IntersectParams, writer: W, ) -> Result<()> @@ -28,6 +29,7 @@ where { let query_method = params.overlap_predicates.into(); let output_method = params.output_predicates.try_into()?; + sort_pairs(&mut set_a, &mut set_b, params.sorted); match output_method { // Output the target intervals OutputMethod::Target => { @@ -93,7 +95,7 @@ fn intersect_stream(args: IntersectArgs) -> Result<()> { let merged_target_iter = MergeIter::new(target_iter); let intersect_iter = IntersectIter::new_with_method(merged_query_iter, merged_target_iter, method); - write_records_iter_with(intersect_iter, writer, None::<&Translater>)?; + write_records_iter_with(intersect_iter, writer, None::<&SplitTranslater>)?; } Ok(()) } diff --git a/src/commands/merge.rs b/src/commands/merge.rs index 52d781b..4af9e40 100644 --- a/src/commands/merge.rs +++ b/src/commands/merge.rs @@ -5,7 +5,7 @@ use crate::{ build_reader, iter_unnamed, write_3col_iter_with, write_records_iter, BedReader, WriteNamedIter, WriteNamedIterImpl, }, - types::{InputFormat, NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translater}, + types::{InputFormat, NumericBed12, NumericBed3, NumericBed4, NumericBed6, SplitTranslater}, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntervalContainer, MergeIter}; @@ -14,7 +14,7 @@ use std::io::Write; fn merge_in_memory( mut set: IntervalContainer, - translater: Option, + translater: Option<&SplitTranslater>, params: MergeParams, writer: W, ) -> Result<()> @@ -29,7 +29,7 @@ where set.set_sorted(); } let merged = set.merge()?; - write_3col_iter_with(merged.into_iter(), writer, translater.as_ref())?; + write_3col_iter_with(merged.into_iter(), writer, translater)?; Ok(()) } @@ -52,7 +52,7 @@ fn merge_streamed_by_format(bed_reader: BedReader, writer: W) -> Resul let input_format = bed_reader.input_format(); let mut csv_reader = build_reader(bed_reader.reader()); match input_format { - InputFormat::Bed3 => { + InputFormat::Bed3 | InputFormat::Ambiguous => { let record_iter: Box> = iter_unnamed(&mut csv_reader); merge_streamed(record_iter, writer) } diff --git a/src/commands/random.rs b/src/commands/random.rs index bc2c190..731bbd0 100644 --- a/src/commands/random.rs +++ b/src/commands/random.rs @@ -4,7 +4,7 @@ use crate::{ types::{Genome, InputFormat, NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translater}, }; use anyhow::Result; -use bedrs::Strand; +use bedrs::{Score, Strand}; use rand::Rng; use std::io::Write; @@ -123,7 +123,7 @@ pub fn random_bed6(args: RandomArgs, writer: W) -> Result<()> { (c, x, y, s) }) // build the interval - .map(|(c, x, y, s)| NumericBed6::new(c, x, y, 0, 0.0, s)); + .map(|(c, x, y, s)| NumericBed6::new(c, x, y, 0, Score::Empty, s)); write_records_iter_with(interval_gen, writer, genome_sizes.translater())?; @@ -175,7 +175,7 @@ pub fn random_bed12(args: RandomArgs, writer: W) -> Result<()> { (c, x, y, t, u, s) }) // build the interval - .map(|(c, x, y, t, u, s)| NumericBed12::new(c, x, y, 0, 0.0, s, t, u, 0, 0, 0, 0)); + .map(|(c, x, y, t, u, s)| NumericBed12::new(c, x, y, 0, Score::Empty, s, t, u, 0, 0, 0, 0)); write_records_iter_with(interval_gen, writer, genome_sizes.translater())?; @@ -189,5 +189,6 @@ pub fn random(args: RandomArgs) -> Result<()> { InputFormat::Bed4 => random_bed4(args, writer), InputFormat::Bed6 => random_bed6(args, writer), InputFormat::Bed12 => random_bed12(args, writer), + _ => anyhow::bail!("Unable to process ambiguous input format"), } } diff --git a/src/commands/sample.rs b/src/commands/sample.rs index 6aadaa0..182311a 100644 --- a/src/commands/sample.rs +++ b/src/commands/sample.rs @@ -1,7 +1,7 @@ use crate::{ cli::{SampleArgs, SampleParams}, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{InputFormat, Translater}, + types::{InputFormat, SplitTranslater}, }; use anyhow::{bail, Result}; use bedrs::{traits::IntervalBounds, IntervalContainer}; @@ -10,7 +10,7 @@ use std::io::Write; fn sample_from_set( set: &mut IntervalContainer, - translater: Option<&Translater>, + translater: Option<&SplitTranslater>, params: SampleParams, writer: W, ) -> Result<()> @@ -71,5 +71,9 @@ pub fn sample(args: SampleArgs) -> Result<()> { let (mut set, translater) = reader.bed12_set()?; sample_from_set(&mut set, translater.as_ref(), args.params, writer) } + InputFormat::Ambiguous => { + let (mut set, translater) = reader.meta_interval_set()?; + sample_from_set(&mut set, translater.as_ref(), args.params, writer) + } } } diff --git a/src/commands/shift.rs b/src/commands/shift.rs index 9d6fffb..174636f 100644 --- a/src/commands/shift.rs +++ b/src/commands/shift.rs @@ -2,7 +2,7 @@ use crate::{ cli::{ShiftArgs, ShiftParams}, dispatch_single, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{Genome, InputFormat, Translater}, + types::{Genome, InputFormat, SplitTranslater, TranslateGroup}, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntervalContainer}; @@ -55,7 +55,7 @@ where fn shift_set( set: IntervalContainer, - translater: Option, + translater: Option<&SplitTranslater>, params: ShiftParams, output: W, ) -> Result<()> @@ -65,11 +65,15 @@ where WriteNamedIterImpl: WriteNamedIter, { params.warn_args(); - let genome = Genome::from_opt_path_immutable_with(params.genome, translater.as_ref(), false)?; + let genome = Genome::from_opt_path_immutable_with( + params.genome, + translater.map(|x| x.get_translater(TranslateGroup::Chr)), + false, + )?; let shift_iter = set .into_iter() .map(|iv| shift_interval(iv, params.amount, params.percent, genome.as_ref())); - write_records_iter_with(shift_iter, output, translater.as_ref()) + write_records_iter_with(shift_iter, output, translater) } pub fn shift(args: ShiftArgs) -> Result<()> { @@ -158,7 +162,7 @@ mod testing { #[test] fn test_shift_bed6() { - let iv = Bed6::new(1, 100, 200, 1, 2, Strand::default()); + let iv = Bed6::new(1, 100, 200, 1, 2.into(), Strand::default()); let si = shift_interval(iv, 50.0, false, None); assert_eq!(si.start(), 150); assert_eq!(si.end(), 250); @@ -169,7 +173,20 @@ mod testing { #[test] fn test_shift_bed12() { - let iv = Bed12::new(1, 100, 400, 1, 2, Strand::default(), 3, 4, 5, 6, 7, 8); + let iv = Bed12::new( + 1, + 100, + 400, + 1, + 2.into(), + Strand::default(), + 3, + 4, + 5, + 6, + 7, + 8, + ); let si = shift_interval(iv, 50.0, false, None); assert_eq!(si.start(), 150); assert_eq!(si.end(), 450); diff --git a/src/commands/sort.rs b/src/commands/sort.rs index 40984dd..90745db 100644 --- a/src/commands/sort.rs +++ b/src/commands/sort.rs @@ -1,8 +1,8 @@ use crate::{ cli::{SortArgs, SortParams}, - dispatch_single, + dispatch_single_owned_tl, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{InputFormat, Reorder, Retranslater, Translater}, + types::{InputFormat, Reorder, SplitRetranslater, SplitTranslater}, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, IntervalContainer}; @@ -11,15 +11,16 @@ use std::io::Write; fn sort_set( set: &mut IntervalContainer, - translater: Option, + translater: Option, parallel: bool, -) -> Option +) -> Option where I: IntervalBounds + Reorder, { let translater = if let Some(translater) = translater { - let retranslater = I::reorder_translater(set, translater); - Some(retranslater) + let (chr_tl, meta_tl) = translater.disband(); + let retranslater = I::reorder_translater(set, chr_tl); + Some(SplitRetranslater::new(retranslater, meta_tl)) } else { None }; @@ -33,7 +34,7 @@ where fn sort_and_write( mut set: IntervalContainer, - translater: Option, + translater: Option, params: SortParams, writer: W, ) -> Result<()> @@ -50,5 +51,5 @@ where pub fn sort(args: SortArgs) -> Result<()> { let reader = args.input.get_reader()?; let writer = args.output.get_writer()?; - dispatch_single!(reader, writer, args.params, sort_and_write) + dispatch_single_owned_tl!(reader, writer, args.params, sort_and_write) } diff --git a/src/commands/subtract.rs b/src/commands/subtract.rs index c031265..2a42bad 100644 --- a/src/commands/subtract.rs +++ b/src/commands/subtract.rs @@ -2,7 +2,7 @@ use crate::{ cli::{SubtractArgs, SubtractParams}, dispatch_pair, io::{write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{InputFormat, Translater}, + types::{InputFormat, SplitTranslater}, utils::sort_pairs, }; use anyhow::Result; @@ -64,7 +64,7 @@ where fn run_subtract( mut aset: IntervalContainer, mut bset: IntervalContainer, - translater: Option<&Translater>, + translater: Option<&SplitTranslater>, params: SubtractParams, writer: W, ) -> Result<()> diff --git a/src/commands/window.rs b/src/commands/window.rs index 90e701c..d87b58c 100644 --- a/src/commands/window.rs +++ b/src/commands/window.rs @@ -7,14 +7,14 @@ use crate::{ cli::{WindowArgs, WindowParams}, dispatch_pair, io::{write_pairs_iter_with, write_records_iter_with, WriteNamedIter, WriteNamedIterImpl}, - types::{InputFormat, IntervalPair, Rename, Renamer, Translater}, + types::{InputFormat, IntervalPair, Rename, Renamer, SplitTranslater}, utils::sort_pairs, }; fn windowed_set_overlaps<'a, Ia, Ib, Na, Nb, W>( mut set_a: IntervalContainer, mut set_b: IntervalContainer, - translater: Option<&'a Translater>, + translater: Option<&'a SplitTranslater>, params: WindowParams, output: W, ) -> Result<()> diff --git a/src/dispatch.rs b/src/dispatch.rs index 7378be4..4a77f49 100644 --- a/src/dispatch.rs +++ b/src/dispatch.rs @@ -2,6 +2,35 @@ /// a writer. #[macro_export] macro_rules! dispatch_single { + ($reader:expr, $writer:expr, $params:expr, $func:expr) => { + match $reader.input_format() { + InputFormat::Bed3 => { + let (set, translater) = $reader.bed3_set()?; + $func(set, translater.as_ref(), $params, $writer) + } + InputFormat::Bed4 => { + let (set, translater) = $reader.bed4_set()?; + $func(set, translater.as_ref(), $params, $writer) + } + InputFormat::Bed6 => { + let (set, translater) = $reader.bed6_set()?; + $func(set, translater.as_ref(), $params, $writer) + } + InputFormat::Bed12 => { + let (set, translater) = $reader.bed12_set()?; + $func(set, translater.as_ref(), $params, $writer) + } + InputFormat::Ambiguous => { + let (set, translater) = $reader.meta_interval_set()?; + $func(set, translater.as_ref(), $params, $writer) + } + } + }; +} +/// This is a macro to match the input format and dispatch to some function with some parameters and +/// a writer. +#[macro_export] +macro_rules! dispatch_single_owned_tl { ($reader:expr, $writer:expr, $params:expr, $func:expr) => { match $reader.input_format() { InputFormat::Bed3 => { @@ -20,6 +49,10 @@ macro_rules! dispatch_single { let (set, translater) = $reader.bed12_set()?; $func(set, translater, $params, $writer) } + InputFormat::Ambiguous => { + let (set, translater) = $reader.meta_interval_set()?; + $func(set, translater, $params, $writer) + } } }; } @@ -29,7 +62,9 @@ macro_rules! dispatch_single { #[macro_export] macro_rules! dispatch_pair { ($reader_a:expr, $reader_b:expr, $writer:expr, $params:expr, $func:expr) => {{ - let mut translater = $reader_a.is_named().then_some(Translater::new()); + let mut translater = $reader_a + .is_named() + .then_some($crate::types::SplitTranslater::new()); $crate::dispatch_to_lhs!($reader_a, $reader_b, translater, $writer, $params, $func) }}; } @@ -55,6 +90,10 @@ macro_rules! dispatch_to_lhs { let set_a = $reader_a.bed12_set_with($translater.as_mut())?; $crate::dispatch_to_rhs!(set_a, $reader_b, $translater, $writer, $params, $func) } + InputFormat::Ambiguous => { + let set_a = $reader_a.meta_interval_set_with($translater.as_mut())?; + $crate::dispatch_to_rhs!(set_a, $reader_b, $translater, $writer, $params, $func) + } } }; } @@ -81,6 +120,10 @@ macro_rules! dispatch_to_rhs { let set_b = $reader_b.bed12_set_with($translater.as_mut())?; $func($set_a, set_b, $translater.as_ref(), $params, $writer) } + InputFormat::Ambiguous => { + let set_b = $reader_b.meta_interval_set_with($translater.as_mut())?; + $func($set_a, set_b, $translater.as_ref(), $params, $writer) + } } }; } diff --git a/src/io/iter.rs b/src/io/iter.rs index 3264e5c..961a251 100644 --- a/src/io/iter.rs +++ b/src/io/iter.rs @@ -77,7 +77,7 @@ impl<'a, 'b, R: Read> Iterator for NamedIter<'a, 'b, R, NumericBed6> { record.start(), record.end(), *name_idx, - *record.score(), + record.score(), record.strand().unwrap_or_default(), ); Some(iv) diff --git a/src/io/read/bed12.rs b/src/io/read/bed12.rs index c1799eb..748883b 100644 --- a/src/io/read/bed12.rs +++ b/src/io/read/bed12.rs @@ -1,11 +1,14 @@ use super::build_reader; -use crate::types::{Bed12Set, NamedBed12, NumericBed12, Translater}; +use crate::types::{Bed12Set, NamedBed12, NumericBed12, SplitTranslater, TranslateGroup}; use anyhow::{bail, Result}; use bedrs::{Coordinates, IntervalContainer}; use csv::ByteRecord; use std::io::Read; -pub fn read_bed12_set(reader: R, named: bool) -> Result<(Bed12Set, Option)> { +pub fn read_bed12_set( + reader: R, + named: bool, +) -> Result<(Bed12Set, Option)> { if named { let (set, translater) = read_bed12_set_named(reader)?; Ok((set, Some(translater))) @@ -17,7 +20,7 @@ pub fn read_bed12_set(reader: R, named: bool) -> Result<(Bed12Set, Opti pub fn read_bed12_set_with( reader: R, - translater: Option<&mut Translater>, + translater: Option<&mut SplitTranslater>, ) -> Result { if let Some(translater) = translater { convert_bed12_set(reader, translater) @@ -43,9 +46,9 @@ fn read_bed12_set_unnamed(reader: R) -> Result { Ok(set) } -/// Reads a single file into a GenomicIntervalSet and a Translater -fn read_bed12_set_named(reader: R) -> Result<(Bed12Set, Translater)> { - let mut translater = Translater::new(); +/// Reads a single file into a GenomicIntervalSet and a SplitTranslater +fn read_bed12_set_named(reader: R) -> Result<(Bed12Set, SplitTranslater)> { + let mut translater = SplitTranslater::new(); let set = convert_bed12_set(reader, &mut translater)?; Ok((set, translater)) } @@ -55,29 +58,39 @@ fn read_bed12_set_named(reader: R) -> Result<(Bed12Set, Translater)> { /// It uses an externally initialized name map and index map to keep track of /// chromosome names and indices. This is useful for reading multiple files /// and keeping track of the same chromosome names and indices. -fn convert_bed12_set(reader: R, translater: &mut Translater) -> Result { +fn convert_bed12_set(reader: R, translater: &mut SplitTranslater) -> Result { let mut reader = build_reader(reader); let mut raw_record = ByteRecord::new(); let mut set = Bed12Set::empty(); while reader.read_byte_record(&mut raw_record)? { let record: NamedBed12 = raw_record.deserialize(None)?; - translater.add_name(record.chr()); - translater.add_name(record.name()); - translater.add_name(record.item_rgb()); - translater.add_name(record.block_sizes()); - translater.add_name(record.block_starts()); + translater.add_name(record.chr(), TranslateGroup::Chr); + translater.add_name(record.name(), TranslateGroup::Meta); + translater.add_name(record.item_rgb(), TranslateGroup::Meta); + translater.add_name(record.block_sizes(), TranslateGroup::Meta); + translater.add_name(record.block_starts(), TranslateGroup::Meta); - let chr_int = translater.get_idx(record.chr()).unwrap(); - let name_int = translater.get_idx(record.name()).unwrap(); - let item_rgb_int = translater.get_idx(record.item_rgb()).unwrap(); - let block_sizes_int = translater.get_idx(record.block_sizes()).unwrap(); - let block_starts_int = translater.get_idx(record.block_starts()).unwrap(); + let chr_int = translater + .get_idx(record.chr(), TranslateGroup::Chr) + .unwrap(); + let name_int = translater + .get_idx(record.name(), TranslateGroup::Meta) + .unwrap(); + let item_rgb_int = translater + .get_idx(record.item_rgb(), TranslateGroup::Meta) + .unwrap(); + let block_sizes_int = translater + .get_idx(record.block_sizes(), TranslateGroup::Meta) + .unwrap(); + let block_starts_int = translater + .get_idx(record.block_starts(), TranslateGroup::Meta) + .unwrap(); let interval = NumericBed12::new( chr_int, record.start(), record.end(), name_int, - *record.score(), + record.score(), record.strand().unwrap_or_default(), record.thick_start(), record.thick_end(), diff --git a/src/io/read/bed3.rs b/src/io/read/bed3.rs index da1b26f..8bb2035 100644 --- a/src/io/read/bed3.rs +++ b/src/io/read/bed3.rs @@ -1,14 +1,17 @@ use super::build_reader; use crate::{ io::NamedInterval, - types::{Bed3Set, NumericBed3, Translater}, + types::{Bed3Set, NumericBed3, SplitTranslater, TranslateGroup}, }; use anyhow::{bail, Result}; use bedrs::IntervalContainer; use csv::ByteRecord; use std::io::Read; -pub fn read_bed3_set(reader: R, named: bool) -> Result<(Bed3Set, Option)> { +pub fn read_bed3_set( + reader: R, + named: bool, +) -> Result<(Bed3Set, Option)> { if named { let (set, idx_map) = read_bed3_set_named(reader)?; Ok((set, Some(idx_map))) @@ -20,7 +23,7 @@ pub fn read_bed3_set(reader: R, named: bool) -> Result<(Bed3Set, Option pub fn read_bed3_set_with( reader: R, - translater: Option<&mut Translater>, + translater: Option<&mut SplitTranslater>, ) -> Result { if let Some(translater) = translater { convert_bed3_set(reader, translater) @@ -46,9 +49,9 @@ fn read_bed3_set_unnamed(reader: R) -> Result { Ok(set) } -/// Reads a single file into a GenomicIntervalSet and a Translater -fn read_bed3_set_named(reader: R) -> Result<(Bed3Set, Translater)> { - let mut translater = Translater::new(); +/// Reads a single file into a GenomicIntervalSet and a SplitTranslater +fn read_bed3_set_named(reader: R) -> Result<(Bed3Set, SplitTranslater)> { + let mut translater = SplitTranslater::new(); let set = convert_bed3_set(reader, &mut translater)?; Ok((set, translater)) } @@ -58,14 +61,16 @@ fn read_bed3_set_named(reader: R) -> Result<(Bed3Set, Translater)> { /// It uses an externally initialized name map and index map to keep track of /// chromosome names and indices. This is useful for reading multiple files /// and keeping track of the same chromosome names and indices. -fn convert_bed3_set(reader: R, translater: &mut Translater) -> Result { +fn convert_bed3_set(reader: R, translater: &mut SplitTranslater) -> Result { let mut reader = build_reader(reader); let mut raw_record = ByteRecord::new(); let mut set = IntervalContainer::empty(); while reader.read_byte_record(&mut raw_record)? { let record: NamedInterval = raw_record.deserialize(None)?; - translater.add_name(record.name); - let chr_int = translater.get_idx(record.name).unwrap(); + translater.add_name(record.name, TranslateGroup::Chr); + let chr_int = translater + .get_idx(record.name, TranslateGroup::Chr) + .unwrap(); let interval = NumericBed3::new(chr_int, record.start, record.end); set.insert(interval); } diff --git a/src/io/read/bed4.rs b/src/io/read/bed4.rs index 9bcc98c..319f3e1 100644 --- a/src/io/read/bed4.rs +++ b/src/io/read/bed4.rs @@ -1,11 +1,14 @@ use super::build_reader; -use crate::types::{Bed4Set, NamedBed4, NumericBed4, Translater}; +use crate::types::{Bed4Set, NamedBed4, NumericBed4, SplitTranslater, TranslateGroup}; use anyhow::{bail, Result}; use bedrs::Coordinates; use csv::ByteRecord; use std::io::Read; -pub fn read_bed4_set(reader: R, named: bool) -> Result<(Bed4Set, Option)> { +pub fn read_bed4_set( + reader: R, + named: bool, +) -> Result<(Bed4Set, Option)> { if named { let (set, translater) = read_bed4_set_named(reader)?; Ok((set, Some(translater))) @@ -17,7 +20,7 @@ pub fn read_bed4_set(reader: R, named: bool) -> Result<(Bed4Set, Option pub fn read_bed4_set_with( reader: R, - translater: Option<&mut Translater>, + translater: Option<&mut SplitTranslater>, ) -> Result { if let Some(translater) = translater { convert_bed4_set(reader, translater) @@ -43,9 +46,9 @@ fn read_bed4_set_unnamed(reader: R) -> Result { Ok(set) } -/// Reads a single file into a GenomicIntervalSet and a Translater -fn read_bed4_set_named(reader: R) -> Result<(Bed4Set, Translater)> { - let mut translater = Translater::new(); +/// Reads a single file into a GenomicIntervalSet and a SplitTranslater +fn read_bed4_set_named(reader: R) -> Result<(Bed4Set, SplitTranslater)> { + let mut translater = SplitTranslater::new(); let set = convert_bed4_set(reader, &mut translater)?; Ok((set, translater)) } @@ -55,16 +58,20 @@ fn read_bed4_set_named(reader: R) -> Result<(Bed4Set, Translater)> { /// It uses an externally initialized name map and index map to keep track of /// chromosome names and indices. This is useful for reading multiple files /// and keeping track of the same chromosome names and indices. -fn convert_bed4_set(reader: R, translater: &mut Translater) -> Result { +fn convert_bed4_set(reader: R, translater: &mut SplitTranslater) -> Result { let mut reader = build_reader(reader); let mut raw_record = ByteRecord::new(); let mut set = Bed4Set::empty(); while reader.read_byte_record(&mut raw_record)? { let record: NamedBed4 = raw_record.deserialize(None)?; - translater.add_name(record.chr()); - translater.add_name(record.name()); - let chr_int = translater.get_idx(record.chr()).unwrap(); - let name_int = translater.get_idx(record.name()).unwrap(); + translater.add_name(record.chr(), TranslateGroup::Chr); + translater.add_name(record.name(), TranslateGroup::Meta); + let chr_int = translater + .get_idx(record.chr(), TranslateGroup::Chr) + .unwrap(); + let name_int = translater + .get_idx(record.name(), TranslateGroup::Meta) + .unwrap(); let interval = NumericBed4::new(chr_int, record.start(), record.end(), name_int); set.insert(interval); } diff --git a/src/io/read/bed6.rs b/src/io/read/bed6.rs index 04d6a2f..5f30ffc 100644 --- a/src/io/read/bed6.rs +++ b/src/io/read/bed6.rs @@ -1,11 +1,14 @@ use super::build_reader; -use crate::types::{Bed6Set, NamedBed6, NumericBed6, Translater}; +use crate::types::{Bed6Set, NamedBed6, NumericBed6, SplitTranslater, TranslateGroup}; use anyhow::{bail, Result}; use bedrs::Coordinates; use csv::ByteRecord; use std::io::Read; -pub fn read_bed6_set(reader: R, named: bool) -> Result<(Bed6Set, Option)> { +pub fn read_bed6_set( + reader: R, + named: bool, +) -> Result<(Bed6Set, Option)> { if named { let (set, translater) = read_bed6_set_named(reader)?; Ok((set, Some(translater))) @@ -17,7 +20,7 @@ pub fn read_bed6_set(reader: R, named: bool) -> Result<(Bed6Set, Option pub fn read_bed6_set_with( reader: R, - translater: Option<&mut Translater>, + translater: Option<&mut SplitTranslater>, ) -> Result { if let Some(translater) = translater { convert_bed6_set(reader, translater) @@ -43,9 +46,9 @@ fn read_bed6_set_unnamed(reader: R) -> Result { Ok(set) } -/// Reads a single file into a GenomicIntervalSet and a Translater -fn read_bed6_set_named(reader: R) -> Result<(Bed6Set, Translater)> { - let mut translater = Translater::new(); +/// Reads a single file into a GenomicIntervalSet and a SplitTranslater +fn read_bed6_set_named(reader: R) -> Result<(Bed6Set, SplitTranslater)> { + let mut translater = SplitTranslater::new(); let set = convert_bed6_set(reader, &mut translater)?; Ok((set, translater)) } @@ -55,22 +58,26 @@ fn read_bed6_set_named(reader: R) -> Result<(Bed6Set, Translater)> { /// It uses an externally initialized name map and index map to keep track of /// chromosome names and indices. This is useful for reading multiple files /// and keeping track of the same chromosome names and indices. -fn convert_bed6_set(reader: R, translater: &mut Translater) -> Result { +fn convert_bed6_set(reader: R, translater: &mut SplitTranslater) -> Result { let mut reader = build_reader(reader); let mut raw_record = ByteRecord::new(); let mut set = Bed6Set::empty(); while reader.read_byte_record(&mut raw_record)? { let record: NamedBed6 = raw_record.deserialize(None)?; - translater.add_name(record.chr()); - translater.add_name(record.name()); - let chr_int = translater.get_idx(record.chr()).unwrap(); - let name_int = translater.get_idx(record.name()).unwrap(); + translater.add_name(record.chr(), TranslateGroup::Chr); + translater.add_name(record.name(), TranslateGroup::Meta); + let chr_int = translater + .get_idx(record.chr(), TranslateGroup::Chr) + .unwrap(); + let name_int = translater + .get_idx(record.name(), TranslateGroup::Meta) + .unwrap(); let interval = NumericBed6::new( chr_int, record.start(), record.end(), name_int, - *record.score(), + record.score(), record.strand().unwrap_or_default(), ); set.insert(interval); diff --git a/src/io/read/bed_reader.rs b/src/io/read/bed_reader.rs index 10e7e81..ca2f1d5 100644 --- a/src/io/read/bed_reader.rs +++ b/src/io/read/bed_reader.rs @@ -1,8 +1,11 @@ use super::{ read_bed12_set, read_bed12_set_with, read_bed3_set, read_bed3_set_with, read_bed4_set, - read_bed4_set_with, read_bed6_set, read_bed6_set_with, + read_bed4_set_with, read_bed6_set, read_bed6_set_with, read_meta_interval_set, + read_meta_interval_set_with, +}; +use crate::types::{ + Bed12Set, Bed3Set, Bed4Set, Bed6Set, FieldFormat, InputFormat, MetaIntervalSet, SplitTranslater, }; -use crate::types::{Bed12Set, Bed3Set, Bed4Set, Bed6Set, FieldFormat, InputFormat, Translater}; use anyhow::Result; use flate2::read::MultiGzDecoder; use gzp::BgzfSyncReader; @@ -99,47 +102,60 @@ impl BedReader { )) } - /// Returns a Bed3Set from the reader with an Option - pub fn bed3_set(self) -> Result<(Bed3Set, Option)> { + /// Returns a Bed3Set from the reader with an Option + pub fn bed3_set(self) -> Result<(Bed3Set, Option)> { let is_named = self.is_named(); read_bed3_set(self.reader(), is_named) } - /// Returns a Bed4Set from the reader with an Option - pub fn bed4_set(self) -> Result<(Bed4Set, Option)> { + /// Returns a Bed4Set from the reader with an Option + pub fn bed4_set(self) -> Result<(Bed4Set, Option)> { let is_named = self.is_named(); read_bed4_set(self.reader(), is_named) } - /// Returns a Bed6Set from the reader with an Option - pub fn bed6_set(self) -> Result<(Bed6Set, Option)> { + /// Returns a Bed6Set from the reader with an Option + pub fn bed6_set(self) -> Result<(Bed6Set, Option)> { let is_named = self.is_named(); read_bed6_set(self.reader(), is_named) } - /// Returns a Bed6Set from the reader with an Option - pub fn bed12_set(self) -> Result<(Bed12Set, Option)> { + /// Returns a Bed6Set from the reader with an Option + pub fn bed12_set(self) -> Result<(Bed12Set, Option)> { let is_named = self.is_named(); read_bed12_set(self.reader(), is_named) } + /// Returns a MetaIntervalSet from the reader with an Option + pub fn meta_interval_set(self) -> Result<(MetaIntervalSet, Option)> { + read_meta_interval_set(self.reader(), true) // meta intervals are always named + } + /// Returns a Bed3Set from the reader - pub fn bed3_set_with(self, translater: Option<&mut Translater>) -> Result { + pub fn bed3_set_with(self, translater: Option<&mut SplitTranslater>) -> Result { read_bed3_set_with(self.reader(), translater) } /// Returns a Bed4Set from the reader - pub fn bed4_set_with(self, translater: Option<&mut Translater>) -> Result { + pub fn bed4_set_with(self, translater: Option<&mut SplitTranslater>) -> Result { read_bed4_set_with(self.reader(), translater) } /// Returns a Bed6Set from the reader - pub fn bed6_set_with(self, translater: Option<&mut Translater>) -> Result { + pub fn bed6_set_with(self, translater: Option<&mut SplitTranslater>) -> Result { read_bed6_set_with(self.reader(), translater) } /// Returns a Bed6Set from the reader - pub fn bed12_set_with(self, translater: Option<&mut Translater>) -> Result { + pub fn bed12_set_with(self, translater: Option<&mut SplitTranslater>) -> Result { read_bed12_set_with(self.reader(), translater) } + + /// Returns a MetaIntervalSet from the reader + pub fn meta_interval_set_with( + self, + translater: Option<&mut SplitTranslater>, + ) -> Result { + read_meta_interval_set_with(self.reader(), translater) + } } diff --git a/src/io/read/meta_interval.rs b/src/io/read/meta_interval.rs new file mode 100644 index 0000000..81abbce --- /dev/null +++ b/src/io/read/meta_interval.rs @@ -0,0 +1,107 @@ +use super::build_reader; +use crate::types::{MetaIntervalSet, NumericMetaInterval, SplitTranslater, TranslateGroup}; +use anyhow::{bail, Result}; +use csv::ByteRecord; +use std::{io::Read, str::from_utf8}; + +pub fn read_meta_interval_set( + reader: R, + named: bool, +) -> Result<(MetaIntervalSet, Option)> { + if named { + let (set, translater) = read_meta_interval_set_named(reader)?; + Ok((set, Some(translater))) + } else { + let set = read_meta_interval_set_unnamed(reader)?; + Ok((set, None)) + } +} + +pub fn read_meta_interval_set_with( + reader: R, + translater: Option<&mut SplitTranslater>, +) -> Result { + if let Some(translater) = translater { + convert_meta_interval_set(reader, translater) + } else { + read_meta_interval_set_unnamed(reader) + } +} + +fn read_meta_interval_set_unnamed(reader: R) -> Result { + let mut reader = build_reader(reader); + let set = reader + .deserialize() + .map(|record| { + let record: NumericMetaInterval= match record { + Ok(record) => record, + Err(e) => { + bail!("Could not build bed record:\n\nIf your BED has non-integer chromosome names try rerunning with the `-N` flag:\n\nERROR: {}", e) + } + }; + Ok(record) + }) + .collect::>()?; + Ok(set) +} + +/// Reads a single file into a GenomicIntervalSet and a SplitTranslater +fn read_meta_interval_set_named(reader: R) -> Result<(MetaIntervalSet, SplitTranslater)> { + let mut translater = SplitTranslater::new(); + let set = convert_meta_interval_set(reader, &mut translater)?; + Ok((set, translater)) +} + +/// Convert a CSV reader into a GenomicIntervalSet +/// +/// It uses an externally initialized name map and index map to keep track of +/// chromosome names and indices. This is useful for reading multiple files +/// and keeping track of the same chromosome names and indices. +fn convert_meta_interval_set( + reader: R, + translater: &mut SplitTranslater, +) -> Result { + let mut reader = build_reader(reader); + let mut raw_record = ByteRecord::new(); + let mut set = MetaIntervalSet::empty(); + let mut buffer = String::new(); + while reader.read_byte_record(&mut raw_record)? { + // Iterate over the fields of the record + let mut record_iter = raw_record.iter(); + + // Parse the chromosome + let chr = record_iter.next().map(from_utf8).unwrap()?; + + // Parse the start and end + let start = record_iter + .next() + .map(from_utf8) + .unwrap()? + .parse::()?; + let end = record_iter + .next() + .map(from_utf8) + .unwrap()? + .parse::()?; + + // Parse the metadata into a single long string + buffer.clear(); + let first_meta = record_iter.next().unwrap(); + buffer.push_str(from_utf8(first_meta)?); + for field in record_iter { + buffer.push('\t'); + buffer.push_str(from_utf8(field)?); + } + + // Add the chromosome and metadata to the translater + translater.add_name(chr, TranslateGroup::Chr); + translater.add_name(&buffer, TranslateGroup::Meta); + let chr_int = translater.get_idx(chr, TranslateGroup::Chr).unwrap(); + let name_int = translater.get_idx(&buffer, TranslateGroup::Meta).unwrap(); + + // Create the interval and add it to the set + let interval = NumericMetaInterval::new(chr_int, start, end, name_int); + set.insert(interval); + } + Ok(set) +} diff --git a/src/io/read/mod.rs b/src/io/read/mod.rs index 65c1f52..a18fd3b 100644 --- a/src/io/read/mod.rs +++ b/src/io/read/mod.rs @@ -4,6 +4,7 @@ pub mod bed4; pub mod bed6; pub mod bed_reader; pub mod iter; +pub mod meta_interval; pub mod utils; pub use bed12::{read_bed12_set, read_bed12_set_with}; pub use bed3::{read_bed3_set, read_bed3_set_with}; @@ -11,4 +12,5 @@ pub use bed4::{read_bed4_set, read_bed4_set_with}; pub use bed6::{read_bed6_set, read_bed6_set_with}; pub use bed_reader::BedReader; pub use iter::iter_unnamed; +pub use meta_interval::{read_meta_interval_set, read_meta_interval_set_with}; pub use utils::build_reader; diff --git a/src/io/write/iter.rs b/src/io/write/iter.rs index 587174f..66979a8 100644 --- a/src/io/write/iter.rs +++ b/src/io/write/iter.rs @@ -1,5 +1,7 @@ use super::build_writer; -use crate::types::{NumericBed12, NumericBed3, NumericBed4, NumericBed6, Translate}; +use crate::types::{ + NumericBed12, NumericBed3, NumericBed4, NumericBed6, NumericMetaInterval, Translate, +}; use anyhow::Result; use bedrs::Coordinates; use serde::Serialize; @@ -103,7 +105,7 @@ where ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); let named_interval = (chr, interval.start(), interval.end()); wtr.serialize(named_interval)?; } @@ -120,7 +122,7 @@ impl WriteNamedIter for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); let named_interval = (chr, interval.start(), interval.end()); wtr.serialize(named_interval)?; } @@ -136,7 +138,7 @@ impl<'a> WriteNamedIter<&'a NumericBed3> for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); let named_interval = (chr, interval.start(), interval.end()); wtr.serialize(named_interval)?; } @@ -152,8 +154,8 @@ impl WriteNamedIter for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); let named_interval = (chr, interval.start(), interval.end(), name); wtr.serialize(named_interval)?; } @@ -169,8 +171,8 @@ impl<'a> WriteNamedIter<&'a NumericBed4> for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); let named_interval = (chr, interval.start(), interval.end(), name); wtr.serialize(named_interval)?; } @@ -186,8 +188,8 @@ impl WriteNamedIter for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); let named_interval = ( chr, interval.start(), @@ -210,8 +212,8 @@ impl<'a> WriteNamedIter<&'a NumericBed6> for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); let named_interval = ( chr, interval.start(), @@ -234,12 +236,12 @@ impl WriteNamedIter for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); - let item_rgb = translater.get_name(*interval.item_rgb()).unwrap(); - let block_count = translater.get_name(interval.block_count()).unwrap(); - let block_sizes = translater.get_name(*interval.block_sizes()).unwrap(); - let block_starts = translater.get_name(*interval.block_starts()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); + let item_rgb = translater.get_meta_name(*interval.item_rgb()).unwrap(); + let block_count = translater.get_meta_name(interval.block_count()).unwrap(); + let block_sizes = translater.get_meta_name(*interval.block_sizes()).unwrap(); + let block_starts = translater.get_meta_name(*interval.block_starts()).unwrap(); let named_interval = ( chr, interval.start(), @@ -268,12 +270,12 @@ impl<'a> WriteNamedIter<&'a NumericBed12> for WriteNamedIterImpl { ) -> Result<()> { let mut wtr = build_writer(writer); for interval in iterator { - let chr = translater.get_name(*interval.chr()).unwrap(); - let name = translater.get_name(*interval.name()).unwrap(); - let item_rgb = translater.get_name(*interval.item_rgb()).unwrap(); - let block_count = translater.get_name(interval.block_count()).unwrap(); - let block_sizes = translater.get_name(*interval.block_sizes()).unwrap(); - let block_starts = translater.get_name(*interval.block_starts()).unwrap(); + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.name()).unwrap(); + let item_rgb = translater.get_meta_name(*interval.item_rgb()).unwrap(); + let block_count = translater.get_meta_name(interval.block_count()).unwrap(); + let block_sizes = translater.get_meta_name(*interval.block_sizes()).unwrap(); + let block_starts = translater.get_meta_name(*interval.block_starts()).unwrap(); let named_interval = ( chr, interval.start(), @@ -294,3 +296,37 @@ impl<'a> WriteNamedIter<&'a NumericBed12> for WriteNamedIterImpl { Ok(()) } } +impl WriteNamedIter for WriteNamedIterImpl { + fn write_named_iter, Tr: Translate>( + writer: W, + iterator: It, + translater: &Tr, + ) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.meta()).unwrap(); + let named_interval = (chr, interval.start(), interval.end(), name); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } +} +impl<'a> WriteNamedIter<&'a NumericMetaInterval> for WriteNamedIterImpl { + fn write_named_iter, Tr: Translate>( + writer: W, + iterator: It, + translater: &Tr, + ) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let chr = translater.get_chr_name(*interval.chr()).unwrap(); + let name = translater.get_meta_name(*interval.meta()).unwrap(); + let named_interval = (chr, interval.start(), interval.end(), name); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } +} diff --git a/src/io/write/utils.rs b/src/io/write/utils.rs index ad876fd..b85b4dd 100644 --- a/src/io/write/utils.rs +++ b/src/io/write/utils.rs @@ -1,8 +1,9 @@ use crate::types::{ - IntervalDepth, IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater, + IntervalDepth, IntervalPair, NumericBed3, Rename, Renamer, SplitTranslater, StreamTranslater, }; use anyhow::Result; use bedrs::{traits::IntervalBounds, Coordinates}; +use csv::QuoteStyle; use serde::Serialize; use std::io::Write; @@ -10,6 +11,7 @@ pub fn build_writer(writer: W) -> csv::Writer { csv::WriterBuilder::new() .delimiter(b'\t') .has_headers(false) + .quote_style(QuoteStyle::Never) .from_writer(writer) } @@ -30,7 +32,7 @@ where pub fn write_depth_iter_with<'a, W, I, N, It>( records: It, writer: W, - translater: Option<&Translater>, + translater: Option<&SplitTranslater>, ) -> Result<()> where I: IntervalBounds + Serialize, @@ -81,7 +83,7 @@ where pub fn write_pairs_iter_with<'a, W, Ia, Ib, Na, Nb, It>( records: It, writer: W, - translater: Option<&Translater>, + translater: Option<&SplitTranslater>, ) -> Result<()> where Ia: IntervalBounds + Serialize, diff --git a/src/types/depth.rs b/src/types/depth.rs index e8b75d7..bacc1a7 100644 --- a/src/types/depth.rs +++ b/src/types/depth.rs @@ -1,7 +1,4 @@ -use super::{ - translater::{Rename, Renamer}, - Translater, -}; +use super::{Rename, Renamer, SplitTranslater}; use bedrs::traits::IntervalBounds; pub struct IntervalDepth<'a, I, N> @@ -11,7 +8,7 @@ where { pub iv: I, pub n_overlaps: usize, - pub translater: Option<&'a Translater>, + pub translater: Option<&'a SplitTranslater>, phantom: std::marker::PhantomData, } impl<'a, I, N> IntervalDepth<'a, I, N> @@ -20,7 +17,7 @@ where N: IntervalBounds<&'a str, usize>, Renamer: Rename<'a, I, N>, { - pub fn new(iv: I, n_overlaps: usize, translater: Option<&'a Translater>) -> Self { + pub fn new(iv: I, n_overlaps: usize, translater: Option<&'a SplitTranslater>) -> Self { Self { iv, n_overlaps, @@ -36,7 +33,7 @@ where let n = Renamer::rename_with(&self.iv, translater); (n, self.n_overlaps) } else { - panic!("Translater was not provided but get_named_tuple was called - there is a bug somewhere!") + panic!("SplitTranslater was not provided but get_named_tuple was called - there is a bug somewhere!") } } } diff --git a/src/types/formats/genome.rs b/src/types/formats/genome.rs index 422fbc7..37b0d74 100644 --- a/src/types/formats/genome.rs +++ b/src/types/formats/genome.rs @@ -170,9 +170,9 @@ mod testing { assert_eq!(genome.chr_size_unchecked(2), 3000); assert!(genome.translater().is_some()); let translater = genome.translater().unwrap(); - assert_eq!(translater.get_name(0).unwrap(), "chr1"); - assert_eq!(translater.get_name(1).unwrap(), "chr2"); - assert_eq!(translater.get_name(2).unwrap(), "chr3"); + assert_eq!(translater.get_chr_name(0).unwrap(), "chr1"); + assert_eq!(translater.get_chr_name(1).unwrap(), "chr2"); + assert_eq!(translater.get_chr_name(2).unwrap(), "chr3"); } #[test] diff --git a/src/types/formats/in_formats.rs b/src/types/formats/in_formats.rs index a87dcc2..62f0b53 100644 --- a/src/types/formats/in_formats.rs +++ b/src/types/formats/in_formats.rs @@ -14,6 +14,7 @@ pub enum InputFormat { Bed4, Bed6, Bed12, + Ambiguous, } impl InputFormat { pub fn predict(bufreader: &BufReader) -> Result { @@ -28,14 +29,12 @@ impl InputFormat { }; let num_fields = first.split(|b| *b == b'\t').count(); match num_fields { + 1..=2 => bail!("Too few fields in line: {}", from_utf8(first)?), 3 => Ok(InputFormat::Bed3), 4 => Ok(InputFormat::Bed4), 6 => Ok(InputFormat::Bed6), 12 => Ok(InputFormat::Bed12), - _ => bail!( - "Cannot predict input format from line: {}", - std::str::from_utf8(first)? - ), + _ => Ok(InputFormat::Ambiguous), } } } @@ -79,6 +78,17 @@ impl FieldFormat { Ok(FieldFormat::IntegerBased) } } + InputFormat::Ambiguous => { + let all_int = fields + .iter() + .filter_map(|f| from_utf8(f).ok()) + .all(|f| f.parse::().is_ok()); + if all_int { + Ok(FieldFormat::IntegerBased) + } else { + Ok(FieldFormat::StringBased) + } + } } } } @@ -117,6 +127,15 @@ mod testing { assert_eq!(input_format, InputFormat::Bed3); } + #[test] + fn input_format_bed4() { + let line = b"chr1\t1\t2\tname"; + let mut buffer = BufReader::new(line.as_slice()); + buffer.fill_buf().unwrap(); + let input_format = InputFormat::predict(&buffer).unwrap(); + assert_eq!(input_format, InputFormat::Bed4); + } + #[test] fn input_format_bed6() { let line = b"chr1\t1\t2\tname\t0\t+"; @@ -131,8 +150,8 @@ mod testing { let line = b"chr1\t1\t2\tname\t0\t+\textra"; let mut buffer = BufReader::new(line.as_slice()); buffer.fill_buf().unwrap(); - let input_format = InputFormat::predict(&buffer); - assert!(input_format.is_err()); + let input_format = InputFormat::predict(&buffer).unwrap(); + assert_eq!(input_format, InputFormat::Ambiguous); } #[test] diff --git a/src/types/mod.rs b/src/types/mod.rs index 4fc68d8..f51d62d 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,13 +1,14 @@ mod depth; mod formats; mod pairs; -mod translater; -use bedrs::{Bed12, Bed3, Bed4, Bed6, IntervalContainer}; +mod translate; +use bedrs::{Bed12, Bed3, Bed4, Bed6, IntervalContainer, MetaInterval}; pub use depth::IntervalDepth; pub use formats::{FieldFormat, Genome, InputFormat}; pub use pairs::IntervalPair; -pub use translater::{ - Rename, Renamer, Reorder, Retranslater, StreamTranslater, Translate, Translater, +pub use translate::{ + Rename, Renamer, Reorder, SplitRetranslater, SplitTranslater, StreamTranslater, Translate, + TranslateGroup, Translater, }; pub type NumericBed3 = Bed3; @@ -18,11 +19,14 @@ pub type NumericBed4 = Bed4; pub type NamedBed4<'a> = Bed4<&'a str, usize, &'a str>; pub type Bed4Set = IntervalContainer; -pub type NumericBed6 = Bed6; -pub type NamedBed6<'a> = Bed6<&'a str, usize, &'a str, f64>; +pub type NumericBed6 = Bed6; +pub type NamedBed6<'a> = Bed6<&'a str, usize, &'a str>; pub type Bed6Set = IntervalContainer; -pub type NumericBed12 = Bed12; -pub type NamedBed12<'a> = - Bed12<&'a str, usize, &'a str, f64, usize, usize, &'a str, &'a str, &'a str>; +pub type NumericBed12 = Bed12; +pub type NamedBed12<'a> = Bed12<&'a str, usize, &'a str, usize, usize, &'a str, &'a str, &'a str>; pub type Bed12Set = IntervalContainer; + +pub type NumericMetaInterval = MetaInterval; +pub type NamedMetaInterval<'a> = MetaInterval<&'a str, usize, &'a str>; +pub type MetaIntervalSet = IntervalContainer; diff --git a/src/types/pairs.rs b/src/types/pairs.rs index ce12cdb..2312286 100644 --- a/src/types/pairs.rs +++ b/src/types/pairs.rs @@ -1,7 +1,4 @@ -use super::{ - translater::{Rename, Renamer}, - Translater, -}; +use super::{Rename, Renamer, SplitTranslater}; use bedrs::traits::IntervalBounds; pub struct IntervalPair<'a, Ia, Ib, Na, Nb> @@ -14,7 +11,7 @@ where { pub iv_a: Ia, pub iv_b: Ib, - pub translater: Option<&'a Translater>, + pub translater: Option<&'a SplitTranslater>, phantom_a: std::marker::PhantomData, phantom_b: std::marker::PhantomData, } @@ -26,7 +23,7 @@ where Nb: IntervalBounds<&'a str, usize>, Renamer: Rename<'a, Ia, Na> + Rename<'a, Ib, Nb>, { - pub fn new(iv_a: Ia, iv_b: Ib, translater: Option<&'a Translater>) -> Self { + pub fn new(iv_a: Ia, iv_b: Ib, translater: Option<&'a SplitTranslater>) -> Self { Self { iv_a, iv_b, @@ -44,7 +41,7 @@ where let named_b = Renamer::rename_with(&self.iv_b, translater); (named_a, named_b) } else { - panic!("Translater was not provided but get_named_tuple was called - there is a bug somewhere!") + panic!("SplitTranslater was not provided but get_named_tuple was called - there is a bug somewhere!") } } } diff --git a/src/types/translate/mod.rs b/src/types/translate/mod.rs new file mode 100644 index 0000000..eff0fdb --- /dev/null +++ b/src/types/translate/mod.rs @@ -0,0 +1,25 @@ +mod rename; +mod reorder; +mod retranslater; +mod split_retranslater; +mod split_translater; +mod stream_translater; +mod translater; +pub use rename::{Rename, Renamer}; +pub use reorder::Reorder; +pub use retranslater::Retranslater; +pub use split_retranslater::SplitRetranslater; +pub use split_translater::SplitTranslater; +pub use stream_translater::StreamTranslater; +pub use translater::Translater; + +pub trait Translate { + fn get_chr_name(&self, idx: usize) -> Option<&str>; + fn get_meta_name(&self, idx: usize) -> Option<&str>; +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum TranslateGroup { + Chr, + Meta, +} diff --git a/src/types/translate/rename.rs b/src/types/translate/rename.rs new file mode 100644 index 0000000..ec0fc1c --- /dev/null +++ b/src/types/translate/rename.rs @@ -0,0 +1,75 @@ +use super::{SplitTranslater, Translate}; +use crate::types::{ + NamedBed12, NamedBed3, NamedBed4, NamedBed6, NamedMetaInterval, NumericBed12, NumericBed3, + NumericBed4, NumericBed6, NumericMetaInterval, +}; +use bedrs::{traits::IntervalBounds, Coordinates}; + +pub struct Renamer; +pub trait Rename<'a, Ia, Ib> +where + Ia: IntervalBounds, + Ib: IntervalBounds<&'a str, usize>, +{ + fn rename_with(iv: &Ia, translater: &'a SplitTranslater) -> Ib; +} +impl<'a> Rename<'a, NumericBed3, NamedBed3<'a>> for Renamer { + fn rename_with(iv: &NumericBed3, translater: &'a SplitTranslater) -> NamedBed3<'a> { + let chr = translater.get_chr_name(*iv.chr()).unwrap(); + NamedBed3::new(chr, iv.start(), iv.end()) + } +} +impl<'a> Rename<'a, NumericBed4, NamedBed4<'a>> for Renamer { + fn rename_with(iv: &NumericBed4, translater: &'a SplitTranslater) -> NamedBed4<'a> { + let chr = translater.get_chr_name(*iv.chr()).unwrap(); + let name = translater.get_meta_name(*iv.name()).unwrap(); + NamedBed4::new(chr, iv.start(), iv.end(), name) + } +} +impl<'a> Rename<'a, NumericBed6, NamedBed6<'a>> for Renamer { + fn rename_with(iv: &NumericBed6, translater: &'a SplitTranslater) -> NamedBed6<'a> { + let chr = translater.get_chr_name(*iv.chr()).unwrap(); + let name = translater.get_meta_name(*iv.name()).unwrap(); + NamedBed6::new( + chr, + iv.start(), + iv.end(), + name, + iv.score(), + iv.strand().unwrap_or_default(), + ) + } +} +impl<'a> Rename<'a, NumericBed12, NamedBed12<'a>> for Renamer { + fn rename_with(iv: &NumericBed12, translater: &'a SplitTranslater) -> NamedBed12<'a> { + let chr = translater.get_chr_name(*iv.chr()).unwrap(); + let name = translater.get_meta_name(*iv.name()).unwrap(); + let item_rgb = translater.get_meta_name(*iv.item_rgb()).unwrap(); + let block_sizes = translater.get_meta_name(*iv.block_sizes()).unwrap(); + let block_starts = translater.get_meta_name(*iv.block_starts()).unwrap(); + NamedBed12::new( + chr, + iv.start(), + iv.end(), + name, + iv.score(), + iv.strand().unwrap_or_default(), + iv.thick_start(), + iv.thick_end(), + item_rgb, + iv.block_count(), + block_sizes, + block_starts, + ) + } +} +impl<'a> Rename<'a, NumericMetaInterval, NamedMetaInterval<'a>> for Renamer { + fn rename_with( + iv: &NumericMetaInterval, + translater: &'a SplitTranslater, + ) -> NamedMetaInterval<'a> { + let chr = translater.get_chr_name(*iv.chr()).unwrap(); + let meta = translater.get_chr_name(*iv.meta()).unwrap(); + NamedMetaInterval::new(chr, iv.start(), iv.end(), meta) + } +} diff --git a/src/types/translate/reorder.rs b/src/types/translate/reorder.rs new file mode 100644 index 0000000..c85252d --- /dev/null +++ b/src/types/translate/reorder.rs @@ -0,0 +1,78 @@ +use super::{Retranslater, Translater}; +use crate::types::{NumericBed12, NumericBed3, NumericBed4, NumericBed6, NumericMetaInterval}; +use bedrs::{traits::IntervalBounds, Coordinates, IntervalContainer}; + +pub trait Reorder +where + C: IntervalBounds, +{ + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater; +} +impl Reorder for NumericBed3 { + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + iv.update_chr(&new_chr); + }); + retranslate + } +} +impl Reorder for NumericBed4 { + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + iv.update_chr(&new_chr); + }); + retranslate + } +} +impl Reorder for NumericBed6 { + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + iv.update_chr(&new_chr); + }); + retranslate + } +} +impl Reorder for NumericBed12 { + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + iv.update_chr(&new_chr); + }); + retranslate + } +} +impl Reorder for NumericMetaInterval { + fn reorder_translater( + set: &mut IntervalContainer, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + iv.update_chr(&new_chr); + }); + retranslate + } +} diff --git a/src/types/translate/retranslater.rs b/src/types/translate/retranslater.rs new file mode 100644 index 0000000..907bcf9 --- /dev/null +++ b/src/types/translate/retranslater.rs @@ -0,0 +1,28 @@ +use super::Translate; +use hashbrown::HashMap; + +#[derive(Debug)] +pub struct Retranslater { + pub idx_to_rank: HashMap, + pub rank_to_name: HashMap, +} +impl Retranslater { + pub fn new(idx_to_rank: HashMap, rank_to_name: HashMap) -> Self { + Self { + idx_to_rank, + rank_to_name, + } + } + + pub fn get_rank(&self, idx: usize) -> Option { + self.idx_to_rank.get(&idx).copied() + } +} +impl Translate for Retranslater { + fn get_chr_name(&self, rank: usize) -> Option<&str> { + self.rank_to_name.get(&rank).map(|s| s.as_str()) + } + fn get_meta_name(&self, rank: usize) -> Option<&str> { + self.get_chr_name(rank) + } +} diff --git a/src/types/translate/split_retranslater.rs b/src/types/translate/split_retranslater.rs new file mode 100644 index 0000000..a88d6c3 --- /dev/null +++ b/src/types/translate/split_retranslater.rs @@ -0,0 +1,19 @@ +use super::{Retranslater, Translate, Translater}; + +pub struct SplitRetranslater { + chr_tl: Retranslater, + meta_tl: Translater, +} +impl SplitRetranslater { + pub fn new(chr_tl: Retranslater, meta_tl: Translater) -> Self { + Self { chr_tl, meta_tl } + } +} +impl Translate for SplitRetranslater { + fn get_chr_name(&self, idx: usize) -> Option<&str> { + self.chr_tl.get_chr_name(idx) + } + fn get_meta_name(&self, idx: usize) -> Option<&str> { + self.meta_tl.get_meta_name(idx) + } +} diff --git a/src/types/translate/split_translater.rs b/src/types/translate/split_translater.rs new file mode 100644 index 0000000..30870c3 --- /dev/null +++ b/src/types/translate/split_translater.rs @@ -0,0 +1,43 @@ +use super::{Translate, TranslateGroup, Translater}; + +pub struct SplitTranslater { + chr_tl: Translater, + meta_tl: Translater, +} +impl SplitTranslater { + pub fn new() -> Self { + Self { + chr_tl: Translater::new(), + meta_tl: Translater::new(), + } + } + pub fn add_name(&mut self, name: &str, group: TranslateGroup) { + match group { + TranslateGroup::Chr => self.chr_tl.add_name(name), + TranslateGroup::Meta => self.meta_tl.add_name(name), + } + } + pub fn get_idx(&self, name: &str, group: TranslateGroup) -> Option { + match group { + TranslateGroup::Chr => self.chr_tl.get_idx(name), + TranslateGroup::Meta => self.meta_tl.get_idx(name), + } + } + pub fn get_translater(&self, group: TranslateGroup) -> &Translater { + match group { + TranslateGroup::Chr => &self.chr_tl, + TranslateGroup::Meta => &self.meta_tl, + } + } + pub fn disband(self) -> (Translater, Translater) { + (self.chr_tl, self.meta_tl) + } +} +impl Translate for SplitTranslater { + fn get_chr_name(&self, idx: usize) -> Option<&str> { + self.chr_tl.get_chr_name(idx) + } + fn get_meta_name(&self, idx: usize) -> Option<&str> { + self.meta_tl.get_meta_name(idx) + } +} diff --git a/src/types/translate/stream_translater.rs b/src/types/translate/stream_translater.rs new file mode 100644 index 0000000..1ca9ae9 --- /dev/null +++ b/src/types/translate/stream_translater.rs @@ -0,0 +1,30 @@ +use dashmap::DashMap; + +pub struct StreamTranslater { + name_to_idx: DashMap, + idx_to_name: DashMap, +} +impl StreamTranslater { + pub fn new() -> Self { + Self { + name_to_idx: DashMap::new(), + idx_to_name: DashMap::new(), + } + } + pub fn has_name(&self, name: &str) -> bool { + self.name_to_idx.contains_key(name) + } + pub fn add_name(&self, name: &str) { + if !self.has_name(name) { + let idx = self.name_to_idx.len(); + self.name_to_idx.insert(name.to_string(), idx); + self.idx_to_name.insert(idx, name.to_string()); + } + } + pub fn get_name_to_idx(&self) -> &DashMap { + &self.name_to_idx + } + pub fn get_idx_to_name(&self) -> &DashMap { + &self.idx_to_name + } +} diff --git a/src/types/translate/translater.rs b/src/types/translate/translater.rs new file mode 100644 index 0000000..75f99e8 --- /dev/null +++ b/src/types/translate/translater.rs @@ -0,0 +1,57 @@ +use super::{Retranslater, Translate}; +use hashbrown::HashMap; +use human_sort::compare; + +pub struct Translater { + name_to_idx: HashMap, + idx_to_name: HashMap, +} +impl Translater { + pub fn new() -> Self { + Self { + name_to_idx: HashMap::new(), + idx_to_name: HashMap::new(), + } + } + pub fn has_name(&self, name: &str) -> bool { + self.name_to_idx.contains_key(name) + } + pub fn add_name(&mut self, name: &str) { + if !self.has_name(name) { + let idx = self.name_to_idx.len(); + self.name_to_idx.insert(name.to_string(), idx); + self.idx_to_name.insert(idx, name.to_string()); + } + } + #[allow(dead_code)] + pub fn get_idx(&self, name: &str) -> Option { + self.name_to_idx.get(name).copied() + } + #[allow(dead_code)] + pub fn get_name_to_idx(&self) -> &HashMap { + &self.name_to_idx + } + pub fn lex_sort(self) -> Retranslater { + let mut idx_to_rank = HashMap::with_capacity(self.idx_to_name.len()); + let mut rank_to_name = HashMap::with_capacity(self.idx_to_name.len()); + let mut ordering = self + .idx_to_name + .iter() + .map(|(idx, name)| (name, idx)) + .collect::>(); + ordering.sort_by(|a, b| compare(a.0, b.0)); + for (order, (name, idx)) in ordering.into_iter().enumerate() { + rank_to_name.insert(order, name.to_string()); + idx_to_rank.insert(*idx, order); + } + Retranslater::new(idx_to_rank, rank_to_name) + } +} +impl Translate for Translater { + fn get_chr_name(&self, idx: usize) -> Option<&str> { + self.idx_to_name.get(&idx).map(|s| s.as_str()) + } + fn get_meta_name(&self, idx: usize) -> Option<&str> { + self.get_chr_name(idx) + } +} diff --git a/src/types/translater.rs b/src/types/translater.rs deleted file mode 100644 index d5dd9dd..0000000 --- a/src/types/translater.rs +++ /dev/null @@ -1,248 +0,0 @@ -use super::{ - NamedBed12, NamedBed3, NamedBed4, NamedBed6, NumericBed12, NumericBed3, NumericBed4, - NumericBed6, -}; -use bedrs::{traits::IntervalBounds, Coordinates, IntervalContainer}; -use dashmap::DashMap; -use hashbrown::HashMap; -use human_sort::compare; - -pub trait Translate { - fn get_name(&self, idx: usize) -> Option<&str>; -} - -pub struct Translater { - name_to_idx: HashMap, - idx_to_name: HashMap, -} -impl Translater { - pub fn new() -> Self { - Self { - name_to_idx: HashMap::new(), - idx_to_name: HashMap::new(), - } - } - pub fn has_name(&self, name: &str) -> bool { - self.name_to_idx.contains_key(name) - } - pub fn add_name(&mut self, name: &str) { - if !self.has_name(name) { - let idx = self.name_to_idx.len(); - self.name_to_idx.insert(name.to_string(), idx); - self.idx_to_name.insert(idx, name.to_string()); - } - } - #[allow(dead_code)] - pub fn get_idx(&self, name: &str) -> Option { - self.name_to_idx.get(name).copied() - } - #[allow(dead_code)] - pub fn get_name_to_idx(&self) -> &HashMap { - &self.name_to_idx - } - pub fn lex_sort(self) -> Retranslater { - let mut idx_to_rank = HashMap::with_capacity(self.idx_to_name.len()); - let mut rank_to_name = HashMap::with_capacity(self.idx_to_name.len()); - let mut ordering = self - .idx_to_name - .iter() - .map(|(idx, name)| (name, idx)) - .collect::>(); - ordering.sort_by(|a, b| compare(a.0, b.0)); - for (order, (name, idx)) in ordering.into_iter().enumerate() { - rank_to_name.insert(order, name.to_string()); - idx_to_rank.insert(*idx, order); - } - Retranslater::new(idx_to_rank, rank_to_name) - } -} -impl Translate for Translater { - fn get_name(&self, idx: usize) -> Option<&str> { - self.idx_to_name.get(&idx).map(|s| s.as_str()) - } -} - -#[derive(Debug)] -pub struct Retranslater { - idx_to_rank: HashMap, - rank_to_name: HashMap, -} -impl Retranslater { - pub fn new(idx_to_rank: HashMap, rank_to_name: HashMap) -> Self { - Self { - idx_to_rank, - rank_to_name, - } - } - - pub fn get_rank(&self, idx: usize) -> Option { - self.idx_to_rank.get(&idx).copied() - } -} -impl Translate for Retranslater { - fn get_name(&self, rank: usize) -> Option<&str> { - self.rank_to_name.get(&rank).map(|s| s.as_str()) - } -} - -pub struct StreamTranslater { - name_to_idx: DashMap, - idx_to_name: DashMap, -} -impl StreamTranslater { - pub fn new() -> Self { - Self { - name_to_idx: DashMap::new(), - idx_to_name: DashMap::new(), - } - } - pub fn has_name(&self, name: &str) -> bool { - self.name_to_idx.contains_key(name) - } - pub fn add_name(&self, name: &str) { - if !self.has_name(name) { - let idx = self.name_to_idx.len(); - self.name_to_idx.insert(name.to_string(), idx); - self.idx_to_name.insert(idx, name.to_string()); - } - } - pub fn get_name_to_idx(&self) -> &DashMap { - &self.name_to_idx - } - pub fn get_idx_to_name(&self) -> &DashMap { - &self.idx_to_name - } -} - -pub trait Reorder -where - C: IntervalBounds, -{ - fn reorder_translater( - set: &mut IntervalContainer, - translater: Translater, - ) -> Retranslater; -} -impl Reorder for NumericBed3 { - fn reorder_translater( - set: &mut IntervalContainer, - translater: Translater, - ) -> Retranslater { - let retranslate = translater.lex_sort(); - set.apply_mut(|iv| { - let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); - iv.update_chr(&new_chr); - }); - retranslate - } -} -impl Reorder for NumericBed4 { - fn reorder_translater( - set: &mut IntervalContainer, - translater: Translater, - ) -> Retranslater { - let retranslate = translater.lex_sort(); - set.apply_mut(|iv| { - let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); - let new_name = retranslate.get_rank(*iv.name()).unwrap(); - iv.update_chr(&new_chr); - iv.update_name(&new_name); - }); - retranslate - } -} -impl Reorder for NumericBed6 { - fn reorder_translater( - set: &mut IntervalContainer, - translater: Translater, - ) -> Retranslater { - let retranslate = translater.lex_sort(); - set.apply_mut(|iv| { - let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); - let new_name = retranslate.get_rank(*iv.name()).unwrap(); - iv.update_chr(&new_chr); - iv.update_name(&new_name); - }); - retranslate - } -} -impl Reorder for NumericBed12 { - fn reorder_translater( - set: &mut IntervalContainer, - translater: Translater, - ) -> Retranslater { - let retranslate = translater.lex_sort(); - set.apply_mut(|iv| { - let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); - let new_name = retranslate.get_rank(*iv.name()).unwrap(); - let new_item_rgb = retranslate.get_rank(*iv.item_rgb()).unwrap(); - let new_block_sizes = retranslate.get_rank(*iv.block_sizes()).unwrap(); - let new_block_starts = retranslate.get_rank(*iv.block_starts()).unwrap(); - iv.update_chr(&new_chr); - iv.update_name(&new_name); - iv.update_item_rgb(&new_item_rgb); - iv.update_block_sizes(&new_block_sizes); - iv.update_block_starts(&new_block_starts); - }); - retranslate - } -} - -pub struct Renamer; -pub trait Rename<'a, Ia, Ib> -where - Ia: IntervalBounds, - Ib: IntervalBounds<&'a str, usize>, -{ - fn rename_with(iv: &Ia, translater: &'a Translater) -> Ib; -} -impl<'a> Rename<'a, NumericBed3, NamedBed3<'a>> for Renamer { - fn rename_with(iv: &NumericBed3, translater: &'a Translater) -> NamedBed3<'a> { - let chr = translater.get_name(*iv.chr()).unwrap(); - NamedBed3::new(chr, iv.start(), iv.end()) - } -} -impl<'a> Rename<'a, NumericBed4, NamedBed4<'a>> for Renamer { - fn rename_with(iv: &NumericBed4, translater: &'a Translater) -> NamedBed4<'a> { - let chr = translater.get_name(*iv.chr()).unwrap(); - let name = translater.get_name(*iv.name()).unwrap(); - NamedBed4::new(chr, iv.start(), iv.end(), name) - } -} -impl<'a> Rename<'a, NumericBed6, NamedBed6<'a>> for Renamer { - fn rename_with(iv: &NumericBed6, translater: &'a Translater) -> NamedBed6<'a> { - let chr = translater.get_name(*iv.chr()).unwrap(); - let name = translater.get_name(*iv.name()).unwrap(); - NamedBed6::new( - chr, - iv.start(), - iv.end(), - name, - *iv.score(), - iv.strand().unwrap_or_default(), - ) - } -} -impl<'a> Rename<'a, NumericBed12, NamedBed12<'a>> for Renamer { - fn rename_with(iv: &NumericBed12, translater: &'a Translater) -> NamedBed12<'a> { - let chr = translater.get_name(*iv.chr()).unwrap(); - let name = translater.get_name(*iv.name()).unwrap(); - let item_rgb = translater.get_name(*iv.item_rgb()).unwrap(); - let block_sizes = translater.get_name(*iv.block_sizes()).unwrap(); - let block_starts = translater.get_name(*iv.block_starts()).unwrap(); - NamedBed12::new( - chr, - iv.start(), - iv.end(), - name, - *iv.score(), - iv.strand().unwrap_or_default(), - iv.thick_start(), - iv.thick_end(), - item_rgb, - iv.block_count(), - block_sizes, - block_starts, - ) - } -} diff --git a/tests/subtract.rs b/tests/subtract.rs index 0499120..40cc7a1 100644 --- a/tests/subtract.rs +++ b/tests/subtract.rs @@ -23,7 +23,7 @@ mod testing { .iter() .map(|(chr, start, end, name, score, strand)| { format!( - "{}\t{}\t{}\t{}\t{:.1}\t{}\n", + "{}\t{}\t{}\t{}\t{:.3}\t{}\n", chr, start, end, name, score, strand ) }) @@ -52,7 +52,7 @@ mod testing { block_starts, )| { format!( - "{}\t{}\t{}\t{}\t{:.1}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n", + "{}\t{}\t{}\t{}\t{:.3}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n", chr, start, end, @@ -114,12 +114,12 @@ mod testing { .output()?; let expected = vec![ - (1, 100, 120, 0, 0.0, '+'), - (1, 125, 150, 0, 0.0, '+'), - (1, 160, 300, 0, 0.0, '+'), - (1, 400, 460, 0, 0.0, '+'), - (1, 470, 475, 0, 0.0, '+'), - (1, 500, 550, 0, 0.0, '+'), + (1, 100, 120, 0, '.', '+'), + (1, 125, 150, 0, '.', '+'), + (1, 160, 300, 0, '.', '+'), + (1, 400, 460, 0, '.', '+'), + (1, 470, 475, 0, '.', '+'), + (1, 500, 550, 0, '.', '+'), ]; let expected_str = build_expected_str_bed6(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); @@ -141,12 +141,12 @@ mod testing { .output()?; let expected = vec![ - (1, 100, 120, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 125, 150, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 160, 300, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 400, 460, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 470, 475, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 500, 550, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 100, 120, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 125, 150, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 160, 300, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 400, 460, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 470, 475, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 500, 550, 0, ".", '+', 0, 0, 0, 0, 0, 0), ]; let expected_str = build_expected_str_bed12(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); @@ -198,13 +198,13 @@ mod testing { .output()?; let expected = vec![ - (1, 100, 120, 0, 0.0, '+'), - (1, 125, 150, 0, 0.0, '+'), - (1, 160, 200, 0, 0.0, '+'), - (1, 200, 300, 0, 0.0, '+'), - (1, 400, 460, 0, 0.0, '+'), - (1, 470, 475, 0, 0.0, '+'), - (1, 500, 550, 0, 0.0, '+'), + (1, 100, 120, 0, ".", '+'), + (1, 125, 150, 0, ".", '+'), + (1, 160, 200, 0, ".", '+'), + (1, 200, 300, 0, "0.0", '+'), + (1, 400, 460, 0, ".", '+'), + (1, 470, 475, 0, ".", '+'), + (1, 500, 550, 0, "0.0", '+'), ]; let expected_str = build_expected_str_bed6(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); @@ -227,15 +227,19 @@ mod testing { .output()?; let expected = vec![ - (1, 100, 120, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 125, 150, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 160, 200, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 200, 300, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 400, 460, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 470, 475, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), - (1, 500, 550, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 100, 120, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 125, 150, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 160, 200, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 200, 300, 0, "0.0", '+', 0, 0, 0, 0, 0, 0), + (1, 400, 460, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 470, 475, 0, ".", '+', 0, 0, 0, 0, 0, 0), + (1, 500, 550, 0, "0.0", '+', 0, 0, 0, 0, 0, 0), ]; let expected_str = build_expected_str_bed12(&expected); + + println!("{}", std::str::from_utf8(&output.stdout).unwrap()); + println!("{}", expected_str); + assert_eq!(output.stdout, expected_str.as_bytes()); Ok(()) }