From d1cbebb54264207463869f03f37999e628979f61 Mon Sep 17 00:00:00 2001 From: noam teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Mon, 20 Nov 2023 14:04:08 -0800 Subject: [PATCH 1/5] feat: added support for bed12 file format --- src/commands/closest.rs | 54 ++++- src/commands/extend.rs | 49 ++++- src/commands/get_fasta.rs | 51 ++++- src/commands/intersect/intersect.rs | 41 +++- src/commands/merge.rs | 30 ++- src/commands/random.rs | 71 ++++++- src/commands/sample.rs | 17 +- src/commands/sort.rs | 32 ++- src/commands/subtract/subtract.rs | 43 +++- src/io/mod.rs | 4 +- src/io/read/bed12.rs | 110 +++++++++++ src/io/read/mod.rs | 2 + src/io/write/iter.rs | 70 ++++++- src/types/formats/bed12.rs | 293 ++++++++++++++++++++++++++++ src/types/formats/formats.rs | 11 ++ src/types/formats/mod.rs | 2 + src/types/mod.rs | 5 +- src/types/translater.rs | 23 ++- 18 files changed, 880 insertions(+), 28 deletions(-) create mode 100644 src/io/read/bed12.rs create mode 100644 src/types/formats/bed12.rs diff --git a/src/commands/closest.rs b/src/commands/closest.rs index bbae0e9..1a140da 100644 --- a/src/commands/closest.rs +++ b/src/commands/closest.rs @@ -1,7 +1,7 @@ use crate::{ io::{ - match_input, match_output, read_paired_bed3_sets, read_paired_bed6_sets, - write_pairs_iter_with, + match_input, match_output, read_paired_bed12_sets, read_paired_bed3_sets, + read_paired_bed6_sets, write_pairs_iter_with, }, types::{InputFormat, IntervalPair}, utils::sort_pairs, @@ -112,6 +112,34 @@ pub fn closest_bed6( Ok(()) } +pub fn closest_bed12( + a: Option, + b: String, + output: Option, + upstream: bool, + downstream: bool, + named: bool, + sorted: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + // load pairs + let query_handle = match_input(a)?; + let target_handle = match_input(Some(b))?; + let (mut a_set, mut b_set, translater) = + read_paired_bed12_sets(query_handle, target_handle, named)?; + sort_pairs(&mut a_set, &mut b_set, sorted); + + // run closest + let method = ClosestType::new(upstream, downstream); + let pairs_iter = run_closest(&a_set, &b_set, method); + + // write output + let output_handle = match_output(output, compression_threads, compression_level)?; + write_pairs_iter_with(pairs_iter, output_handle, translater.as_ref())?; + Ok(()) +} + pub fn closest( a: Option, b: String, @@ -124,8 +152,19 @@ pub fn closest( compression_threads: usize, compression_level: u32, ) -> Result<()> { - if format == InputFormat::Bed3 { - closest_bed3( + match format { + InputFormat::Bed3 => closest_bed3( + a, + b, + output, + upstream, + downstream, + named, + sorted, + compression_threads, + compression_level, + ), + InputFormat::Bed6 => closest_bed6( a, b, output, @@ -135,9 +174,8 @@ pub fn closest( sorted, compression_threads, compression_level, - ) - } else { - closest_bed6( + ), + InputFormat::Bed12 => closest_bed12( a, b, output, @@ -147,7 +185,7 @@ pub fn closest( sorted, compression_threads, compression_level, - ) + ), } } diff --git a/src/commands/extend.rs b/src/commands/extend.rs index 3f8692a..52c703e 100644 --- a/src/commands/extend.rs +++ b/src/commands/extend.rs @@ -1,7 +1,7 @@ use crate::{ io::{ - match_input, match_output, read_bed3_set, read_bed6_set, write_records_iter_with, - WriteNamedIter, WriteNamedIterImpl, + match_input, match_output, read_bed12_set, read_bed3_set, read_bed6_set, + write_records_iter_with, WriteNamedIter, WriteNamedIterImpl, }, types::{Genome, InputFormat, Translater}, }; @@ -135,6 +135,40 @@ fn extend_bed6( Ok(()) } +fn extend_bed12( + input: Option, + output: Option, + both: Option, + left: Option, + right: Option, + genome_path: Option, + named: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let input_handle = match_input(input)?; + let (mut iset, translater) = read_bed12_set(input_handle, named)?; + let genome = if let Some(path) = genome_path { + let genome_handle = match_input(Some(path))?; + let genome = Genome::from_reader_immutable(genome_handle, translater.as_ref(), false)?; + Some(genome) + } else { + None + }; + extend_set( + output, + &mut iset, + both, + left, + right, + genome, + translater.as_ref(), + compression_threads, + compression_level, + )?; + Ok(()) +} + pub fn extend( input: Option, output: Option, @@ -170,5 +204,16 @@ pub fn extend( compression_threads, compression_level, ), + InputFormat::Bed12 => extend_bed12( + input, + output, + both, + left, + right, + genome_path, + named, + compression_threads, + compression_level, + ), } } diff --git a/src/commands/get_fasta.rs b/src/commands/get_fasta.rs index 787ac4f..be5943b 100644 --- a/src/commands/get_fasta.rs +++ b/src/commands/get_fasta.rs @@ -1,6 +1,6 @@ use crate::{ io::{build_reader, match_input, match_output}, - types::{Bed6, InputFormat}, + types::{Bed12, Bed6, InputFormat}, }; use anyhow::Result; use bedrs::{Coordinates, NamedInterval}; @@ -85,6 +85,52 @@ fn get_fasta_bed6( Ok(()) } +fn get_fasta_bed12( + bed: Option, + fasta: &str, + output: Option, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let handle = match_input(bed)?; + let fasta_index = build_fasta_index(fasta)?; + let fasta = IndexedFasta::new(fasta_index, fasta)?; + + let mut csv_reader = build_reader(handle); + let mut byterecord = ByteRecord::new(); + let mut output = match_output(output, compression_threads, compression_level)?; + + while csv_reader.read_byte_record(&mut byterecord)? { + let record: Bed12 = byterecord.deserialize(None)?; + match fasta.query_buffer(record.chr, record.start, record.end) { + Ok(buffer) => { + write!( + output, + ">{}:{}-{}::{}::{}::{}::{}::{}::{}::{}::{}::{}\n", + record.chr, + record.start, + record.end, + record.name, + record.score, + record.strand, + record.thick_start, + record.thick_end, + record.item_rgb, + record.block_count, + record.block_sizes, + record.block_starts, + )?; + for subseq in buffer.split_str("\n") { + output.write(subseq)?; + } + output.write(b"\n")?; + } + Err(_) => continue, + } + } + Ok(()) +} + pub fn get_fasta( bed: Option, fasta: &str, @@ -100,5 +146,8 @@ pub fn get_fasta( InputFormat::Bed6 => { get_fasta_bed6(bed, fasta, output, compression_threads, compression_level) } + InputFormat::Bed12 => { + get_fasta_bed12(bed, fasta, output, compression_threads, compression_level) + } } } diff --git a/src/commands/intersect/intersect.rs b/src/commands/intersect/intersect.rs index b59cc4e..9325a5e 100644 --- a/src/commands/intersect/intersect.rs +++ b/src/commands/intersect/intersect.rs @@ -2,9 +2,9 @@ use super::iter::{run_function, OutputMethod}; use crate::{ commands::{run_find, OverlapMethod}, io::{ - build_reader, match_input, match_output, read_paired_bed3_sets, read_paired_bed6_sets, - write_named_records_iter_dashmap, write_records_iter_with, NamedIter, UnnamedIter, - WriteNamedIter, WriteNamedIterImpl, + build_reader, match_input, match_output, read_paired_bed12_sets, read_paired_bed3_sets, + read_paired_bed6_sets, write_named_records_iter_dashmap, write_records_iter_with, + NamedIter, UnnamedIter, WriteNamedIter, WriteNamedIterImpl, }, types::{InputFormat, StreamTranslater, Translater}, }; @@ -90,6 +90,31 @@ fn intersect_bed6( ) } +fn intersect_bed12( + a: Option, + b: String, + output: Option, + overlap_method: OverlapMethod, + output_method: OutputMethod, + named: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let handle_a = match_input(a)?; + let handle_b = match_input(Some(b))?; + let (query_set, target_set, translater) = read_paired_bed12_sets(handle_a, handle_b, named)?; + run_intersect_set( + &query_set, + &target_set, + overlap_method, + output_method, + output, + translater.as_ref(), + compression_threads, + compression_level, + ) +} + pub fn intersect_set( a: Option, b: String, @@ -131,6 +156,16 @@ pub fn intersect_set( compression_threads, compression_level, ), + InputFormat::Bed12 => intersect_bed12( + a, + b, + output, + overlap_method, + output_method, + named, + compression_threads, + compression_level, + ), } } diff --git a/src/commands/merge.rs b/src/commands/merge.rs index 34cee76..a96eba9 100644 --- a/src/commands/merge.rs +++ b/src/commands/merge.rs @@ -2,8 +2,8 @@ use std::io::{Read, Write}; use crate::{ io::{ - build_reader, iter_unnamed, match_input, match_output, read_bed3_set, read_bed6_set, - write_records_iter, write_records_iter_with, + build_reader, iter_unnamed, match_input, match_output, read_bed12_set, read_bed3_set, + read_bed6_set, write_records_iter, write_records_iter_with, }, types::InputFormat, }; @@ -60,6 +60,31 @@ where Ok(()) } +fn merge_in_memory_bed12( + input_handle: R, + output_handle: W, + sorted: bool, + named: bool, +) -> Result<()> +where + R: Read, + W: Write, +{ + let (mut set, translater) = read_bed12_set(input_handle, named)?; + if !sorted { + set.sort(); + } else { + set.set_sorted(); + } + let merged = set.merge()?; + write_records_iter_with( + merged.records().into_iter(), + output_handle, + translater.as_ref(), + )?; + Ok(()) +} + fn merge_streamed(input_handle: R, output_handle: W) -> Result<()> where R: Read, @@ -91,6 +116,7 @@ pub fn merge( match format { InputFormat::Bed3 => merge_in_memory_bed3(input_handle, output_handle, sorted, named), InputFormat::Bed6 => merge_in_memory_bed6(input_handle, output_handle, sorted, named), + InputFormat::Bed12 => merge_in_memory_bed12(input_handle, output_handle, sorted, named), } } } diff --git a/src/commands/random.rs b/src/commands/random.rs index 33ba0cc..a7a61a2 100644 --- a/src/commands/random.rs +++ b/src/commands/random.rs @@ -1,6 +1,6 @@ use crate::{ io::{match_input, match_output, write_records_iter_with}, - types::{Genome, InputFormat, NumericBed6, Translater}, + types::{Genome, InputFormat, NumericBed12, NumericBed6, Translater}, utils::build_rng, }; use anyhow::Result; @@ -110,6 +110,63 @@ pub fn random_bed6( Ok(()) } +pub fn random_bed12( + n_intervals: usize, + l_intervals: usize, + n_chr: usize, + max_chr_len: usize, + seed: Option, + output: Option, + genome: Option, + named: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let mut rng_intervals = build_rng(seed); + let mut rng_chr = build_rng(seed); + let mut rng_strand = build_rng(seed); + let mut rng_thick_start = build_rng(seed); + let mut rng_thick_end = build_rng(seed); + let mut translater = Translater::new(); + let genome_sizes = build_chr_size(n_chr, max_chr_len, genome, named, &mut translater)?; + + let interval_gen = (0..n_intervals) + // draw a random chromosome + .map(|_| genome_sizes.sample_chr(&mut rng_chr)) + // draw a random end position in the chromosome + .map(|c| { + let y = rng_intervals.gen_range(l_intervals..=genome_sizes.chr_size_unchecked(c)); + (c, y) + }) + // draw a random strand + .map(|(c, y)| { + let s: Strand = rng_strand.gen(); + (c, y, s) + }) + // calculate the start position + .map(|(c, y, s)| { + let x = y - l_intervals; + (c, x, y, s) + }) + // draw a random thick start + .map(|(c, x, y, s)| { + let t = rng_thick_start.gen_range(x..=y); + (c, x, y, t, s) + }) + // draw a random thick end + .map(|(c, x, y, t, s)| { + let u = rng_thick_end.gen_range(t..=y); + (c, x, y, t, u, s) + }) + // build the interval + .map(|(c, x, y, t, u, s)| NumericBed12::new(c, x, y, 0, 0.0, s, t, u, 0, 0, 0, 0)); + + let output_handle = match_output(output, compression_threads, compression_level)?; + write_records_iter_with(interval_gen, output_handle, genome_sizes.translater())?; + + Ok(()) +} + pub fn random( n_intervals: usize, l_intervals: usize, @@ -148,5 +205,17 @@ pub fn random( compression_threads, compression_level, ), + InputFormat::Bed12 => random_bed12( + n_intervals, + l_intervals, + n_chr, + max_chr_len, + seed, + output, + genome, + named, + compression_threads, + compression_level, + ), } } diff --git a/src/commands/sample.rs b/src/commands/sample.rs index 987ee2f..9fbfdc7 100644 --- a/src/commands/sample.rs +++ b/src/commands/sample.rs @@ -4,8 +4,8 @@ use serde::Serialize; use crate::{ io::{ - match_input, match_output, read_bed3_set, read_bed6_set, write_records_iter_with, - WriteNamedIter, WriteNamedIterImpl, + match_input, match_output, read_bed12_set, read_bed3_set, read_bed6_set, + write_records_iter_with, WriteNamedIter, WriteNamedIterImpl, }, types::{InputFormat, Translater}, utils::build_rng, @@ -99,5 +99,18 @@ pub fn sample( compression_level, ) } + InputFormat::Bed12 => { + let (set, translater) = read_bed12_set(input_handle, named)?; + sample_from_set( + &set, + number, + fraction, + seed, + translater.as_ref(), + output, + compression_threads, + compression_level, + ) + } } } diff --git a/src/commands/sort.rs b/src/commands/sort.rs index 27a4cf5..bbebf29 100644 --- a/src/commands/sort.rs +++ b/src/commands/sort.rs @@ -1,7 +1,7 @@ use crate::{ io::{ - match_input, match_output, read_bed3_set, read_bed6_set, write_records_iter_with, - WriteNamedIter, WriteNamedIterImpl, + match_input, match_output, read_bed12_set, read_bed3_set, read_bed6_set, + write_records_iter_with, WriteNamedIter, WriteNamedIterImpl, }, types::{InputFormat, Reorder, Retranslater, Translater}, }; @@ -90,6 +90,26 @@ fn sort_bed6( ) } +fn sort_bed12( + input: Option, + output: Option, + named: bool, + parallel: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let input_handle = match_input(input)?; + let (set, translater) = read_bed12_set(input_handle, named)?; + sort_and_write( + set, + output, + translater, + parallel, + compression_threads, + compression_level, + ) +} + fn initialize_thread_pool(threads: usize) -> Result { if threads > 1 { ThreadPoolBuilder::new() @@ -132,5 +152,13 @@ pub fn sort( compression_threads, compression_level, ), + InputFormat::Bed12 => sort_bed12( + input, + output, + named, + parallel, + compression_threads, + compression_level, + ), } } diff --git a/src/commands/subtract/subtract.rs b/src/commands/subtract/subtract.rs index 6649ca6..62b2d39 100644 --- a/src/commands/subtract/subtract.rs +++ b/src/commands/subtract/subtract.rs @@ -1,8 +1,8 @@ use crate::{ commands::{run_find, OverlapMethod}, io::{ - match_input, match_output, read_paired_bed3_sets, read_paired_bed6_sets, - write_records_iter_with, WriteNamedIter, WriteNamedIterImpl, + match_input, match_output, read_paired_bed12_sets, read_paired_bed3_sets, + read_paired_bed6_sets, write_records_iter_with, WriteNamedIter, WriteNamedIterImpl, }, types::{InputFormat, Translater}, utils::sort_pairs, @@ -149,6 +149,37 @@ fn subtract_bed6( ) } +fn subtract_bed12( + query_path: Option, + target_path: String, + output_handle: W, + overlap_method: OverlapMethod, + unmerged: bool, + named: bool, +) -> Result<()> { + // load query and target sets + let query_handle = match_input(query_path)?; + let target_handle = match_input(Some(target_path))?; + let (mut query_set, mut target_set, translater) = + read_paired_bed12_sets(query_handle, target_handle, named)?; + + // sort query and target sets + sort_pairs(&mut query_set, &mut target_set, false); + + // merge target set + let bset = target_set.merge()?; + + // run subtraction + run_subtract( + &query_set, + &bset, + &overlap_method, + unmerged, + output_handle, + translater.as_ref(), + ) +} + pub fn subtract( query_path: Option, target_path: String, @@ -183,5 +214,13 @@ pub fn subtract( unmerged, named, ), + InputFormat::Bed12 => subtract_bed12( + query_path, + target_path, + output_handle, + overlap_method, + unmerged, + named, + ), } } diff --git a/src/io/mod.rs b/src/io/mod.rs index 3291dcd..9e1fb18 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -5,8 +5,8 @@ mod write; pub use general::{match_input, match_output}; pub use iter::{NamedIter, UnnamedIter}; pub use read::{ - build_reader, iter_unnamed, read_bed3_set, read_bed6_set, read_paired_bed3_sets, - read_paired_bed6_sets, + build_reader, iter_unnamed, read_bed12_set, read_bed3_set, read_bed6_set, + read_paired_bed12_sets, read_paired_bed3_sets, read_paired_bed6_sets, }; use serde::{Deserialize, Serialize}; pub use write::{ diff --git a/src/io/read/bed12.rs b/src/io/read/bed12.rs new file mode 100644 index 0000000..ef75ebe --- /dev/null +++ b/src/io/read/bed12.rs @@ -0,0 +1,110 @@ +use super::build_reader; +use crate::types::{Bed12, NumericBed12, NumericBed12Set, Translater}; +use anyhow::{bail, Result}; +use bedrs::Container; +use csv::ByteRecord; +use std::io::Read; + +pub fn read_bed12_set( + reader: R, + named: bool, +) -> Result<(NumericBed12Set, Option)> { + if named { + let (set, translater) = read_bed12_set_named(reader)?; + Ok((set, Some(translater))) + } else { + let set = read_bed12_set_unnamed(reader)?; + Ok((set, None)) + } +} + +pub fn read_paired_bed12_sets( + reader_1: R, + reader_2: R, + named: bool, +) -> Result<(NumericBed12Set, NumericBed12Set, Option)> { + if named { + let (query_set, target_set, translater) = read_paired_bed12_named(reader_1, reader_2)?; + Ok((query_set, target_set, Some(translater))) + } else { + let query_set = read_bed12_set_unnamed(reader_1)?; + let target_set = read_bed12_set_unnamed(reader_2)?; + Ok((query_set, target_set, None)) + } +} + +fn read_bed12_set_unnamed(reader: R) -> Result { + let mut reader = build_reader(reader); + let set = reader + .deserialize() + .map(|record| { + let record: NumericBed12 = match record { + Ok(record) => record, + Err(e) => { + bail!("Could not build bed record:\n\nIf your BED has non-integer chromosome names try rerunning with the `-N` flag:\n\nERROR: {}", e) + } + }; + Ok(record) + }) + .collect::>()?; + Ok(set) +} + +/// Reads a single file into a GenomicIntervalSet and a Translater +fn read_bed12_set_named(reader: R) -> Result<(NumericBed12Set, Translater)> { + let mut translater = Translater::new(); + let set = convert_bed12_set(reader, &mut translater)?; + Ok((set, translater)) +} + +/// Convert a CSV reader into a GenomicIntervalSet +/// +/// It uses an externally initialized name map and index map to keep track of +/// chromosome names and indices. This is useful for reading multiple files +/// and keeping track of the same chromosome names and indices. +fn convert_bed12_set(reader: R, translater: &mut Translater) -> Result { + let mut reader = build_reader(reader); + let mut raw_record = ByteRecord::new(); + let mut set = NumericBed12Set::empty(); + while reader.read_byte_record(&mut raw_record)? { + let record: Bed12 = raw_record.deserialize(None)?; + translater.add_name(record.chr); + translater.add_name(record.name); + translater.add_name(record.item_rgb); + translater.add_name(record.block_sizes); + translater.add_name(record.block_starts); + + let chr_int = translater.get_idx(record.chr).unwrap(); + let name_int = translater.get_idx(record.name).unwrap(); + let item_rgb_int = translater.get_idx(record.item_rgb).unwrap(); + let block_sizes_int = translater.get_idx(record.block_sizes).unwrap(); + let block_starts_int = translater.get_idx(record.block_starts).unwrap(); + let interval = NumericBed12::new( + chr_int, + record.start, + record.end, + name_int, + record.score, + record.strand, + record.thick_start, + record.thick_end, + item_rgb_int, + record.block_count, + block_sizes_int, + block_starts_int, + ); + set.insert(interval); + } + Ok(set) +} + +/// Reads two files into two NumericBed12Set and a Translater +fn read_paired_bed12_named( + reader_1: R, + reader_2: R, +) -> Result<(NumericBed12Set, NumericBed12Set, Translater)> { + let mut translater = Translater::new(); + let set_1 = convert_bed12_set(reader_1, &mut translater)?; + let set_2 = convert_bed12_set(reader_2, &mut translater)?; + Ok((set_1, set_2, translater)) +} diff --git a/src/io/read/mod.rs b/src/io/read/mod.rs index 4ece176..d78fcf7 100644 --- a/src/io/read/mod.rs +++ b/src/io/read/mod.rs @@ -1,10 +1,12 @@ use crate::types::NumericBed6Set; use bedrs::GenomicIntervalSet; +pub mod bed12; pub mod bed3; pub mod bed6; pub mod iter; pub mod utils; +pub use bed12::{read_bed12_set, read_paired_bed12_sets}; pub use bed3::{read_bed3_set, read_paired_bed3_sets}; pub use bed6::{read_bed6_set, read_paired_bed6_sets}; pub use iter::iter_unnamed; diff --git a/src/io/write/iter.rs b/src/io/write/iter.rs index 9012a44..635a7cc 100644 --- a/src/io/write/iter.rs +++ b/src/io/write/iter.rs @@ -1,5 +1,5 @@ use super::build_writer; -use crate::types::{NumericBed6, Translate}; +use crate::types::{NumericBed12, NumericBed6, Translate}; use anyhow::Result; use bedrs::{Coordinates, GenomicInterval}; use serde::Serialize; @@ -150,3 +150,71 @@ impl<'a> WriteNamedIter<&'a NumericBed6> for WriteNamedIterImpl { Ok(()) } } +impl WriteNamedIter for WriteNamedIterImpl { + fn write_named_iter, Tr: Translate>( + writer: W, + iterator: It, + translater: &Tr, + ) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let chr = translater.get_name(*interval.chr()).unwrap(); + let name = translater.get_name(interval.name()).unwrap(); + let item_rgb = translater.get_name(interval.item_rgb).unwrap(); + let block_count = translater.get_name(interval.block_count).unwrap(); + let block_sizes = translater.get_name(interval.block_sizes).unwrap(); + let block_starts = translater.get_name(interval.block_starts).unwrap(); + let named_interval = ( + chr, + interval.start(), + interval.end(), + name, + interval.score, + interval.strand, + interval.thick_start, + interval.thick_end, + item_rgb, + block_count, + block_sizes, + block_starts, + ); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } +} +impl<'a> WriteNamedIter<&'a NumericBed12> for WriteNamedIterImpl { + fn write_named_iter, Tr: Translate>( + writer: W, + iterator: It, + translater: &Tr, + ) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let chr = translater.get_name(*interval.chr()).unwrap(); + let name = translater.get_name(interval.name()).unwrap(); + let item_rgb = translater.get_name(interval.item_rgb).unwrap(); + let block_count = translater.get_name(interval.block_count).unwrap(); + let block_sizes = translater.get_name(interval.block_sizes).unwrap(); + let block_starts = translater.get_name(interval.block_starts).unwrap(); + let named_interval = ( + chr, + interval.start(), + interval.end(), + name, + interval.score, + interval.strand, + interval.thick_start, + interval.thick_end, + item_rgb, + block_count, + block_sizes, + block_starts, + ); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } +} diff --git a/src/types/formats/bed12.rs b/src/types/formats/bed12.rs new file mode 100644 index 0000000..1e76995 --- /dev/null +++ b/src/types/formats/bed12.rs @@ -0,0 +1,293 @@ +use anyhow::{bail, Result}; +use bedrs::{Container, Coordinates, Strand}; +use hashbrown::HashMap; +use serde::{Deserialize, Serialize}; + +use crate::types::{Translate, Translater}; + +#[derive(Debug, Deserialize, Serialize)] +pub struct Bed12<'a> { + pub chr: &'a str, + pub start: usize, + pub end: usize, + pub name: &'a str, + pub score: f64, + pub strand: Strand, + pub thick_start: usize, + pub thick_end: usize, + pub item_rgb: &'a str, + pub block_count: usize, + pub block_sizes: &'a str, + pub block_starts: &'a str, +} +#[allow(dead_code)] +impl<'a> Bed12<'a> { + pub fn from_numeric(record: &NumericBed12, translater: &'a Translater) -> Self { + Self { + chr: translater.get_name(record.chr).unwrap(), + start: record.start, + end: record.end, + name: translater.get_name(record.name).unwrap(), + score: record.score, + strand: record.strand, + thick_start: record.thick_start, + thick_end: record.thick_end, + item_rgb: translater.get_name(record.item_rgb).unwrap(), + block_count: record.block_count, + block_sizes: translater.get_name(record.block_sizes).unwrap(), + block_starts: translater.get_name(record.block_starts).unwrap(), + } + } +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct NumericBed12Set { + records: Vec, + max_len: Option, + is_sorted: bool, +} +impl FromIterator for NumericBed12Set { + fn from_iter>(iter: I) -> Self { + let mut max_len = 0; + let records = iter + .into_iter() + .map(|interval| { + max_len = max_len.max(interval.len()); + interval + }) + .collect(); + let max_len = if max_len == 0 { None } else { Some(max_len) }; + Self { + records, + max_len, + is_sorted: false, + } + } +} +impl Container for NumericBed12Set { + fn new(records: Vec) -> Self { + let max_len = records.iter().map(|iv| iv.len()).max(); + Self { + records, + max_len, + is_sorted: false, + } + } + + fn records(&self) -> &Vec { + &self.records + } + fn records_mut(&mut self) -> &mut Vec { + &mut self.records + } + fn records_owned(self) -> Vec { + self.records + } + fn is_sorted(&self) -> bool { + self.is_sorted + } + fn sorted_mut(&mut self) -> &mut bool { + &mut self.is_sorted + } + fn max_len(&self) -> Option { + self.max_len + } + fn max_len_mut(&mut self) -> &mut Option { + &mut self.max_len + } + fn span(&self) -> Result { + if self.is_empty() { + bail!("Cannot get span of empty interval set") + } else if !self.is_sorted() { + bail!("Cannot get span of unsorted interval set") + } else { + let first = self.records().first().unwrap(); + let last = self.records().last().unwrap(); + if first.chr() != last.chr() { + bail!("Cannot get span of interval set spanning multiple chromosomes") + } else { + let iv = NumericBed12::new( + *first.chr(), + first.start(), + last.end(), + 0, + 0.0, + Strand::Unknown, + 0, + 0, + 0, + 0, + 0, + 0, + ); + Ok(iv) + } + } + } +} + +#[derive(Debug, Clone, Copy, Deserialize, Serialize)] +pub struct NumericBed12 { + pub chr: usize, + pub start: usize, + pub end: usize, + pub name: usize, + pub score: f64, + pub strand: Strand, + pub thick_start: usize, + pub thick_end: usize, + pub item_rgb: usize, + pub block_count: usize, + pub block_sizes: usize, + pub block_starts: usize, +} +#[allow(dead_code)] +impl NumericBed12 { + pub fn from_bed12(bed12: &Bed12, name_to_idx: &HashMap) -> Self { + Self { + chr: name_to_idx[bed12.chr], + start: bed12.start, + end: bed12.end, + name: name_to_idx[bed12.name], + score: bed12.score, + strand: bed12.strand, + thick_start: bed12.thick_start, + thick_end: bed12.thick_end, + item_rgb: name_to_idx[bed12.item_rgb], + block_count: bed12.block_count, + block_sizes: name_to_idx[bed12.block_sizes], + block_starts: name_to_idx[bed12.block_starts], + } + } + pub fn new( + chr: usize, + start: usize, + end: usize, + name: usize, + score: f64, + strand: Strand, + thick_start: usize, + thick_end: usize, + item_rgb: usize, + block_count: usize, + block_sizes: usize, + block_starts: usize, + ) -> Self { + Self { + chr, + start, + end, + name, + score, + strand, + thick_start, + thick_end, + item_rgb, + block_count, + block_sizes, + block_starts, + } + } + pub fn name(&self) -> usize { + self.name + } + pub fn update_name(&mut self, name: &usize) { + self.name = *name; + } + pub fn update_item_rgb(&mut self, item_rgb: &usize) { + self.item_rgb = *item_rgb; + } + pub fn update_block_sizes(&mut self, block_sizes: &usize) { + self.block_sizes = *block_sizes; + } + pub fn update_block_starts(&mut self, block_starts: &usize) { + self.block_starts = *block_starts; + } +} +impl Coordinates for NumericBed12 { + fn chr(&self) -> &usize { + &self.chr + } + + fn start(&self) -> usize { + self.start + } + + fn end(&self) -> usize { + self.end + } + + fn strand(&self) -> Option { + Some(self.strand) + } + + fn update_chr(&mut self, chr: &usize) { + self.chr = *chr; + } + + fn update_start(&mut self, start: &usize) { + self.start = *start; + } + + fn update_end(&mut self, end: &usize) { + self.end = *end; + } + + fn from(other: &Self) -> Self { + Self { + chr: other.chr, + start: other.start, + end: other.end, + name: other.name, + score: other.score, + strand: other.strand, + thick_start: other.thick_start, + thick_end: other.thick_end, + item_rgb: other.item_rgb, + block_count: other.block_count, + block_sizes: other.block_sizes, + block_starts: other.block_starts, + } + } +} +impl Coordinates for &NumericBed12 { + fn chr(&self) -> &usize { + &self.chr + } + + fn start(&self) -> usize { + self.start + } + + fn end(&self) -> usize { + self.end + } + + fn strand(&self) -> Option { + Some(self.strand) + } + + #[allow(dead_code)] + #[allow(unused_variables)] + fn update_chr(&mut self, chr: &usize) { + unreachable!("Cannot update chr of immutable reference") + } + + #[allow(dead_code)] + #[allow(unused_variables)] + fn update_start(&mut self, start: &usize) { + unreachable!("Cannot update start of immutable reference") + } + + #[allow(dead_code)] + #[allow(unused_variables)] + fn update_end(&mut self, end: &usize) { + unreachable!("Cannot update end of immutable reference") + } + + #[allow(dead_code)] + #[allow(unused_variables)] + fn from(other: &Self) -> Self { + unimplemented!("Cannot create owned instance of a reference") + } +} diff --git a/src/types/formats/formats.rs b/src/types/formats/formats.rs index 6e834aa..8d036a8 100644 --- a/src/types/formats/formats.rs +++ b/src/types/formats/formats.rs @@ -15,6 +15,7 @@ pub enum InputFormat { #[default] Bed3, Bed6, + Bed12, } impl InputFormat { #[allow(dead_code)] @@ -30,6 +31,7 @@ impl InputFormat { match num_fields { 3 => Ok(Self::Bed3), 6 => Ok(Self::Bed6), + 12 => Ok(Self::Bed12), _ => bail!( "Cannot predict input format from line: {}", std::str::from_utf8(line)? @@ -63,6 +65,15 @@ impl FieldFormat { let input_format = InputFormat::predict_from_bytes(line)?; let mut fields = line.split(|b| *b == b'\t'); match input_format { + InputFormat::Bed12 => { + let chr = from_utf8(fields.nth(0).unwrap())?; + let name = from_utf8(fields.nth(3).unwrap())?; + if chr.parse::().is_ok() && name.parse::().is_ok() { + Ok(Self::IntegerBased) + } else { + Ok(Self::StringBased) + } + } InputFormat::Bed6 => { let chr = from_utf8(fields.nth(0).unwrap())?; let name = from_utf8(fields.nth(2).unwrap())?; diff --git a/src/types/formats/mod.rs b/src/types/formats/mod.rs index 5fd1ed2..fd34740 100644 --- a/src/types/formats/mod.rs +++ b/src/types/formats/mod.rs @@ -1,6 +1,8 @@ +mod bed12; mod bed6; mod formats; mod genome; +pub use bed12::{Bed12, NumericBed12, NumericBed12Set}; pub use bed6::{Bed6, NumericBed6, NumericBed6Set}; pub use formats::{FieldFormat, InputFormat}; pub use genome::Genome; diff --git a/src/types/mod.rs b/src/types/mod.rs index da2b7c8..0dc2b9d 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,6 +1,9 @@ mod formats; mod pairs; mod translater; -pub use formats::{Bed6, FieldFormat, Genome, InputFormat, NumericBed6, NumericBed6Set}; +pub use formats::{ + Bed12, Bed6, FieldFormat, Genome, InputFormat, NumericBed12, NumericBed12Set, NumericBed6, + NumericBed6Set, +}; pub use pairs::IntervalPair; pub use translater::{Reorder, Retranslater, StreamTranslater, Translate, Translater}; diff --git a/src/types/translater.rs b/src/types/translater.rs index a1f0727..5260b6a 100644 --- a/src/types/translater.rs +++ b/src/types/translater.rs @@ -1,4 +1,4 @@ -use super::NumericBed6; +use super::{NumericBed12, NumericBed6}; use bedrs::{traits::IntervalBounds, Container, Coordinates, GenomicInterval}; use dashmap::DashMap; use hashbrown::HashMap; @@ -148,3 +148,24 @@ impl Reorder for NumericBed6 { retranslate } } +impl Reorder for NumericBed12 { + fn reorder_translater( + set: &mut impl Container, + translater: Translater, + ) -> Retranslater { + let retranslate = translater.lex_sort(); + set.apply_mut(|iv| { + let new_chr = retranslate.get_rank(*iv.chr()).unwrap(); + let new_name = retranslate.get_rank(iv.name()).unwrap(); + let new_item_rgb = retranslate.get_rank(iv.item_rgb).unwrap(); + let new_block_sizes = retranslate.get_rank(iv.block_sizes).unwrap(); + let new_block_starts = retranslate.get_rank(iv.block_starts).unwrap(); + iv.update_chr(&new_chr); + iv.update_name(&new_name); + iv.update_item_rgb(&new_item_rgb); + iv.update_block_sizes(&new_block_sizes); + iv.update_block_starts(&new_block_starts); + }); + retranslate + } +} From 365c9bb25552d19856219c52d401d462e1e078a9 Mon Sep 17 00:00:00 2001 From: noam teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:19:55 -0800 Subject: [PATCH 2/5] tests: added testing for bed12 for applicable commands --- tests/datasets/get_fasta/named.bed12 | 2 + tests/datasets/get_fasta/unnamed.bed12 | 2 + tests/datasets/intersect/intersect_a.bed12 | 10 ++ tests/datasets/intersect/intersect_b.bed12 | 10 ++ tests/datasets/merge/sorted.bed12 | 7 + tests/datasets/merge/sorted_named.bed12 | 7 + tests/datasets/merge/unsorted.bed12 | 7 + tests/datasets/merge/unsorted_named.bed12 | 7 + tests/datasets/sample/sample.bed12 | 100 ++++++++++++++ tests/datasets/sample/sample.bed6 | 100 ++++++++++++++ tests/datasets/sort/unsorted.bed12 | 20 +++ tests/datasets/sort/unsorted_named.bed12 | 20 +++ tests/datasets/subtract/subtract_a.bed12 | 5 + tests/datasets/subtract/subtract_b.bed12 | 5 + tests/get_fasta.rs | 38 ++++++ tests/intersect.rs | 30 +++++ tests/merge.rs | 121 +++++++++++------ tests/random.rs | 13 ++ tests/sample.rs | 38 ++++++ tests/sort.rs | 143 +++++++++++++++++++++ tests/subtract.rs | 101 +++++++++++++++ 21 files changed, 747 insertions(+), 39 deletions(-) create mode 100644 tests/datasets/get_fasta/named.bed12 create mode 100644 tests/datasets/get_fasta/unnamed.bed12 create mode 100644 tests/datasets/intersect/intersect_a.bed12 create mode 100644 tests/datasets/intersect/intersect_b.bed12 create mode 100644 tests/datasets/merge/sorted.bed12 create mode 100644 tests/datasets/merge/sorted_named.bed12 create mode 100644 tests/datasets/merge/unsorted.bed12 create mode 100644 tests/datasets/merge/unsorted_named.bed12 create mode 100644 tests/datasets/sample/sample.bed12 create mode 100644 tests/datasets/sample/sample.bed6 create mode 100644 tests/datasets/sort/unsorted.bed12 create mode 100644 tests/datasets/sort/unsorted_named.bed12 create mode 100644 tests/datasets/subtract/subtract_a.bed12 create mode 100644 tests/datasets/subtract/subtract_b.bed12 diff --git a/tests/datasets/get_fasta/named.bed12 b/tests/datasets/get_fasta/named.bed12 new file mode 100644 index 0000000..be75350 --- /dev/null +++ b/tests/datasets/get_fasta/named.bed12 @@ -0,0 +1,2 @@ +chr1 20 30 0 0.0 + 0 0 0 0 0 0 +chr2 30 40 0 0.0 - 0 0 0 0 0 0 diff --git a/tests/datasets/get_fasta/unnamed.bed12 b/tests/datasets/get_fasta/unnamed.bed12 new file mode 100644 index 0000000..7e9a983 --- /dev/null +++ b/tests/datasets/get_fasta/unnamed.bed12 @@ -0,0 +1,2 @@ +1 20 30 0 0.0 + 0 0 0 0 0 0 +2 30 40 0 0.0 - 0 0 0 0 0 0 diff --git a/tests/datasets/intersect/intersect_a.bed12 b/tests/datasets/intersect/intersect_a.bed12 new file mode 100644 index 0000000..97b9f53 --- /dev/null +++ b/tests/datasets/intersect/intersect_a.bed12 @@ -0,0 +1,10 @@ +1 72 222 0 0.0 + 0 0 0 0 0 0 +1 257 407 0 0.0 + 0 0 0 0 0 0 +1 268 418 0 0.0 + 0 0 0 0 0 0 +1 467 617 0 0.0 + 0 0 0 0 0 0 +1 819 969 0 0.0 + 0 0 0 0 0 0 +2 174 324 0 0.0 + 0 0 0 0 0 0 +2 587 737 0 0.0 + 0 0 0 0 0 0 +3 395 545 0 0.0 + 0 0 0 0 0 0 +3 554 704 0 0.0 + 0 0 0 0 0 0 +3 653 803 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/intersect/intersect_b.bed12 b/tests/datasets/intersect/intersect_b.bed12 new file mode 100644 index 0000000..515e704 --- /dev/null +++ b/tests/datasets/intersect/intersect_b.bed12 @@ -0,0 +1,10 @@ +1 55 205 0 0.0 + 0 0 0 0 0 0 +1 69 219 0 0.0 + 0 0 0 0 0 0 +1 93 243 0 0.0 + 0 0 0 0 0 0 +1 156 306 0 0.0 + 0 0 0 0 0 0 +1 603 753 0 0.0 + 0 0 0 0 0 0 +1 837 987 0 0.0 + 0 0 0 0 0 0 +2 39 189 0 0.0 + 0 0 0 0 0 0 +2 71 221 0 0.0 + 0 0 0 0 0 0 +2 672 822 0 0.0 + 0 0 0 0 0 0 +3 138 288 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/merge/sorted.bed12 b/tests/datasets/merge/sorted.bed12 new file mode 100644 index 0000000..a6007b2 --- /dev/null +++ b/tests/datasets/merge/sorted.bed12 @@ -0,0 +1,7 @@ +1 10 30 0 0.0 + 0 0 0 0 0 0 +1 25 45 0 0.0 + 0 0 0 0 0 0 +1 100 105 0 0.0 + 0 0 0 0 0 0 +1 105 300 0 0.0 + 0 0 0 0 0 0 +2 105 300 0 0.0 + 0 0 0 0 0 0 +2 105 300 0 0.0 + 0 0 0 0 0 0 +2 105 301 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/merge/sorted_named.bed12 b/tests/datasets/merge/sorted_named.bed12 new file mode 100644 index 0000000..4bace91 --- /dev/null +++ b/tests/datasets/merge/sorted_named.bed12 @@ -0,0 +1,7 @@ +chr1 10 30 0 0.0 + 0 0 0 0 0 0 +chr1 25 45 0 0.0 + 0 0 0 0 0 0 +chr1 100 105 0 0.0 + 0 0 0 0 0 0 +chr1 105 300 0 0.0 + 0 0 0 0 0 0 +chr2 105 300 0 0.0 + 0 0 0 0 0 0 +chr2 105 300 0 0.0 + 0 0 0 0 0 0 +chr2 105 301 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/merge/unsorted.bed12 b/tests/datasets/merge/unsorted.bed12 new file mode 100644 index 0000000..9dab437 --- /dev/null +++ b/tests/datasets/merge/unsorted.bed12 @@ -0,0 +1,7 @@ +1 105 300 0 0.0 + 0 0 0 0 0 0 +2 105 301 0 0.0 + 0 0 0 0 0 0 +2 105 300 0 0.0 + 0 0 0 0 0 0 +1 100 105 0 0.0 + 0 0 0 0 0 0 +1 25 45 0 0.0 + 0 0 0 0 0 0 +1 10 30 0 0.0 + 0 0 0 0 0 0 +2 105 300 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/merge/unsorted_named.bed12 b/tests/datasets/merge/unsorted_named.bed12 new file mode 100644 index 0000000..f3b2cb3 --- /dev/null +++ b/tests/datasets/merge/unsorted_named.bed12 @@ -0,0 +1,7 @@ +chr1 105 300 0 0.0 + 0 0 0 0 0 0 +chr2 105 301 0 0.0 + 0 0 0 0 0 0 +chr2 105 300 0 0.0 + 0 0 0 0 0 0 +chr1 100 105 0 0.0 + 0 0 0 0 0 0 +chr1 25 45 0 0.0 + 0 0 0 0 0 0 +chr1 10 30 0 0.0 + 0 0 0 0 0 0 +chr2 105 300 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/sample/sample.bed12 b/tests/datasets/sample/sample.bed12 new file mode 100644 index 0000000..56da3b0 --- /dev/null +++ b/tests/datasets/sample/sample.bed12 @@ -0,0 +1,100 @@ +1 12 162 0 0.0 + 0 0 0 0 0 0 +1 12 162 0 0.0 + 0 0 0 0 0 0 +1 59 209 0 0.0 + 0 0 0 0 0 0 +1 62 212 0 0.0 + 0 0 0 0 0 0 +1 144 294 0 0.0 + 0 0 0 0 0 0 +1 191 341 0 0.0 + 0 0 0 0 0 0 +1 313 463 0 0.0 + 0 0 0 0 0 0 +1 322 472 0 0.0 + 0 0 0 0 0 0 +1 349 499 0 0.0 + 0 0 0 0 0 0 +1 357 507 0 0.0 + 0 0 0 0 0 0 +1 358 508 0 0.0 + 0 0 0 0 0 0 +1 385 535 0 0.0 + 0 0 0 0 0 0 +1 392 542 0 0.0 + 0 0 0 0 0 0 +1 454 604 0 0.0 + 0 0 0 0 0 0 +1 459 609 0 0.0 + 0 0 0 0 0 0 +1 494 644 0 0.0 + 0 0 0 0 0 0 +1 532 682 0 0.0 + 0 0 0 0 0 0 +1 564 714 0 0.0 + 0 0 0 0 0 0 +1 631 781 0 0.0 + 0 0 0 0 0 0 +1 647 797 0 0.0 + 0 0 0 0 0 0 +1 672 822 0 0.0 + 0 0 0 0 0 0 +1 678 828 0 0.0 + 0 0 0 0 0 0 +1 717 867 0 0.0 + 0 0 0 0 0 0 +1 740 890 0 0.0 + 0 0 0 0 0 0 +1 754 904 0 0.0 + 0 0 0 0 0 0 +1 796 946 0 0.0 + 0 0 0 0 0 0 +1 796 946 0 0.0 + 0 0 0 0 0 0 +2 214 364 0 0.0 + 0 0 0 0 0 0 +2 277 427 0 0.0 + 0 0 0 0 0 0 +2 279 429 0 0.0 + 0 0 0 0 0 0 +2 348 498 0 0.0 + 0 0 0 0 0 0 +2 359 509 0 0.0 + 0 0 0 0 0 0 +2 376 526 0 0.0 + 0 0 0 0 0 0 +2 419 569 0 0.0 + 0 0 0 0 0 0 +2 495 645 0 0.0 + 0 0 0 0 0 0 +2 565 715 0 0.0 + 0 0 0 0 0 0 +2 606 756 0 0.0 + 0 0 0 0 0 0 +2 650 800 0 0.0 + 0 0 0 0 0 0 +2 718 868 0 0.0 + 0 0 0 0 0 0 +2 725 875 0 0.0 + 0 0 0 0 0 0 +2 750 900 0 0.0 + 0 0 0 0 0 0 +3 106 256 0 0.0 + 0 0 0 0 0 0 +3 115 265 0 0.0 + 0 0 0 0 0 0 +3 118 268 0 0.0 + 0 0 0 0 0 0 +3 161 311 0 0.0 + 0 0 0 0 0 0 +3 166 316 0 0.0 + 0 0 0 0 0 0 +3 239 389 0 0.0 + 0 0 0 0 0 0 +3 242 392 0 0.0 + 0 0 0 0 0 0 +3 382 532 0 0.0 + 0 0 0 0 0 0 +3 392 542 0 0.0 + 0 0 0 0 0 0 +3 453 603 0 0.0 + 0 0 0 0 0 0 +3 538 688 0 0.0 + 0 0 0 0 0 0 +3 569 719 0 0.0 + 0 0 0 0 0 0 +3 572 722 0 0.0 + 0 0 0 0 0 0 +3 611 761 0 0.0 + 0 0 0 0 0 0 +3 635 785 0 0.0 + 0 0 0 0 0 0 +3 639 789 0 0.0 + 0 0 0 0 0 0 +3 644 794 0 0.0 + 0 0 0 0 0 0 +3 705 855 0 0.0 + 0 0 0 0 0 0 +3 724 874 0 0.0 + 0 0 0 0 0 0 +3 736 886 0 0.0 + 0 0 0 0 0 0 +4 57 207 0 0.0 + 0 0 0 0 0 0 +4 61 211 0 0.0 + 0 0 0 0 0 0 +4 74 224 0 0.0 + 0 0 0 0 0 0 +4 110 260 0 0.0 + 0 0 0 0 0 0 +4 123 273 0 0.0 + 0 0 0 0 0 0 +4 165 315 0 0.0 + 0 0 0 0 0 0 +4 167 317 0 0.0 + 0 0 0 0 0 0 +4 189 339 0 0.0 + 0 0 0 0 0 0 +4 197 347 0 0.0 + 0 0 0 0 0 0 +4 262 412 0 0.0 + 0 0 0 0 0 0 +4 308 458 0 0.0 + 0 0 0 0 0 0 +4 313 463 0 0.0 + 0 0 0 0 0 0 +4 342 492 0 0.0 + 0 0 0 0 0 0 +4 437 587 0 0.0 + 0 0 0 0 0 0 +4 505 655 0 0.0 + 0 0 0 0 0 0 +4 688 838 0 0.0 + 0 0 0 0 0 0 +4 750 900 0 0.0 + 0 0 0 0 0 0 +4 772 922 0 0.0 + 0 0 0 0 0 0 +5 61 211 0 0.0 + 0 0 0 0 0 0 +5 86 236 0 0.0 + 0 0 0 0 0 0 +5 111 261 0 0.0 + 0 0 0 0 0 0 +5 158 308 0 0.0 + 0 0 0 0 0 0 +5 179 329 0 0.0 + 0 0 0 0 0 0 +5 184 334 0 0.0 + 0 0 0 0 0 0 +5 257 407 0 0.0 + 0 0 0 0 0 0 +5 262 412 0 0.0 + 0 0 0 0 0 0 +5 274 424 0 0.0 + 0 0 0 0 0 0 +5 294 444 0 0.0 + 0 0 0 0 0 0 +5 315 465 0 0.0 + 0 0 0 0 0 0 +5 434 584 0 0.0 + 0 0 0 0 0 0 +5 536 686 0 0.0 + 0 0 0 0 0 0 +5 555 705 0 0.0 + 0 0 0 0 0 0 +5 618 768 0 0.0 + 0 0 0 0 0 0 +5 638 788 0 0.0 + 0 0 0 0 0 0 +5 669 819 0 0.0 + 0 0 0 0 0 0 +5 704 854 0 0.0 + 0 0 0 0 0 0 +5 717 867 0 0.0 + 0 0 0 0 0 0 +5 732 882 0 0.0 + 0 0 0 0 0 0 +5 777 927 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/sample/sample.bed6 b/tests/datasets/sample/sample.bed6 new file mode 100644 index 0000000..9784a9c --- /dev/null +++ b/tests/datasets/sample/sample.bed6 @@ -0,0 +1,100 @@ +1 12 162 0 0.0 + +1 12 162 0 0.0 + +1 59 209 0 0.0 + +1 62 212 0 0.0 + +1 144 294 0 0.0 + +1 191 341 0 0.0 + +1 313 463 0 0.0 + +1 322 472 0 0.0 + +1 349 499 0 0.0 + +1 357 507 0 0.0 + +1 358 508 0 0.0 + +1 385 535 0 0.0 + +1 392 542 0 0.0 + +1 454 604 0 0.0 + +1 459 609 0 0.0 + +1 494 644 0 0.0 + +1 532 682 0 0.0 + +1 564 714 0 0.0 + +1 631 781 0 0.0 + +1 647 797 0 0.0 + +1 672 822 0 0.0 + +1 678 828 0 0.0 + +1 717 867 0 0.0 + +1 740 890 0 0.0 + +1 754 904 0 0.0 + +1 796 946 0 0.0 + +1 796 946 0 0.0 + +2 214 364 0 0.0 + +2 277 427 0 0.0 + +2 279 429 0 0.0 + +2 348 498 0 0.0 + +2 359 509 0 0.0 + +2 376 526 0 0.0 + +2 419 569 0 0.0 + +2 495 645 0 0.0 + +2 565 715 0 0.0 + +2 606 756 0 0.0 + +2 650 800 0 0.0 + +2 718 868 0 0.0 + +2 725 875 0 0.0 + +2 750 900 0 0.0 + +3 106 256 0 0.0 + +3 115 265 0 0.0 + +3 118 268 0 0.0 + +3 161 311 0 0.0 + +3 166 316 0 0.0 + +3 239 389 0 0.0 + +3 242 392 0 0.0 + +3 382 532 0 0.0 + +3 392 542 0 0.0 + +3 453 603 0 0.0 + +3 538 688 0 0.0 + +3 569 719 0 0.0 + +3 572 722 0 0.0 + +3 611 761 0 0.0 + +3 635 785 0 0.0 + +3 639 789 0 0.0 + +3 644 794 0 0.0 + +3 705 855 0 0.0 + +3 724 874 0 0.0 + +3 736 886 0 0.0 + +4 57 207 0 0.0 + +4 61 211 0 0.0 + +4 74 224 0 0.0 + +4 110 260 0 0.0 + +4 123 273 0 0.0 + +4 165 315 0 0.0 + +4 167 317 0 0.0 + +4 189 339 0 0.0 + +4 197 347 0 0.0 + +4 262 412 0 0.0 + +4 308 458 0 0.0 + +4 313 463 0 0.0 + +4 342 492 0 0.0 + +4 437 587 0 0.0 + +4 505 655 0 0.0 + +4 688 838 0 0.0 + +4 750 900 0 0.0 + +4 772 922 0 0.0 + +5 61 211 0 0.0 + +5 86 236 0 0.0 + +5 111 261 0 0.0 + +5 158 308 0 0.0 + +5 179 329 0 0.0 + +5 184 334 0 0.0 + +5 257 407 0 0.0 + +5 262 412 0 0.0 + +5 274 424 0 0.0 + +5 294 444 0 0.0 + +5 315 465 0 0.0 + +5 434 584 0 0.0 + +5 536 686 0 0.0 + +5 555 705 0 0.0 + +5 618 768 0 0.0 + +5 638 788 0 0.0 + +5 669 819 0 0.0 + +5 704 854 0 0.0 + +5 717 867 0 0.0 + +5 732 882 0 0.0 + +5 777 927 0 0.0 + diff --git a/tests/datasets/sort/unsorted.bed12 b/tests/datasets/sort/unsorted.bed12 new file mode 100644 index 0000000..c0dc900 --- /dev/null +++ b/tests/datasets/sort/unsorted.bed12 @@ -0,0 +1,20 @@ +1 519 669 0 0.0 + 0 0 0 0 0 0 +2 676 826 0 0.0 + 0 0 0 0 0 0 +3 249 399 0 0.0 + 0 0 0 0 0 0 +1 453 603 0 0.0 + 0 0 0 0 0 0 +1 643 793 0 0.0 + 0 0 0 0 0 0 +1 180 330 0 0.0 + 0 0 0 0 0 0 +3 423 573 0 0.0 + 0 0 0 0 0 0 +2 581 731 0 0.0 + 0 0 0 0 0 0 +2 538 688 0 0.0 + 0 0 0 0 0 0 +3 822 972 0 0.0 + 0 0 0 0 0 0 +3 188 338 0 0.0 + 0 0 0 0 0 0 +2 698 848 0 0.0 + 0 0 0 0 0 0 +3 405 555 0 0.0 + 0 0 0 0 0 0 +1 580 730 0 0.0 + 0 0 0 0 0 0 +3 232 382 0 0.0 + 0 0 0 0 0 0 +1 451 601 0 0.0 + 0 0 0 0 0 0 +3 315 465 0 0.0 + 0 0 0 0 0 0 +3 271 421 0 0.0 + 0 0 0 0 0 0 +3 791 941 0 0.0 + 0 0 0 0 0 0 +3 629 779 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/sort/unsorted_named.bed12 b/tests/datasets/sort/unsorted_named.bed12 new file mode 100644 index 0000000..c0dc900 --- /dev/null +++ b/tests/datasets/sort/unsorted_named.bed12 @@ -0,0 +1,20 @@ +1 519 669 0 0.0 + 0 0 0 0 0 0 +2 676 826 0 0.0 + 0 0 0 0 0 0 +3 249 399 0 0.0 + 0 0 0 0 0 0 +1 453 603 0 0.0 + 0 0 0 0 0 0 +1 643 793 0 0.0 + 0 0 0 0 0 0 +1 180 330 0 0.0 + 0 0 0 0 0 0 +3 423 573 0 0.0 + 0 0 0 0 0 0 +2 581 731 0 0.0 + 0 0 0 0 0 0 +2 538 688 0 0.0 + 0 0 0 0 0 0 +3 822 972 0 0.0 + 0 0 0 0 0 0 +3 188 338 0 0.0 + 0 0 0 0 0 0 +2 698 848 0 0.0 + 0 0 0 0 0 0 +3 405 555 0 0.0 + 0 0 0 0 0 0 +1 580 730 0 0.0 + 0 0 0 0 0 0 +3 232 382 0 0.0 + 0 0 0 0 0 0 +1 451 601 0 0.0 + 0 0 0 0 0 0 +3 315 465 0 0.0 + 0 0 0 0 0 0 +3 271 421 0 0.0 + 0 0 0 0 0 0 +3 791 941 0 0.0 + 0 0 0 0 0 0 +3 629 779 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/subtract/subtract_a.bed12 b/tests/datasets/subtract/subtract_a.bed12 new file mode 100644 index 0000000..7cfe0ff --- /dev/null +++ b/tests/datasets/subtract/subtract_a.bed12 @@ -0,0 +1,5 @@ +1 100 200 0 0.0 + 0 0 0 0 0 0 +1 150 160 0 0.0 + 0 0 0 0 0 0 +1 200 300 0 0.0 + 0 0 0 0 0 0 +1 400 475 0 0.0 + 0 0 0 0 0 0 +1 500 550 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/subtract/subtract_b.bed12 b/tests/datasets/subtract/subtract_b.bed12 new file mode 100644 index 0000000..1159058 --- /dev/null +++ b/tests/datasets/subtract/subtract_b.bed12 @@ -0,0 +1,5 @@ +1 120 125 0 0.0 + 0 0 0 0 0 0 +1 150 155 0 0.0 + 0 0 0 0 0 0 +1 150 160 0 0.0 + 0 0 0 0 0 0 +1 460 470 0 0.0 + 0 0 0 0 0 0 +1 490 500 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/get_fasta.rs b/tests/get_fasta.rs index 6f6c978..0e7f35b 100644 --- a/tests/get_fasta.rs +++ b/tests/get_fasta.rs @@ -75,4 +75,42 @@ mod testing { assert_eq!(String::from_utf8_lossy(&output.stdout), expected); Ok(()) } + + #[test] + fn test_get_fasta_bed12() -> Result<()> { + let input = "tests/datasets/get_fasta/unnamed.bed12"; + let fasta = "tests/datasets/get_fasta/unnamed.fa"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("get-fasta") + .arg("-b") + .arg(input) + .arg("-f") + .arg(fasta) + .arg("--format") + .arg("bed12") + .output()?; + let expected = ">1:20-30::0::0::+::0::0::0::0::0::0\nAGCGACTACG\n>2:30-40::0::0::-::0::0::0::0::0::0\nCGATCGATCG\n"; + assert_eq!(String::from_utf8_lossy(&output.stdout), expected); + Ok(()) + } + + #[test] + fn test_get_fasta_named_bed12() -> Result<()> { + let input = "tests/datasets/get_fasta/named.bed12"; + let fasta = "tests/datasets/get_fasta/named.fa"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("get-fasta") + .arg("-b") + .arg(input) + .arg("-f") + .arg(fasta) + .arg("--format") + .arg("bed12") + .output()?; + let expected = ">chr1:20-30::0::0::+::0::0::0::0::0::0\nAGCGACTACG\n>chr2:30-40::0::0::-::0::0::0::0::0::0\nCGATCGATCG\n"; + assert_eq!(String::from_utf8_lossy(&output.stdout), expected); + Ok(()) + } } diff --git a/tests/intersect.rs b/tests/intersect.rs index 08cc448..d86a4ce 100644 --- a/tests/intersect.rs +++ b/tests/intersect.rs @@ -70,6 +70,36 @@ mod testing { Ok(()) } + #[test] + fn test_intersect_bed12() -> Result<()> { + let a = "tests/datasets/intersect/intersect_a.bed12"; + let b = "tests/datasets/intersect/intersect_b.bed12"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("intersect") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("--format") + .arg("bed12") + .output()?; + + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 11); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 12); + Ok(()) + } + #[test] fn test_intersect_fractional_query() -> Result<()> { let a = "tests/datasets/intersect/intersect_a.bed"; diff --git a/tests/merge.rs b/tests/merge.rs index c20d30f..37b984e 100644 --- a/tests/merge.rs +++ b/tests/merge.rs @@ -12,21 +12,6 @@ mod testing { .join("") } - fn build_expected_str_bed6( - expected: &Vec<(S, T, T, S, F, C)>, - ) -> String { - expected - .iter() - .map(|(chr, start, end, name, score, strand)| { - format!( - "{}\t{}\t{}\t{}\t{:.1}\t{}\n", - chr, start, end, name, score, strand - ) - }) - .collect::>() - .join("") - } - #[test] fn test_merge_sorted_bed3() -> Result<()> { let input = "tests/datasets/merge/sorted.bed"; @@ -51,12 +36,26 @@ mod testing { .arg("bed6") .output()?; - let expected = vec![ - (1, 10, 45, 0, 0.0, '+'), - (1, 100, 300, 0, 0.0, '+'), - (2, 105, 301, 0, 0.0, '+'), - ]; - let expected_str = build_expected_str_bed6(&expected); + let expected = vec![(1, 10, 45), (1, 100, 300), (2, 105, 301)]; + let expected_str = build_expected_str(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + + #[test] + fn test_merge_sorted_bed12() -> Result<()> { + let input = "tests/datasets/merge/sorted.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("merge") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + + let expected = vec![(1, 10, 45), (1, 100, 300), (2, 105, 301)]; + let expected_str = build_expected_str(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); Ok(()) } @@ -97,12 +96,26 @@ mod testing { .arg("bed6") .output()?; - let expected = vec![ - (1, 10, 45, 0, 0.0, '+'), - (1, 100, 300, 0, 0.0, '+'), - (2, 105, 301, 0, 0.0, '+'), - ]; - let expected_str = build_expected_str_bed6(&expected); + let expected = vec![(1, 10, 45), (1, 100, 300), (2, 105, 301)]; + let expected_str = build_expected_str(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + + #[test] + fn test_merge_unsorted_bed12() -> Result<()> { + let input = "tests/datasets/merge/unsorted.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("merge") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + + let expected = vec![(1, 10, 45), (1, 100, 300), (2, 105, 301)]; + let expected_str = build_expected_str(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); Ok(()) } @@ -132,12 +145,27 @@ mod testing { .arg("bed6") .output()?; - let expected = vec![ - ("chr1", 10, 45, "0", 0.0, '+'), - ("chr1", 100, 300, "0", 0.0, '+'), - ("chr2", 105, 301, "0", 0.0, '+'), - ]; - let expected_str = build_expected_str_bed6(&expected); + let expected = vec![("chr1", 10, 45), ("chr1", 100, 300), ("chr2", 105, 301)]; + let expected_str = build_expected_str(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + + #[test] + fn test_merge_unsorted_named_bed12() -> Result<()> { + let input = "tests/datasets/merge/unsorted_named.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("merge") + .arg("-N") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + + let expected = vec![("chr1", 10, 45), ("chr1", 100, 300), ("chr2", 105, 301)]; + let expected_str = build_expected_str(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); Ok(()) } @@ -167,12 +195,27 @@ mod testing { .arg("bed6") .output()?; - let expected = vec![ - ("chr1", 10, 45, "0", 0.0, '+'), - ("chr1", 100, 300, "0", 0.0, '+'), - ("chr2", 105, 301, "0", 0.0, '+'), - ]; - let expected_str = build_expected_str_bed6(&expected); + let expected = vec![("chr1", 10, 45), ("chr1", 100, 300), ("chr2", 105, 301)]; + let expected_str = build_expected_str(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + + #[test] + fn test_merge_sorted_named_bed12() -> Result<()> { + let input = "tests/datasets/merge/sorted_named.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("merge") + .arg("-N") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + + let expected = vec![("chr1", 10, 45), ("chr1", 100, 300), ("chr2", 105, 301)]; + let expected_str = build_expected_str(&expected); assert_eq!(output.stdout, expected_str.as_bytes()); Ok(()) } diff --git a/tests/random.rs b/tests/random.rs index a299752..9d713fa 100644 --- a/tests/random.rs +++ b/tests/random.rs @@ -125,4 +125,17 @@ mod testing { } Ok(()) } + + #[test] + fn test_random_bed12() -> Result<()> { + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd.arg("random").arg("--format").arg("bed12").output()?; + let string_out = String::from_utf8(output.stdout)?.trim().to_string(); + let rows = string_out.split("\n"); + for row in rows { + let cols = row.split("\t").collect::>(); + assert_eq!(cols.len(), 12); + } + Ok(()) + } } diff --git a/tests/sample.rs b/tests/sample.rs index a7087eb..cdea61b 100644 --- a/tests/sample.rs +++ b/tests/sample.rs @@ -21,6 +21,44 @@ mod testing { Ok(()) } + #[test] + fn test_sample_int_bed6() -> Result<()> { + let input = "tests/datasets/sample/sample.bed6"; + let num_samples = 10; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sample") + .arg("-i") + .arg(input) + .arg("-n") + .arg(num_samples.to_string()) + .arg("--format") + .arg("bed6") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, num_samples); + Ok(()) + } + + #[test] + fn test_sample_int_bed12() -> Result<()> { + let input = "tests/datasets/sample/sample.bed12"; + let num_samples = 10; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sample") + .arg("-i") + .arg(input) + .arg("-n") + .arg(num_samples.to_string()) + .arg("--format") + .arg("bed12") + .output()?; + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, num_samples); + Ok(()) + } + #[test] fn test_sample_float() -> Result<()> { let input = "tests/datasets/sample/sample.bed"; diff --git a/tests/sort.rs b/tests/sort.rs index 40022a3..a123b64 100644 --- a/tests/sort.rs +++ b/tests/sort.rs @@ -226,4 +226,147 @@ mod testing { } Ok(()) } + + #[test] + fn test_sort_bed12() -> Result<()> { + let input = "tests/datasets/sort/unsorted.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sort") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + let output_str = String::from_utf8(output.stdout)?; + let rows = output_str + .split("\n") + .filter(|row| !row.is_empty()) + .collect::>(); + + let mut last_interval = GenomicInterval::new(0, 0, 0); + for row in rows { + let fields = row.split("\t").collect::>(); + assert_eq!(fields.len(), 12); + let numeric_fields = fields + .iter() + .take(3) + .map(|field| field.parse::().unwrap()) + .collect::>(); + let interval = + GenomicInterval::new(numeric_fields[0], numeric_fields[1], numeric_fields[2]); + assert!(interval.gt(&last_interval) || interval == last_interval); + last_interval = interval; + } + Ok(()) + } + + #[test] + fn test_par_sort_bed12() -> Result<()> { + let input = "tests/datasets/sort/unsorted.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sort") + .arg("-i") + .arg(input) + .arg("-t") + .arg("0") + .arg("--format") + .arg("bed12") + .output()?; + let output_str = String::from_utf8(output.stdout)?; + let rows = output_str + .split("\n") + .filter(|row| !row.is_empty()) + .collect::>(); + + let mut last_interval = GenomicInterval::new(0, 0, 0); + for row in rows { + let fields = row.split("\t").collect::>(); + assert_eq!(fields.len(), 12); + let numeric_fields = fields + .iter() + .take(3) + .map(|field| field.parse::().unwrap()) + .collect::>(); + let interval = + GenomicInterval::new(numeric_fields[0], numeric_fields[1], numeric_fields[2]); + assert!(interval.gt(&last_interval) || interval == last_interval); + last_interval = interval; + } + Ok(()) + } + + #[test] + fn test_lex_sort_bed12() -> Result<()> { + let input = "tests/datasets/sort/unsorted_named.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sort") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + let output_str = String::from_utf8(output.stdout)?; + let rows = output_str + .split("\n") + .filter(|row| !row.is_empty()) + .collect::>(); + + let mut last_interval = GenomicInterval::new(0, 0, 0); + for row in rows { + let fields = row.split("\t").collect::>(); + assert_eq!(fields.len(), 12); + let numeric_fields = fields + .iter() + .take(3) + .map(|field| field.replace("chr", "")) + .map(|field| field.parse::().unwrap()) + .collect::>(); + let interval = + GenomicInterval::new(numeric_fields[0], numeric_fields[1], numeric_fields[2]); + assert!(interval.gt(&last_interval) || interval == last_interval); + last_interval = interval; + } + Ok(()) + } + + #[test] + fn test_lex_sort_bed12_correct_name() -> Result<()> { + let input = "tests/datasets/sort/unsorted_named.bed12"; + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("sort") + .arg("-i") + .arg(input) + .arg("--format") + .arg("bed12") + .output()?; + let output_str = String::from_utf8(output.stdout)?; + let rows = output_str + .split("\n") + .filter(|row| !row.is_empty()) + .collect::>(); + + let mut last_interval = GenomicInterval::new(0, 0, 0); + for row in rows { + let fields = row.split("\t").collect::>(); + assert_eq!(fields.len(), 12); + let numeric_fields = fields + .iter() + .take(3) + .map(|field| field.replace("chr", "")) + .map(|field| field.parse::().unwrap()) + .collect::>(); + let interval = + GenomicInterval::new(numeric_fields[0], numeric_fields[1], numeric_fields[2]); + assert!(interval.gt(&last_interval) || interval == last_interval); + last_interval = interval; + assert_eq!(fields[3], "0"); + assert_eq!(fields[4], "0.0"); + assert_eq!(fields[5], "+"); + } + Ok(()) + } } diff --git a/tests/subtract.rs b/tests/subtract.rs index 2d6c097..27abac3 100644 --- a/tests/subtract.rs +++ b/tests/subtract.rs @@ -27,6 +27,47 @@ mod testing { .join("") } + fn build_expected_str_bed12( + expected: &Vec<(S, T, T, S, F, C, T, T, T, T, T, T)>, + ) -> String { + expected + .iter() + .map( + |( + chr, + start, + end, + name, + score, + strand, + thick_start, + thick_end, + rgb, + block_count, + block_sizes, + block_starts, + )| { + format!( + "{}\t{}\t{}\t{}\t{:.1}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n", + chr, + start, + end, + name, + score, + strand, + thick_start, + thick_end, + rgb, + block_count, + block_sizes, + block_starts + ) + }, + ) + .collect::>() + .join("") + } + #[test] fn test_subtract_merged_bed3() -> Result<()> { let a = "tests/datasets/subtract/subtract_a.bed"; @@ -83,6 +124,35 @@ mod testing { Ok(()) } + #[test] + fn test_subtract_merged_bed12() -> Result<()> { + let a = "tests/datasets/subtract/subtract_a.bed12"; + let b = "tests/datasets/subtract/subtract_b.bed12"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("subtract") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("--format") + .arg("bed12") + .output()?; + + let expected = vec![ + (1, 100, 120, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 125, 150, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 160, 300, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 400, 460, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 470, 475, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 500, 550, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + ]; + let expected_str = build_expected_str_bed12(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + #[test] fn test_subtract_unmerged_bed3() -> Result<()> { let a = "tests/datasets/subtract/subtract_a.bed"; @@ -143,6 +213,37 @@ mod testing { Ok(()) } + #[test] + fn test_subtract_unmerged_bed12() -> Result<()> { + let a = "tests/datasets/subtract/subtract_a.bed12"; + let b = "tests/datasets/subtract/subtract_b.bed12"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("subtract") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .arg("--format") + .arg("bed12") + .arg("-u") + .output()?; + + let expected = vec![ + (1, 100, 120, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 125, 150, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 160, 200, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 200, 300, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 400, 460, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 470, 475, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + (1, 500, 550, 0, 0.0, '+', 0, 0, 0, 0, 0, 0), + ]; + let expected_str = build_expected_str_bed12(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + #[test] fn test_subtract_fractional_query() -> Result<()> { let a = "tests/datasets/subtract/subtract_a.bed"; From a7a8a150d90982c9e2eeed1e88ea829ad7e7b2e8 Mon Sep 17 00:00:00 2001 From: noam teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:20:51 -0800 Subject: [PATCH 3/5] feat: added 3 column output as a function within writenamediter for merging --- src/commands/merge.rs | 8 ++++---- src/io/mod.rs | 6 +++--- src/io/write/iter.rs | 47 +++++++++++++++++++++++++++++++++++++++++++ src/io/write/mod.rs | 3 ++- 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/src/commands/merge.rs b/src/commands/merge.rs index a96eba9..8b58cae 100644 --- a/src/commands/merge.rs +++ b/src/commands/merge.rs @@ -3,7 +3,7 @@ use std::io::{Read, Write}; use crate::{ io::{ build_reader, iter_unnamed, match_input, match_output, read_bed12_set, read_bed3_set, - read_bed6_set, write_records_iter, write_records_iter_with, + read_bed6_set, write_3col_iter_with, write_records_iter, }, types::InputFormat, }; @@ -27,7 +27,7 @@ where set.set_sorted(); } let merged = set.merge()?; - write_records_iter_with( + write_3col_iter_with( merged.records().into_iter(), output_handle, translater.as_ref(), @@ -52,7 +52,7 @@ where set.set_sorted(); } let merged = set.merge()?; - write_records_iter_with( + write_3col_iter_with( merged.records().into_iter(), output_handle, translater.as_ref(), @@ -77,7 +77,7 @@ where set.set_sorted(); } let merged = set.merge()?; - write_records_iter_with( + write_3col_iter_with( merged.records().into_iter(), output_handle, translater.as_ref(), diff --git a/src/io/mod.rs b/src/io/mod.rs index 9e1fb18..fc830d5 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -10,9 +10,9 @@ pub use read::{ }; use serde::{Deserialize, Serialize}; pub use write::{ - build_writer, write_named_pairs_iter, write_named_records_iter_dashmap, write_pairs_iter, - write_pairs_iter_with, write_records_iter, write_records_iter_with, WriteIter, WriteIterImpl, - WriteNamedIter, WriteNamedIterImpl, + build_writer, write_3col_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap, + write_pairs_iter, write_pairs_iter_with, write_records_iter, write_records_iter_with, + WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl, }; #[derive(Deserialize, Serialize)] diff --git a/src/io/write/iter.rs b/src/io/write/iter.rs index 635a7cc..aec60cd 100644 --- a/src/io/write/iter.rs +++ b/src/io/write/iter.rs @@ -25,11 +25,33 @@ where Ok(()) } +pub fn write_3col_iter_with( + records: I, + writer: W, + translater: Option<&Tr>, +) -> Result<()> +where + W: Write, + I: Iterator, + Co: Coordinates + Serialize, + Tr: Translate, + WriteNamedIterImpl: WriteNamedIter, +{ + if let Some(translater) = translater { + // WriteNamedIterImpl::write_named_iter(writer, records, translater)?; + WriteNamedIterImpl::write_named_3col_iter(writer, records, translater)?; + } else { + WriteIterImpl::::write_3col_iter(writer, records)?; + } + Ok(()) +} + pub trait WriteIter where C: Coordinates, { fn write_iter>(writer: W, iterator: It) -> Result<()>; + fn write_3col_iter>(writer: W, iterator: It) -> Result<()>; } pub struct WriteIterImpl @@ -50,6 +72,16 @@ where wtr.flush()?; Ok(()) } + + fn write_3col_iter>(writer: W, iterator: It) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let named_interval = (interval.chr(), interval.start(), interval.end()); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } } pub trait WriteNamedIter @@ -64,6 +96,21 @@ where ) -> Result<()> { unimplemented!() } + #[allow(unused_variables)] + fn write_named_3col_iter, Tr: Translate>( + writer: W, + iterator: It, + translater: &Tr, + ) -> Result<()> { + let mut wtr = build_writer(writer); + for interval in iterator { + let chr = translater.get_name(*interval.chr()).unwrap(); + let named_interval = (chr, interval.start(), interval.end()); + wtr.serialize(named_interval)?; + } + wtr.flush()?; + Ok(()) + } } pub struct WriteNamedIterImpl; impl WriteNamedIter> for WriteNamedIterImpl { diff --git a/src/io/write/mod.rs b/src/io/write/mod.rs index 8c84064..42a8f07 100644 --- a/src/io/write/mod.rs +++ b/src/io/write/mod.rs @@ -1,7 +1,8 @@ mod iter; mod utils; pub use iter::{ - write_records_iter_with, WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl, + write_3col_iter_with, write_records_iter_with, WriteIter, WriteIterImpl, WriteNamedIter, + WriteNamedIterImpl, }; pub use utils::{ build_writer, write_named_pairs_iter, write_named_records_iter_dashmap, write_pairs_iter, From a297c962c7492580af7adf54993415e29ec2c1ed Mon Sep 17 00:00:00 2001 From: noam teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:21:17 -0800 Subject: [PATCH 4/5] chore: remove old commented code --- src/io/write/iter.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/io/write/iter.rs b/src/io/write/iter.rs index aec60cd..695a66f 100644 --- a/src/io/write/iter.rs +++ b/src/io/write/iter.rs @@ -38,7 +38,6 @@ where WriteNamedIterImpl: WriteNamedIter, { if let Some(translater) = translater { - // WriteNamedIterImpl::write_named_iter(writer, records, translater)?; WriteNamedIterImpl::write_named_3col_iter(writer, records, translater)?; } else { WriteIterImpl::::write_3col_iter(writer, records)?; From 828c6f4c99793ba1c52f84a01b187efa1a0a4da6 Mon Sep 17 00:00:00 2001 From: noam teyssier <22600644+noamteyssier@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:22:06 -0800 Subject: [PATCH 5/5] chore: bump patch version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4d65047..800c313 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gia" -version = "0.1.21" +version = "0.1.22" edition = "2021" description = "A tool for set theoretic operations of genomic intervals" license = "MIT"