diff --git a/Cargo.toml b/Cargo.toml index 8e28b0c..1db5367 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "gia" -version = "0.2.5" +version = "0.2.6" edition = "2021" description = "A tool for set theoretic operations of genomic intervals" license = "MIT" @@ -12,33 +12,33 @@ keywords = ["genomics", "bioinformatics", "bed", "set", "interval"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -anyhow = "1.0.72" +anyhow = "1.0.79" # bedrs = { version = "0.1.11", features = ["serde", "rayon"] } bedrs = { git = "https://github.com/noamteyssier/bedrs", branch = "bedrs-2.0", features = [ "serde", "rayon", ] } -bstr = "1.6.0" -clap = { version = "4.3.19", features = ["derive"] } -csv = "1.2.2" -dashmap = { version = "5.5.0", features = ["serde"] } +bstr = "1.9.0" +clap = { version = "4.4.18", features = ["derive"] } +csv = "1.3.0" +dashmap = { version = "5.5.3", features = ["serde"] } faiquery = "0.1.3" -hashbrown = "0.14.0" +hashbrown = "0.14.3" human-sort = "0.2.2" -memmap2 = "0.7.1" +memmap2 = "0.9.4" rand = "0.8.5" rand_chacha = "0.3.1" -serde = { version = "1.0.183", features = ["derive"] } +serde = { version = "1.0.196", features = ["derive"] } niffler = { version = "2.5.0", default-features = false, features = ["gz"] } gzp = { version = "0.11.3", features = [ "deflate_rust", ], default-features = false } -rayon = "1.8.0" +rayon = "1.8.1" flate2 = "1.0.28" [dev-dependencies] -assert_cmd = "2.0.11" -predicates = "3.0.3" +assert_cmd = "2.0.13" +predicates = "3.1.0" [profile.release] lto = true diff --git a/src/cli.rs b/src/cli.rs index a18a407..864ab15 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -72,6 +72,48 @@ pub enum Command { stream: bool, }, + /// Calculates the coverage of intervals in Set A by intervals in Set B + Coverage { + /// Input BED file to intersect (default=stdin) + #[clap(short, long)] + a: Option, + + /// Secondary BED file to intersect + #[clap(short, long)] + b: String, + + /// Output BED file to write to (default=stdout) + #[clap(short, long)] + output: Option, + + /// Minimum fraction of a's interval that must be covered by b's interval + #[clap(short = 'f', long)] + fraction_query: Option, + + /// Minimum fraction of b's interval that must be covered by a's interval + #[clap(short = 'F', long)] + fraction_target: Option, + + /// Require that the fraction provided with `-f` is reciprocal to both + /// query and target + #[clap( + short, + long, + requires = "fraction_query", + conflicts_with = "fraction_target" + )] + reciprocal: bool, + + /// Requires that either fraction provided with `-f` or `-F` is met + #[clap(short, long, requires_all=&["fraction_query", "fraction_target"], conflicts_with = "reciprocal")] + either: bool, + + /// Assert that the intervals are presorted in BOTH files (unexpected behavior if they are + /// not) + #[clap(short, long)] + sorted: bool, + }, + /// Extends the intervals of a BED file /// /// The extension is either done on both sides at once diff --git a/src/commands/coverage.rs b/src/commands/coverage.rs new file mode 100644 index 0000000..aa6f167 --- /dev/null +++ b/src/commands/coverage.rs @@ -0,0 +1,189 @@ +use std::io::Write; + +use anyhow::{bail, Result}; +use bedrs::{traits::IntervalBounds, types::QueryMethod, IntervalContainer}; +use serde::Serialize; + +use crate::{ + io::{match_output, write_depth_iter_with, BedReader}, + types::{InputFormat, IntervalDepth, Rename, Renamer, Translater}, + utils::{assign_query_method, sort_pairs}, +}; + +fn run_coverage<'a, Ia, Ib, Na, W>( + set_a: &'a mut IntervalContainer, + set_b: &'a mut IntervalContainer, + translater: Option<&'a Translater>, + query_method: QueryMethod, + presorted: bool, + output_handle: W, +) -> Result<()> +where + Ia: IntervalBounds + Copy + Serialize, + Ib: IntervalBounds + Copy + Serialize, + Na: IntervalBounds<&'a str, usize> + Serialize, + W: Write, + Renamer: Rename<'a, Ia, Na>, +{ + sort_pairs(set_a, set_b, presorted); + let depth_iter = set_a.records().iter().map(|iv| { + let n_overlaps = set_b + .find_iter_sorted_method_unchecked(iv, query_method) + .expect("Error in finding overlaps") + .count(); + IntervalDepth::new(*iv, n_overlaps, translater) + }); + write_depth_iter_with(depth_iter, output_handle, translater) +} + +fn dispatch_coverage( + reader_a: BedReader, + reader_b: BedReader, + output: W, + query_method: QueryMethod, + presorted: bool, +) -> Result<()> { + if reader_a.is_named() != reader_b.is_named() { + bail!("Input files must both be named or both be unnamed"); + } + let mut translater = if reader_a.is_named() { + Some(Translater::new()) + } else { + None + }; + match reader_a.input_format() { + InputFormat::Bed3 => { + let mut set_a = reader_a.bed3_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + } + } + InputFormat::Bed6 => { + let mut set_a = reader_a.bed6_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + } + } + InputFormat::Bed12 => { + let mut set_a = reader_a.bed12_set_with(translater.as_mut())?; + match reader_b.input_format() { + InputFormat::Bed3 => { + let mut set_b = reader_b.bed3_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed6 => { + let mut set_b = reader_b.bed6_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + InputFormat::Bed12 => { + let mut set_b = reader_b.bed12_set_with(translater.as_mut())?; + run_coverage( + &mut set_a, + &mut set_b, + translater.as_ref(), + query_method, + presorted, + output, + ) + } + } + } + } +} + +pub fn coverage( + a: Option, + b: String, + output: Option, + fraction_query: Option, + fraction_target: Option, + reciprocal: bool, + either: bool, + presorted: bool, + compression_threads: usize, + compression_level: u32, +) -> Result<()> { + let bed_a = BedReader::from_path(a, None, None)?; + let bed_b = BedReader::from_path(Some(b), None, None)?; + let query_method = assign_query_method(fraction_query, fraction_target, reciprocal, either); + let output_handle = match_output(output, compression_threads, compression_level)?; + dispatch_coverage(bed_a, bed_b, output_handle, query_method, presorted) +} diff --git a/src/commands/mod.rs b/src/commands/mod.rs index a4e49a4..c75e31b 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -1,5 +1,6 @@ mod closest; mod complement; +mod coverage; mod extend; mod flank; mod get_fasta; @@ -14,6 +15,7 @@ mod subtract; mod window; pub use closest::closest; pub use complement::complement; +pub use coverage::coverage; pub use extend::extend; pub use flank::flank; pub use get_fasta::get_fasta; diff --git a/src/io/mod.rs b/src/io/mod.rs index 5fee305..bf22871 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -9,9 +9,9 @@ pub use read::{ }; use serde::{Deserialize, Serialize}; pub use write::{ - build_writer, write_3col_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap, - write_pairs_iter, write_pairs_iter_with, write_records_iter, write_records_iter_with, - WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl, + build_writer, write_3col_iter_with, write_depth_iter_with, write_named_pairs_iter, + write_named_records_iter_dashmap, write_pairs_iter, write_pairs_iter_with, write_records_iter, + write_records_iter_with, WriteIter, WriteIterImpl, WriteNamedIter, WriteNamedIterImpl, }; #[derive(Deserialize, Serialize)] diff --git a/src/io/write/mod.rs b/src/io/write/mod.rs index 42a8f07..c40f523 100644 --- a/src/io/write/mod.rs +++ b/src/io/write/mod.rs @@ -5,6 +5,6 @@ pub use iter::{ WriteNamedIterImpl, }; pub use utils::{ - build_writer, write_named_pairs_iter, write_named_records_iter_dashmap, write_pairs_iter, - write_pairs_iter_with, write_records_iter, + build_writer, write_depth_iter_with, write_named_pairs_iter, write_named_records_iter_dashmap, + write_pairs_iter, write_pairs_iter_with, write_records_iter, }; diff --git a/src/io/write/utils.rs b/src/io/write/utils.rs index 3e1dba8..ad876fd 100644 --- a/src/io/write/utils.rs +++ b/src/io/write/utils.rs @@ -1,4 +1,6 @@ -use crate::types::{IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater}; +use crate::types::{ + IntervalDepth, IntervalPair, NumericBed3, Rename, Renamer, StreamTranslater, Translater, +}; use anyhow::Result; use bedrs::{traits::IntervalBounds, Coordinates}; use serde::Serialize; @@ -25,6 +27,57 @@ where Ok(()) } +pub fn write_depth_iter_with<'a, W, I, N, It>( + records: It, + writer: W, + translater: Option<&Translater>, +) -> Result<()> +where + I: IntervalBounds + Serialize, + N: IntervalBounds<&'a str, usize> + Serialize, + W: Write, + It: Iterator>, + Renamer: Rename<'a, I, N>, +{ + if translater.is_some() { + write_named_depth_iter(records, writer) + } else { + write_depth_iter(records, writer) + } +} + +fn write_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()> +where + I: IntervalBounds + Serialize, + N: IntervalBounds<&'a str, usize> + Serialize, + W: Write, + It: Iterator>, + Renamer: Rename<'a, I, N>, +{ + let mut wtr = build_writer(writer); + for record in records { + wtr.serialize(record.get_tuple())?; + } + wtr.flush()?; + Ok(()) +} + +pub fn write_named_depth_iter<'a, W, I, N, It>(records: It, writer: W) -> Result<()> +where + I: IntervalBounds + Serialize, + N: IntervalBounds<&'a str, usize> + Serialize, + W: Write, + It: Iterator>, + Renamer: Rename<'a, I, N>, +{ + let mut wtr = build_writer(writer); + for record in records { + wtr.serialize(record.get_named_tuple())?; + } + wtr.flush()?; + Ok(()) +} + pub fn write_pairs_iter_with<'a, W, Ia, Ib, Na, Nb, It>( records: It, writer: W, diff --git a/src/main.rs b/src/main.rs index c5c4e88..7b30bfe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,8 +8,8 @@ use anyhow::Result; use clap::Parser; use cli::{Cli, Command}; use commands::{ - closest, complement, extend, flank, get_fasta, intersect, merge, name_map, random, sample, - shift, sort, subtract, window, + closest, complement, coverage, extend, flank, get_fasta, intersect, merge, name_map, random, + sample, shift, sort, subtract, window, }; fn main() -> Result<()> { @@ -45,6 +45,27 @@ fn main() -> Result<()> { cli.compression_threads, cli.compression_level, )?, + Command::Coverage { + a, + b, + output, + fraction_query, + fraction_target, + reciprocal, + either, + sorted, + } => coverage( + a, + b, + output, + fraction_query, + fraction_target, + reciprocal, + either, + sorted, + cli.compression_threads, + cli.compression_level, + )?, Command::Extend { input, output, diff --git a/src/types/depth.rs b/src/types/depth.rs new file mode 100644 index 0000000..e8b75d7 --- /dev/null +++ b/src/types/depth.rs @@ -0,0 +1,42 @@ +use super::{ + translater::{Rename, Renamer}, + Translater, +}; +use bedrs::traits::IntervalBounds; + +pub struct IntervalDepth<'a, I, N> +where + I: IntervalBounds, + N: IntervalBounds<&'a str, usize>, +{ + pub iv: I, + pub n_overlaps: usize, + pub translater: Option<&'a Translater>, + phantom: std::marker::PhantomData, +} +impl<'a, I, N> IntervalDepth<'a, I, N> +where + I: IntervalBounds, + N: IntervalBounds<&'a str, usize>, + Renamer: Rename<'a, I, N>, +{ + pub fn new(iv: I, n_overlaps: usize, translater: Option<&'a Translater>) -> Self { + Self { + iv, + n_overlaps, + translater, + phantom: std::marker::PhantomData, + } + } + pub fn get_tuple(&self) -> (&I, usize) { + (&self.iv, self.n_overlaps) + } + pub fn get_named_tuple(&self) -> (N, usize) { + if let Some(translater) = self.translater { + let n = Renamer::rename_with(&self.iv, translater); + (n, self.n_overlaps) + } else { + panic!("Translater was not provided but get_named_tuple was called - there is a bug somewhere!") + } + } +} diff --git a/src/types/mod.rs b/src/types/mod.rs index 93b7085..ca8676c 100644 --- a/src/types/mod.rs +++ b/src/types/mod.rs @@ -1,6 +1,8 @@ +mod depth; mod formats; mod pairs; mod translater; +pub use depth::IntervalDepth; pub use formats::{ Bed12Set, Bed3Set, Bed6Set, FieldFormat, Genome, InputFormat, NamedBed12, NamedBed3, NamedBed6, NumericBed12, NumericBed3, NumericBed6, diff --git a/tests/coverage.rs b/tests/coverage.rs new file mode 100644 index 0000000..fdbc3ee --- /dev/null +++ b/tests/coverage.rs @@ -0,0 +1,141 @@ +#[cfg(test)] +mod testing { + use anyhow::Result; + use assert_cmd::prelude::*; + use std::{fmt::Display, process::Command}; + + fn build_expected_str(expected: &[(T, u32, u32, u32)]) -> String { + expected + .iter() + .map(|(chr, start, end, depth)| format!("{}\t{}\t{}\t{}\n", chr, start, end, depth)) + .collect::>() + .join("") + } + + #[test] + fn test_coverage() -> Result<()> { + let a = "tests/datasets/coverage/coverage_a.bed"; + let b = "tests/datasets/coverage/coverage_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("coverage") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .output()?; + println!("{:?}", output); + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 10); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 4); + + let expected = vec![ + (1, 72, 222, 4), + (1, 257, 407, 1), + (1, 268, 418, 1), + (1, 467, 617, 1), + (1, 819, 969, 1), + (2, 174, 324, 2), + (2, 587, 737, 1), + (3, 395, 545, 0), + (3, 554, 704, 0), + (3, 653, 803, 0), + ]; + let expected_str = build_expected_str(&expected); + assert_eq!(output.stdout, expected_str.as_bytes()); + Ok(()) + } + + #[test] + fn test_coverage_bed3_bed3() -> Result<()> { + let a = "tests/datasets/coverage/coverage_a.bed"; + let b = "tests/datasets/coverage/coverage_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("coverage") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .output()?; + println!("{:?}", output); + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 10); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 4); + Ok(()) + } + + #[test] + fn test_coverage_bed6_bed3() -> Result<()> { + let a = "tests/datasets/coverage/coverage_a.bed6"; + let b = "tests/datasets/coverage/coverage_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("coverage") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .output()?; + println!("{:?}", output); + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 10); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 7); + Ok(()) + } + + #[test] + fn test_coverage_bed12_bed3() -> Result<()> { + let a = "tests/datasets/coverage/coverage_a.bed12"; + let b = "tests/datasets/coverage/coverage_b.bed"; + + let mut cmd = Command::cargo_bin("gia")?; + let output = cmd + .arg("coverage") + .arg("-a") + .arg(a) + .arg("-b") + .arg(b) + .output()?; + println!("{:?}", output); + let num_intervals = output.stdout.split(|&c| c == b'\n').count() - 1; + assert_eq!(num_intervals, 10); + + let num_fields = output + .stdout + .split(|&c| c == b'\n') + .next() + .unwrap() + .split(|&c| c == b'\t') + .count(); + assert_eq!(num_fields, 13); + Ok(()) + } +} diff --git a/tests/datasets/coverage/coverage_a.bed b/tests/datasets/coverage/coverage_a.bed new file mode 100644 index 0000000..abc19c9 --- /dev/null +++ b/tests/datasets/coverage/coverage_a.bed @@ -0,0 +1,10 @@ +1 72 222 +1 257 407 +1 268 418 +1 467 617 +1 819 969 +2 174 324 +2 587 737 +3 395 545 +3 554 704 +3 653 803 diff --git a/tests/datasets/coverage/coverage_a.bed12 b/tests/datasets/coverage/coverage_a.bed12 new file mode 100644 index 0000000..97b9f53 --- /dev/null +++ b/tests/datasets/coverage/coverage_a.bed12 @@ -0,0 +1,10 @@ +1 72 222 0 0.0 + 0 0 0 0 0 0 +1 257 407 0 0.0 + 0 0 0 0 0 0 +1 268 418 0 0.0 + 0 0 0 0 0 0 +1 467 617 0 0.0 + 0 0 0 0 0 0 +1 819 969 0 0.0 + 0 0 0 0 0 0 +2 174 324 0 0.0 + 0 0 0 0 0 0 +2 587 737 0 0.0 + 0 0 0 0 0 0 +3 395 545 0 0.0 + 0 0 0 0 0 0 +3 554 704 0 0.0 + 0 0 0 0 0 0 +3 653 803 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/coverage/coverage_a.bed6 b/tests/datasets/coverage/coverage_a.bed6 new file mode 100644 index 0000000..802646e --- /dev/null +++ b/tests/datasets/coverage/coverage_a.bed6 @@ -0,0 +1,10 @@ +1 72 222 0 0.0 + +1 257 407 0 0.0 + +1 268 418 0 0.0 + +1 467 617 0 0.0 + +1 819 969 0 0.0 + +2 174 324 0 0.0 + +2 587 737 0 0.0 + +3 395 545 0 0.0 + +3 554 704 0 0.0 + +3 653 803 0 0.0 + diff --git a/tests/datasets/coverage/coverage_b.bed b/tests/datasets/coverage/coverage_b.bed new file mode 100644 index 0000000..7c20624 --- /dev/null +++ b/tests/datasets/coverage/coverage_b.bed @@ -0,0 +1,10 @@ +1 55 205 +1 69 219 +1 93 243 +1 156 306 +1 603 753 +1 837 987 +2 39 189 +2 71 221 +2 672 822 +3 138 288 diff --git a/tests/datasets/coverage/coverage_b.bed12 b/tests/datasets/coverage/coverage_b.bed12 new file mode 100644 index 0000000..515e704 --- /dev/null +++ b/tests/datasets/coverage/coverage_b.bed12 @@ -0,0 +1,10 @@ +1 55 205 0 0.0 + 0 0 0 0 0 0 +1 69 219 0 0.0 + 0 0 0 0 0 0 +1 93 243 0 0.0 + 0 0 0 0 0 0 +1 156 306 0 0.0 + 0 0 0 0 0 0 +1 603 753 0 0.0 + 0 0 0 0 0 0 +1 837 987 0 0.0 + 0 0 0 0 0 0 +2 39 189 0 0.0 + 0 0 0 0 0 0 +2 71 221 0 0.0 + 0 0 0 0 0 0 +2 672 822 0 0.0 + 0 0 0 0 0 0 +3 138 288 0 0.0 + 0 0 0 0 0 0 diff --git a/tests/datasets/coverage/coverage_b.bed6 b/tests/datasets/coverage/coverage_b.bed6 new file mode 100644 index 0000000..c8611d5 --- /dev/null +++ b/tests/datasets/coverage/coverage_b.bed6 @@ -0,0 +1,10 @@ +1 55 205 0 0.0 + +1 69 219 0 0.0 + +1 93 243 0 0.0 + +1 156 306 0 0.0 + +1 603 753 0 0.0 + +1 837 987 0 0.0 + +2 39 189 0 0.0 + +2 71 221 0 0.0 + +2 672 822 0 0.0 + +3 138 288 0 0.0 +